{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 3.9445953369140625, "learning_rate": 2.97e-05, "loss": 6.7008, "num_input_tokens_seen": 6553600, "step": 100, "train_runtime": 61.1942, "train_tokens_per_second": 107095.166 }, { "epoch": 0.002, "grad_norm": 0.6828203797340393, "learning_rate": 5.97e-05, "loss": 3.3177, "num_input_tokens_seen": 13107200, "step": 200, "train_runtime": 107.6856, "train_tokens_per_second": 121717.274 }, { "epoch": 0.003, "grad_norm": 16.05720329284668, "learning_rate": 8.969999999999998e-05, "loss": 3.0024, "num_input_tokens_seen": 19660800, "step": 300, "train_runtime": 154.3564, "train_tokens_per_second": 127372.748 }, { "epoch": 0.004, "grad_norm": 13.74783706665039, "learning_rate": 0.0001197, "loss": 2.6797, "num_input_tokens_seen": 26214400, "step": 400, "train_runtime": 200.698, "train_tokens_per_second": 130616.167 }, { "epoch": 0.005, "grad_norm": 12.893468856811523, "learning_rate": 0.00014969999999999998, "loss": 2.4588, "num_input_tokens_seen": 32768000, "step": 500, "train_runtime": 252.1632, "train_tokens_per_second": 129947.566 }, { "epoch": 0.006, "grad_norm": 9.021939277648926, "learning_rate": 0.00017969999999999998, "loss": 2.276, "num_input_tokens_seen": 39321600, "step": 600, "train_runtime": 299.2712, "train_tokens_per_second": 131391.184 }, { "epoch": 0.007, "grad_norm": 8.669090270996094, "learning_rate": 0.00020969999999999997, "loss": 2.1203, "num_input_tokens_seen": 45875200, "step": 700, "train_runtime": 346.3366, "train_tokens_per_second": 132458.429 }, { "epoch": 0.008, "grad_norm": 7.335177898406982, "learning_rate": 0.0002397, "loss": 1.9886, "num_input_tokens_seen": 52428800, "step": 800, "train_runtime": 393.5299, "train_tokens_per_second": 133226.965 }, { "epoch": 0.009, "grad_norm": 6.051175117492676, "learning_rate": 0.0002697, "loss": 1.9128, "num_input_tokens_seen": 58982400, "step": 900, "train_runtime": 440.0136, "train_tokens_per_second": 134046.765 }, { "epoch": 0.01, "grad_norm": 5.503482818603516, "learning_rate": 0.00029969999999999997, "loss": 1.8296, "num_input_tokens_seen": 65536000, "step": 1000, "train_runtime": 492.2662, "train_tokens_per_second": 133131.222 }, { "epoch": 0.011, "grad_norm": 2.8459227085113525, "learning_rate": 0.00029999925978027874, "loss": 1.779, "num_input_tokens_seen": 72089600, "step": 1100, "train_runtime": 538.0301, "train_tokens_per_second": 133988.032 }, { "epoch": 0.012, "grad_norm": 2.292707920074463, "learning_rate": 0.0002999970091452017, "loss": 1.7037, "num_input_tokens_seen": 78643200, "step": 1200, "train_runtime": 585.618, "train_tokens_per_second": 134290.951 }, { "epoch": 0.013, "grad_norm": 3.362025737762451, "learning_rate": 0.00029999324804190795, "loss": 1.6688, "num_input_tokens_seen": 85196800, "step": 1300, "train_runtime": 632.1008, "train_tokens_per_second": 134783.565 }, { "epoch": 0.014, "grad_norm": 2.2756998538970947, "learning_rate": 0.0002999879765082716, "loss": 1.6397, "num_input_tokens_seen": 91750400, "step": 1400, "train_runtime": 684.3545, "train_tokens_per_second": 134068.525 }, { "epoch": 0.015, "grad_norm": 2.5730831623077393, "learning_rate": 0.000299981194597377, "loss": 1.605, "num_input_tokens_seen": 98304000, "step": 1500, "train_runtime": 730.5087, "train_tokens_per_second": 134569.247 }, { "epoch": 0.016, "grad_norm": 1.7514433860778809, "learning_rate": 0.0002999729023775179, "loss": 1.5838, "num_input_tokens_seen": 104857600, "step": 1600, "train_runtime": 781.9407, "train_tokens_per_second": 134099.179 }, { "epoch": 0.017, "grad_norm": 1.8343929052352905, "learning_rate": 0.0002999630999321969, "loss": 1.6037, "num_input_tokens_seen": 111411200, "step": 1700, "train_runtime": 824.7241, "train_tokens_per_second": 135089.057 }, { "epoch": 0.018, "grad_norm": 1.5672227144241333, "learning_rate": 0.00029995178736012443, "loss": 1.5627, "num_input_tokens_seen": 117964800, "step": 1800, "train_runtime": 871.9564, "train_tokens_per_second": 135287.497 }, { "epoch": 0.019, "grad_norm": 1.6202061176300049, "learning_rate": 0.0002999389647752181, "loss": 1.5398, "num_input_tokens_seen": 124518400, "step": 1900, "train_runtime": 923.402, "train_tokens_per_second": 134847.439 }, { "epoch": 0.02, "grad_norm": 1.5145666599273682, "learning_rate": 0.00029992463230660104, "loss": 1.5389, "num_input_tokens_seen": 131072000, "step": 2000, "train_runtime": 968.9283, "train_tokens_per_second": 135275.229 }, { "epoch": 0.021, "grad_norm": 1.0306257009506226, "learning_rate": 0.00029990879009860117, "loss": 1.5098, "num_input_tokens_seen": 137625600, "step": 2100, "train_runtime": 1020.8371, "train_tokens_per_second": 134816.412 }, { "epoch": 0.022, "grad_norm": 2.0710599422454834, "learning_rate": 0.0002998914383107493, "loss": 1.5081, "num_input_tokens_seen": 144179200, "step": 2200, "train_runtime": 1067.2796, "train_tokens_per_second": 135090.368 }, { "epoch": 0.023, "grad_norm": 1.4022581577301025, "learning_rate": 0.0002998725771177778, "loss": 1.521, "num_input_tokens_seen": 150732800, "step": 2300, "train_runtime": 1114.7094, "train_tokens_per_second": 135221.616 }, { "epoch": 0.024, "grad_norm": 1.4328904151916504, "learning_rate": 0.00029985220670961847, "loss": 1.4855, "num_input_tokens_seen": 157286400, "step": 2400, "train_runtime": 1160.6217, "train_tokens_per_second": 135519.092 }, { "epoch": 0.025, "grad_norm": 1.3760366439819336, "learning_rate": 0.0002998303272914014, "loss": 1.4966, "num_input_tokens_seen": 163840000, "step": 2500, "train_runtime": 1212.6489, "train_tokens_per_second": 135109.18 }, { "epoch": 0.026, "grad_norm": 0.9530190825462341, "learning_rate": 0.00029980693908345185, "loss": 1.4795, "num_input_tokens_seen": 170393600, "step": 2600, "train_runtime": 1258.3106, "train_tokens_per_second": 135414.576 }, { "epoch": 0.027, "grad_norm": 0.8715839385986328, "learning_rate": 0.00029978204232128895, "loss": 1.4601, "num_input_tokens_seen": 176947200, "step": 2700, "train_runtime": 1304.6837, "train_tokens_per_second": 135624.597 }, { "epoch": 0.028, "grad_norm": 1.1879854202270508, "learning_rate": 0.0002997556372556227, "loss": 1.487, "num_input_tokens_seen": 183500800, "step": 2800, "train_runtime": 1358.2195, "train_tokens_per_second": 135103.938 }, { "epoch": 0.029, "grad_norm": 1.0949848890304565, "learning_rate": 0.0002997277241523519, "loss": 1.4658, "num_input_tokens_seen": 190054400, "step": 2900, "train_runtime": 1404.4203, "train_tokens_per_second": 135325.869 }, { "epoch": 0.03, "grad_norm": 1.465809941291809, "learning_rate": 0.00029969830329256125, "loss": 1.4463, "num_input_tokens_seen": 196608000, "step": 3000, "train_runtime": 1451.3838, "train_tokens_per_second": 135462.45 }, { "epoch": 0.031, "grad_norm": 0.9500088095664978, "learning_rate": 0.00029966737497251836, "loss": 1.4533, "num_input_tokens_seen": 203161600, "step": 3100, "train_runtime": 1496.7114, "train_tokens_per_second": 135738.657 }, { "epoch": 0.032, "grad_norm": 1.3393683433532715, "learning_rate": 0.0002996349395036711, "loss": 1.4402, "num_input_tokens_seen": 209715200, "step": 3200, "train_runtime": 1549.2536, "train_tokens_per_second": 135365.316 }, { "epoch": 0.033, "grad_norm": 0.7998270988464355, "learning_rate": 0.00029960099721264435, "loss": 1.4467, "num_input_tokens_seen": 216268800, "step": 3300, "train_runtime": 1596.5035, "train_tokens_per_second": 135464.03 }, { "epoch": 0.034, "grad_norm": 0.8441318273544312, "learning_rate": 0.0002995655484412365, "loss": 1.4353, "num_input_tokens_seen": 222822400, "step": 3400, "train_runtime": 1642.6114, "train_tokens_per_second": 135651.317 }, { "epoch": 0.035, "grad_norm": 0.7577129006385803, "learning_rate": 0.00029952859354641636, "loss": 1.4253, "num_input_tokens_seen": 229376000, "step": 3500, "train_runtime": 1690.0779, "train_tokens_per_second": 135719.187 }, { "epoch": 0.036, "grad_norm": 0.8359817862510681, "learning_rate": 0.00029949013290031924, "loss": 1.4348, "num_input_tokens_seen": 235929600, "step": 3600, "train_runtime": 1736.0232, "train_tokens_per_second": 135902.33 }, { "epoch": 0.037, "grad_norm": 0.7565376162528992, "learning_rate": 0.00029945016689024353, "loss": 1.4114, "num_input_tokens_seen": 242483200, "step": 3700, "train_runtime": 1788.0113, "train_tokens_per_second": 135616.148 }, { "epoch": 0.038, "grad_norm": 0.9537010788917542, "learning_rate": 0.0002994086959186464, "loss": 1.4134, "num_input_tokens_seen": 249036800, "step": 3800, "train_runtime": 1835.9254, "train_tokens_per_second": 135646.47 }, { "epoch": 0.039, "grad_norm": 0.8911266922950745, "learning_rate": 0.00029936572040314014, "loss": 1.4224, "num_input_tokens_seen": 255590400, "step": 3900, "train_runtime": 1882.537, "train_tokens_per_second": 135769.123 }, { "epoch": 0.04, "grad_norm": 0.7832906246185303, "learning_rate": 0.0002993212407764877, "loss": 1.4177, "num_input_tokens_seen": 262144000, "step": 4000, "train_runtime": 1928.8118, "train_tokens_per_second": 135909.579 }, { "epoch": 0.041, "grad_norm": 0.8426671624183655, "learning_rate": 0.00029927525748659834, "loss": 1.4194, "num_input_tokens_seen": 268697600, "step": 4100, "train_runtime": 1981.7143, "train_tokens_per_second": 135588.467 }, { "epoch": 0.042, "grad_norm": 0.9675344824790955, "learning_rate": 0.0002992277709965234, "loss": 1.4059, "num_input_tokens_seen": 275251200, "step": 4200, "train_runtime": 2027.927, "train_tokens_per_second": 135730.33 }, { "epoch": 0.043, "grad_norm": 1.1866440773010254, "learning_rate": 0.0002991787817844513, "loss": 1.4065, "num_input_tokens_seen": 281804800, "step": 4300, "train_runtime": 2074.708, "train_tokens_per_second": 135828.659 }, { "epoch": 0.044, "grad_norm": 0.8417257070541382, "learning_rate": 0.0002991282903437028, "loss": 1.397, "num_input_tokens_seen": 288358400, "step": 4400, "train_runtime": 2126.0513, "train_tokens_per_second": 135630.972 }, { "epoch": 0.045, "grad_norm": 0.8226633071899414, "learning_rate": 0.0002990762971827262, "loss": 1.3996, "num_input_tokens_seen": 294912000, "step": 4500, "train_runtime": 2172.3837, "train_tokens_per_second": 135755.024 }, { "epoch": 0.046, "grad_norm": 0.8411224484443665, "learning_rate": 0.00029902280282509197, "loss": 1.4002, "num_input_tokens_seen": 301465600, "step": 4600, "train_runtime": 2220.1775, "train_tokens_per_second": 135784.456 }, { "epoch": 0.047, "grad_norm": 0.7082719802856445, "learning_rate": 0.0002989678078094878, "loss": 1.3804, "num_input_tokens_seen": 308019200, "step": 4700, "train_runtime": 2266.6848, "train_tokens_per_second": 135889.739 }, { "epoch": 0.048, "grad_norm": 0.7628137469291687, "learning_rate": 0.00029891131268971284, "loss": 1.3795, "num_input_tokens_seen": 314572800, "step": 4800, "train_runtime": 2318.5885, "train_tokens_per_second": 135674.269 }, { "epoch": 0.049, "grad_norm": 0.7231079936027527, "learning_rate": 0.0002988533180346723, "loss": 1.3789, "num_input_tokens_seen": 321126400, "step": 4900, "train_runtime": 2364.3453, "train_tokens_per_second": 135820.432 }, { "epoch": 0.05, "grad_norm": 0.7210503816604614, "learning_rate": 0.0002987938244283717, "loss": 1.3641, "num_input_tokens_seen": 327680000, "step": 5000, "train_runtime": 2410.3286, "train_tokens_per_second": 135948.267 }, { "epoch": 0.051, "grad_norm": 0.729364275932312, "learning_rate": 0.00029873283246991105, "loss": 1.3756, "num_input_tokens_seen": 334233600, "step": 5100, "train_runtime": 2458.4762, "train_tokens_per_second": 135951.532 }, { "epoch": 0.052, "grad_norm": 0.7513293027877808, "learning_rate": 0.0002986703427734787, "loss": 1.3778, "num_input_tokens_seen": 340787200, "step": 5200, "train_runtime": 2506.9032, "train_tokens_per_second": 135939.511 }, { "epoch": 0.053, "grad_norm": 0.7382386326789856, "learning_rate": 0.00029860635596834517, "loss": 1.3807, "num_input_tokens_seen": 347340800, "step": 5300, "train_runtime": 2559.5035, "train_tokens_per_second": 135706.321 }, { "epoch": 0.054, "grad_norm": 0.5869194269180298, "learning_rate": 0.0002985408726988569, "loss": 1.3695, "num_input_tokens_seen": 353894400, "step": 5400, "train_runtime": 2605.4484, "train_tokens_per_second": 135828.598 }, { "epoch": 0.055, "grad_norm": 0.7805973291397095, "learning_rate": 0.0002984738936244296, "loss": 1.3746, "num_input_tokens_seen": 360448000, "step": 5500, "train_runtime": 2655.8515, "train_tokens_per_second": 135718.431 }, { "epoch": 0.056, "grad_norm": 0.6918448209762573, "learning_rate": 0.0002984054194195419, "loss": 1.3855, "num_input_tokens_seen": 367001600, "step": 5600, "train_runtime": 2703.0299, "train_tokens_per_second": 135774.155 }, { "epoch": 0.057, "grad_norm": 0.6129201054573059, "learning_rate": 0.0002983354507737283, "loss": 1.3816, "num_input_tokens_seen": 373555200, "step": 5700, "train_runtime": 2750.071, "train_tokens_per_second": 135834.747 }, { "epoch": 0.058, "grad_norm": 0.7457948923110962, "learning_rate": 0.00029826398839157215, "loss": 1.3748, "num_input_tokens_seen": 380108800, "step": 5800, "train_runtime": 2795.4164, "train_tokens_per_second": 135975.735 }, { "epoch": 0.059, "grad_norm": 0.6171481013298035, "learning_rate": 0.000298191032992699, "loss": 1.3725, "num_input_tokens_seen": 386662400, "step": 5900, "train_runtime": 2842.5021, "train_tokens_per_second": 136028.889 }, { "epoch": 0.06, "grad_norm": 0.6233596205711365, "learning_rate": 0.0002981165853117688, "loss": 1.3624, "num_input_tokens_seen": 393216000, "step": 6000, "train_runtime": 2892.8273, "train_tokens_per_second": 135927.922 }, { "epoch": 0.061, "grad_norm": 0.5645745396614075, "learning_rate": 0.000298040646098469, "loss": 1.356, "num_input_tokens_seen": 399769600, "step": 6100, "train_runtime": 2940.1153, "train_tokens_per_second": 135970.721 }, { "epoch": 0.062, "grad_norm": 0.6580554246902466, "learning_rate": 0.0002979632161175064, "loss": 1.3627, "num_input_tokens_seen": 406323200, "step": 6200, "train_runtime": 2986.9073, "train_tokens_per_second": 136034.754 }, { "epoch": 0.063, "grad_norm": 0.6815545558929443, "learning_rate": 0.0002978842961486003, "loss": 1.3562, "num_input_tokens_seen": 412876800, "step": 6300, "train_runtime": 3038.4238, "train_tokens_per_second": 135885.191 }, { "epoch": 0.064, "grad_norm": 0.9602898955345154, "learning_rate": 0.0002978038869864738, "loss": 1.3562, "num_input_tokens_seen": 419430400, "step": 6400, "train_runtime": 3085.1228, "train_tokens_per_second": 135952.578 }, { "epoch": 0.065, "grad_norm": 0.7086384892463684, "learning_rate": 0.0002977219894408463, "loss": 1.3579, "num_input_tokens_seen": 425984000, "step": 6500, "train_runtime": 3130.8346, "train_tokens_per_second": 136060.844 }, { "epoch": 0.066, "grad_norm": 0.5864439010620117, "learning_rate": 0.0002976386043364251, "loss": 1.3563, "num_input_tokens_seen": 432537600, "step": 6600, "train_runtime": 3182.4893, "train_tokens_per_second": 135911.72 }, { "epoch": 0.067, "grad_norm": 0.6041991114616394, "learning_rate": 0.00029755373251289733, "loss": 1.3753, "num_input_tokens_seen": 439091200, "step": 6700, "train_runtime": 3229.4118, "train_tokens_per_second": 135966.308 }, { "epoch": 0.068, "grad_norm": 0.7153160572052002, "learning_rate": 0.0002974673748249213, "loss": 1.3475, "num_input_tokens_seen": 445644800, "step": 6800, "train_runtime": 3276.7034, "train_tokens_per_second": 136004.008 }, { "epoch": 0.069, "grad_norm": 0.5409119725227356, "learning_rate": 0.00029737953214211804, "loss": 1.3464, "num_input_tokens_seen": 452198400, "step": 6900, "train_runtime": 3324.3119, "train_tokens_per_second": 136027.67 }, { "epoch": 0.07, "grad_norm": 0.6369441151618958, "learning_rate": 0.0002972902053490623, "loss": 1.3546, "num_input_tokens_seen": 458752000, "step": 7000, "train_runtime": 3370.6322, "train_tokens_per_second": 136102.657 }, { "epoch": 0.071, "grad_norm": 0.8589248061180115, "learning_rate": 0.00029719939534527393, "loss": 1.3479, "num_input_tokens_seen": 465305600, "step": 7100, "train_runtime": 3424.7139, "train_tokens_per_second": 135867.0 }, { "epoch": 0.072, "grad_norm": 0.8014613389968872, "learning_rate": 0.00029710710304520866, "loss": 1.3667, "num_input_tokens_seen": 471859200, "step": 7200, "train_runtime": 3472.985, "train_tokens_per_second": 135865.601 }, { "epoch": 0.073, "grad_norm": 0.5970280766487122, "learning_rate": 0.00029701332937824885, "loss": 1.3423, "num_input_tokens_seen": 478412800, "step": 7300, "train_runtime": 3519.3052, "train_tokens_per_second": 135939.558 }, { "epoch": 0.074, "grad_norm": 0.6963617205619812, "learning_rate": 0.0002969180752886944, "loss": 1.3443, "num_input_tokens_seen": 484966400, "step": 7400, "train_runtime": 3565.8739, "train_tokens_per_second": 136002.118 }, { "epoch": 0.075, "grad_norm": 0.5769393444061279, "learning_rate": 0.0002968213417357529, "loss": 1.3576, "num_input_tokens_seen": 491520000, "step": 7500, "train_runtime": 3611.5043, "train_tokens_per_second": 136098.411 }, { "epoch": 0.076, "grad_norm": 0.5492929816246033, "learning_rate": 0.00029672312969353015, "loss": 1.3422, "num_input_tokens_seen": 498073600, "step": 7600, "train_runtime": 3664.3633, "train_tokens_per_second": 135923.642 }, { "epoch": 0.077, "grad_norm": 0.8065637946128845, "learning_rate": 0.00029662344015102027, "loss": 1.3395, "num_input_tokens_seen": 504627200, "step": 7700, "train_runtime": 3711.2689, "train_tokens_per_second": 135971.608 }, { "epoch": 0.078, "grad_norm": 0.552871584892273, "learning_rate": 0.00029652227411209594, "loss": 1.3427, "num_input_tokens_seen": 511180800, "step": 7800, "train_runtime": 3758.1209, "train_tokens_per_second": 136020.319 }, { "epoch": 0.079, "grad_norm": 0.6378001570701599, "learning_rate": 0.0002964196325954979, "loss": 1.3339, "num_input_tokens_seen": 517734400, "step": 7900, "train_runtime": 3804.2295, "train_tokens_per_second": 136094.417 }, { "epoch": 0.08, "grad_norm": 0.6196131706237793, "learning_rate": 0.0002963155166348253, "loss": 1.341, "num_input_tokens_seen": 524288000, "step": 8000, "train_runtime": 3855.6562, "train_tokens_per_second": 135978.93 }, { "epoch": 0.081, "grad_norm": 0.5841253399848938, "learning_rate": 0.0002962099272785246, "loss": 1.3366, "num_input_tokens_seen": 530841600, "step": 8100, "train_runtime": 3903.5348, "train_tokens_per_second": 135989.977 }, { "epoch": 0.082, "grad_norm": 0.5912770628929138, "learning_rate": 0.0002961028655898794, "loss": 1.3417, "num_input_tokens_seen": 537395200, "step": 8200, "train_runtime": 3951.3698, "train_tokens_per_second": 136002.255 }, { "epoch": 0.083, "grad_norm": 0.5480249524116516, "learning_rate": 0.0002959943326469998, "loss": 1.3419, "num_input_tokens_seen": 543948800, "step": 8300, "train_runtime": 3997.3554, "train_tokens_per_second": 136077.166 }, { "epoch": 0.084, "grad_norm": 0.49880343675613403, "learning_rate": 0.0002958843295428112, "loss": 1.3165, "num_input_tokens_seen": 550502400, "step": 8400, "train_runtime": 4044.3967, "train_tokens_per_second": 136114.838 }, { "epoch": 0.085, "grad_norm": 0.5670176148414612, "learning_rate": 0.0002957728573850438, "loss": 1.3314, "num_input_tokens_seen": 557056000, "step": 8500, "train_runtime": 4095.7201, "train_tokens_per_second": 136009.294 }, { "epoch": 0.086, "grad_norm": 2.3274426460266113, "learning_rate": 0.0002956599172962209, "loss": 1.3323, "num_input_tokens_seen": 563609600, "step": 8600, "train_runtime": 4143.1443, "train_tokens_per_second": 136034.268 }, { "epoch": 0.087, "grad_norm": 0.7660558819770813, "learning_rate": 0.0002955455104136479, "loss": 1.3382, "num_input_tokens_seen": 570163200, "step": 8700, "train_runtime": 4190.7065, "train_tokens_per_second": 136054.194 }, { "epoch": 0.088, "grad_norm": 0.5114762783050537, "learning_rate": 0.00029542963788940096, "loss": 1.3252, "num_input_tokens_seen": 576716800, "step": 8800, "train_runtime": 4237.8545, "train_tokens_per_second": 136086.974 }, { "epoch": 0.089, "grad_norm": 0.6698548197746277, "learning_rate": 0.00029531230089031505, "loss": 1.3449, "num_input_tokens_seen": 583270400, "step": 8900, "train_runtime": 4285.2299, "train_tokens_per_second": 136111.81 }, { "epoch": 0.09, "grad_norm": 0.5562598705291748, "learning_rate": 0.0002951935005979724, "loss": 1.3204, "num_input_tokens_seen": 589824000, "step": 9000, "train_runtime": 4336.4907, "train_tokens_per_second": 136014.126 }, { "epoch": 0.091, "grad_norm": 0.6327181458473206, "learning_rate": 0.0002950732382086907, "loss": 1.3178, "num_input_tokens_seen": 596377600, "step": 9100, "train_runtime": 4383.0811, "train_tokens_per_second": 136063.555 }, { "epoch": 0.092, "grad_norm": 0.6857426166534424, "learning_rate": 0.0002949515149335108, "loss": 1.3332, "num_input_tokens_seen": 602931200, "step": 9200, "train_runtime": 4431.4231, "train_tokens_per_second": 136058.142 }, { "epoch": 0.093, "grad_norm": 0.6040679812431335, "learning_rate": 0.0002948283319981848, "loss": 1.307, "num_input_tokens_seen": 609484800, "step": 9300, "train_runtime": 4478.1663, "train_tokens_per_second": 136101.423 }, { "epoch": 0.094, "grad_norm": 1.0060901641845703, "learning_rate": 0.00029470369064316354, "loss": 1.3108, "num_input_tokens_seen": 616038400, "step": 9400, "train_runtime": 4524.7167, "train_tokens_per_second": 136149.607 }, { "epoch": 0.095, "grad_norm": 0.504460871219635, "learning_rate": 0.00029457759212358397, "loss": 1.3169, "num_input_tokens_seen": 622592000, "step": 9500, "train_runtime": 4575.869, "train_tokens_per_second": 136059.84 }, { "epoch": 0.096, "grad_norm": 0.5062097907066345, "learning_rate": 0.00029445003770925686, "loss": 1.3137, "num_input_tokens_seen": 629145600, "step": 9600, "train_runtime": 4621.4422, "train_tokens_per_second": 136136.203 }, { "epoch": 0.097, "grad_norm": 0.5388786792755127, "learning_rate": 0.00029432102868465367, "loss": 1.3128, "num_input_tokens_seen": 635699200, "step": 9700, "train_runtime": 4668.6149, "train_tokens_per_second": 136164.411 }, { "epoch": 0.098, "grad_norm": 0.5705980062484741, "learning_rate": 0.0002941905663488939, "loss": 1.3065, "num_input_tokens_seen": 642252800, "step": 9800, "train_runtime": 4715.2389, "train_tokens_per_second": 136207.903 }, { "epoch": 0.099, "grad_norm": 0.5500839352607727, "learning_rate": 0.0002940586520157318, "loss": 1.3222, "num_input_tokens_seen": 648806400, "step": 9900, "train_runtime": 4767.1995, "train_tokens_per_second": 136098.019 }, { "epoch": 0.1, "grad_norm": 0.5740068554878235, "learning_rate": 0.00029392528701354325, "loss": 1.3173, "num_input_tokens_seen": 655360000, "step": 10000, "train_runtime": 4814.2762, "train_tokens_per_second": 136128.458 }, { "epoch": 0.101, "grad_norm": 0.47691279649734497, "learning_rate": 0.00029379047268531243, "loss": 1.3084, "num_input_tokens_seen": 661913600, "step": 10100, "train_runtime": 4861.0919, "train_tokens_per_second": 136165.622 }, { "epoch": 0.102, "grad_norm": 0.5993319153785706, "learning_rate": 0.00029365421038861795, "loss": 1.3299, "num_input_tokens_seen": 668467200, "step": 10200, "train_runtime": 4908.6949, "train_tokens_per_second": 136180.229 }, { "epoch": 0.103, "grad_norm": 0.556516170501709, "learning_rate": 0.0002935165014956198, "loss": 1.316, "num_input_tokens_seen": 675020800, "step": 10300, "train_runtime": 4956.5309, "train_tokens_per_second": 136188.156 }, { "epoch": 0.104, "grad_norm": 0.6757346391677856, "learning_rate": 0.0002933773473930448, "loss": 1.3048, "num_input_tokens_seen": 681574400, "step": 10400, "train_runtime": 5003.7965, "train_tokens_per_second": 136211.454 }, { "epoch": 0.105, "grad_norm": 0.9610360860824585, "learning_rate": 0.0002932367494821734, "loss": 1.3043, "num_input_tokens_seen": 688128000, "step": 10500, "train_runtime": 5050.8058, "train_tokens_per_second": 136241.232 }, { "epoch": 0.106, "grad_norm": 0.5780071020126343, "learning_rate": 0.00029309470917882497, "loss": 1.3015, "num_input_tokens_seen": 694681600, "step": 10600, "train_runtime": 5104.0171, "train_tokens_per_second": 136104.873 }, { "epoch": 0.107, "grad_norm": 0.6387894749641418, "learning_rate": 0.0002929512279133437, "loss": 1.3342, "num_input_tokens_seen": 701235200, "step": 10700, "train_runtime": 5151.2508, "train_tokens_per_second": 136129.112 }, { "epoch": 0.108, "grad_norm": 0.48744165897369385, "learning_rate": 0.0002928063071305844, "loss": 1.2999, "num_input_tokens_seen": 707788800, "step": 10800, "train_runtime": 5198.4813, "train_tokens_per_second": 136152.995 }, { "epoch": 0.109, "grad_norm": 0.5223510265350342, "learning_rate": 0.0002926599482898978, "loss": 1.2996, "num_input_tokens_seen": 714342400, "step": 10900, "train_runtime": 5244.0735, "train_tokens_per_second": 136218.99 }, { "epoch": 0.11, "grad_norm": 0.6020687222480774, "learning_rate": 0.00029251215286511573, "loss": 1.3029, "num_input_tokens_seen": 720896000, "step": 11000, "train_runtime": 5291.0983, "train_tokens_per_second": 136246.948 }, { "epoch": 0.111, "grad_norm": 0.5317751169204712, "learning_rate": 0.00029236292234453647, "loss": 1.316, "num_input_tokens_seen": 727449600, "step": 11100, "train_runtime": 5342.4851, "train_tokens_per_second": 136163.15 }, { "epoch": 0.112, "grad_norm": 1.2369730472564697, "learning_rate": 0.0002922122582309097, "loss": 1.298, "num_input_tokens_seen": 734003200, "step": 11200, "train_runtime": 5391.0041, "train_tokens_per_second": 136153.338 }, { "epoch": 0.113, "grad_norm": 0.5294257998466492, "learning_rate": 0.0002920601620414215, "loss": 1.316, "num_input_tokens_seen": 740556800, "step": 11300, "train_runtime": 5437.8422, "train_tokens_per_second": 136185.784 }, { "epoch": 0.114, "grad_norm": 0.5318885445594788, "learning_rate": 0.0002919066353076786, "loss": 1.2993, "num_input_tokens_seen": 747110400, "step": 11400, "train_runtime": 5484.1183, "train_tokens_per_second": 136231.635 }, { "epoch": 0.115, "grad_norm": 0.5208443403244019, "learning_rate": 0.00029175167957569366, "loss": 1.3066, "num_input_tokens_seen": 753664000, "step": 11500, "train_runtime": 5531.5155, "train_tokens_per_second": 136249.099 }, { "epoch": 0.116, "grad_norm": 0.5068408250808716, "learning_rate": 0.0002915952964058691, "loss": 1.3041, "num_input_tokens_seen": 760217600, "step": 11600, "train_runtime": 5578.6188, "train_tokens_per_second": 136273.445 }, { "epoch": 0.117, "grad_norm": 0.6206523776054382, "learning_rate": 0.00029143748737298173, "loss": 1.3061, "num_input_tokens_seen": 766771200, "step": 11700, "train_runtime": 5631.31, "train_tokens_per_second": 136162.136 }, { "epoch": 0.118, "grad_norm": 0.5741725564002991, "learning_rate": 0.00029127825406616677, "loss": 1.3097, "num_input_tokens_seen": 773324800, "step": 11800, "train_runtime": 5678.817, "train_tokens_per_second": 136177.096 }, { "epoch": 0.119, "grad_norm": 0.5251154899597168, "learning_rate": 0.0002911175980889019, "loss": 1.3054, "num_input_tokens_seen": 779878400, "step": 11900, "train_runtime": 5725.8659, "train_tokens_per_second": 136202.701 }, { "epoch": 0.12, "grad_norm": 0.4509083032608032, "learning_rate": 0.00029095552105899095, "loss": 1.301, "num_input_tokens_seen": 786432000, "step": 12000, "train_runtime": 5772.0962, "train_tokens_per_second": 136247.211 }, { "epoch": 0.121, "grad_norm": 0.4560108184814453, "learning_rate": 0.0002907920246085478, "loss": 1.2981, "num_input_tokens_seen": 792985600, "step": 12100, "train_runtime": 5817.8977, "train_tokens_per_second": 136301.056 }, { "epoch": 0.122, "grad_norm": 1.227121114730835, "learning_rate": 0.00029062711038397996, "loss": 1.302, "num_input_tokens_seen": 799539200, "step": 12200, "train_runtime": 5870.3451, "train_tokens_per_second": 136199.693 }, { "epoch": 0.123, "grad_norm": 0.4861258864402771, "learning_rate": 0.00029046078004597175, "loss": 1.318, "num_input_tokens_seen": 806092800, "step": 12300, "train_runtime": 5916.8489, "train_tokens_per_second": 136236.84 }, { "epoch": 0.124, "grad_norm": 0.9702387452125549, "learning_rate": 0.00029029303526946796, "loss": 1.2869, "num_input_tokens_seen": 812646400, "step": 12400, "train_runtime": 5964.0243, "train_tokens_per_second": 136258.063 }, { "epoch": 0.125, "grad_norm": 0.4712119400501251, "learning_rate": 0.0002901238777436565, "loss": 1.2924, "num_input_tokens_seen": 819200000, "step": 12500, "train_runtime": 6009.6089, "train_tokens_per_second": 136315.026 }, { "epoch": 0.126, "grad_norm": 0.4670332372188568, "learning_rate": 0.00028995330917195184, "loss": 1.2942, "num_input_tokens_seen": 825753600, "step": 12600, "train_runtime": 6061.3166, "train_tokens_per_second": 136233.371 }, { "epoch": 0.127, "grad_norm": 0.4821685552597046, "learning_rate": 0.00028978133127197765, "loss": 1.2856, "num_input_tokens_seen": 832307200, "step": 12700, "train_runtime": 6108.5206, "train_tokens_per_second": 136253.481 }, { "epoch": 0.128, "grad_norm": 0.5634518265724182, "learning_rate": 0.0002896079457755493, "loss": 1.2982, "num_input_tokens_seen": 838860800, "step": 12800, "train_runtime": 6155.2503, "train_tokens_per_second": 136283.785 }, { "epoch": 0.129, "grad_norm": 0.45673057436943054, "learning_rate": 0.000289433154428657, "loss": 1.2997, "num_input_tokens_seen": 845414400, "step": 12900, "train_runtime": 6202.1106, "train_tokens_per_second": 136310.758 }, { "epoch": 0.13, "grad_norm": 0.4386661648750305, "learning_rate": 0.0002892569589914476, "loss": 1.2985, "num_input_tokens_seen": 851968000, "step": 13000, "train_runtime": 6249.4681, "train_tokens_per_second": 136326.482 }, { "epoch": 0.131, "grad_norm": 0.4749270975589752, "learning_rate": 0.0002890793612382072, "loss": 1.2946, "num_input_tokens_seen": 858521600, "step": 13100, "train_runtime": 6301.6638, "train_tokens_per_second": 136237.291 }, { "epoch": 0.132, "grad_norm": 0.5405780673027039, "learning_rate": 0.0002889003629573432, "loss": 1.2857, "num_input_tokens_seen": 865075200, "step": 13200, "train_runtime": 6349.664, "train_tokens_per_second": 136239.523 }, { "epoch": 0.133, "grad_norm": 0.4045722782611847, "learning_rate": 0.00028871996595136626, "loss": 1.3009, "num_input_tokens_seen": 871628800, "step": 13300, "train_runtime": 6396.2349, "train_tokens_per_second": 136272.169 }, { "epoch": 0.134, "grad_norm": 0.5851114392280579, "learning_rate": 0.0002885381720368723, "loss": 1.3026, "num_input_tokens_seen": 878182400, "step": 13400, "train_runtime": 6442.8884, "train_tokens_per_second": 136302.594 }, { "epoch": 0.135, "grad_norm": 0.5135608315467834, "learning_rate": 0.000288354983044524, "loss": 1.2778, "num_input_tokens_seen": 884736000, "step": 13500, "train_runtime": 6489.2417, "train_tokens_per_second": 136338.889 }, { "epoch": 0.136, "grad_norm": 0.4828953742980957, "learning_rate": 0.00028817040081903245, "loss": 1.2864, "num_input_tokens_seen": 891289600, "step": 13600, "train_runtime": 6540.9813, "train_tokens_per_second": 136262.368 }, { "epoch": 0.137, "grad_norm": 0.5756350755691528, "learning_rate": 0.00028798442721913867, "loss": 1.2858, "num_input_tokens_seen": 897843200, "step": 13700, "train_runtime": 6588.3179, "train_tokens_per_second": 136278.063 }, { "epoch": 0.138, "grad_norm": 0.5231483578681946, "learning_rate": 0.00028779706411759465, "loss": 1.282, "num_input_tokens_seen": 904396800, "step": 13800, "train_runtime": 6635.0521, "train_tokens_per_second": 136305.909 }, { "epoch": 0.139, "grad_norm": 0.5475858449935913, "learning_rate": 0.00028760831340114484, "loss": 1.2797, "num_input_tokens_seen": 910950400, "step": 13900, "train_runtime": 6681.4731, "train_tokens_per_second": 136339.754 }, { "epoch": 0.14, "grad_norm": 0.7064163684844971, "learning_rate": 0.00028741817697050683, "loss": 1.2927, "num_input_tokens_seen": 917504000, "step": 14000, "train_runtime": 6730.4553, "train_tokens_per_second": 136321.238 }, { "epoch": 0.141, "grad_norm": 0.5267386436462402, "learning_rate": 0.00028722665674035233, "loss": 1.2815, "num_input_tokens_seen": 924057600, "step": 14100, "train_runtime": 6782.7717, "train_tokens_per_second": 136235.987 }, { "epoch": 0.142, "grad_norm": 0.5816136598587036, "learning_rate": 0.0002870337546392879, "loss": 1.2983, "num_input_tokens_seen": 930611200, "step": 14200, "train_runtime": 6829.7567, "train_tokens_per_second": 136258.323 }, { "epoch": 0.143, "grad_norm": 0.4982451796531677, "learning_rate": 0.00028683947260983576, "loss": 1.3026, "num_input_tokens_seen": 937164800, "step": 14300, "train_runtime": 6877.8163, "train_tokens_per_second": 136259.063 }, { "epoch": 0.144, "grad_norm": 0.49408379197120667, "learning_rate": 0.00028664381260841356, "loss": 1.2869, "num_input_tokens_seen": 943718400, "step": 14400, "train_runtime": 6923.5994, "train_tokens_per_second": 136304.593 }, { "epoch": 0.145, "grad_norm": 0.4885796904563904, "learning_rate": 0.0002864467766053154, "loss": 1.2768, "num_input_tokens_seen": 950272000, "step": 14500, "train_runtime": 6969.9199, "train_tokens_per_second": 136339.014 }, { "epoch": 0.146, "grad_norm": 0.5424348711967468, "learning_rate": 0.00028624836658469165, "loss": 1.2806, "num_input_tokens_seen": 956825600, "step": 14600, "train_runtime": 7020.7829, "train_tokens_per_second": 136284.743 }, { "epoch": 0.147, "grad_norm": 0.4333992898464203, "learning_rate": 0.00028604858454452906, "loss": 1.2776, "num_input_tokens_seen": 963379200, "step": 14700, "train_runtime": 7066.7012, "train_tokens_per_second": 136326.58 }, { "epoch": 0.148, "grad_norm": 1.3118066787719727, "learning_rate": 0.00028584743249663057, "loss": 1.3039, "num_input_tokens_seen": 969932800, "step": 14800, "train_runtime": 7115.8691, "train_tokens_per_second": 136305.6 }, { "epoch": 0.149, "grad_norm": 0.5320950150489807, "learning_rate": 0.000285644912466595, "loss": 1.2801, "num_input_tokens_seen": 976486400, "step": 14900, "train_runtime": 7162.6662, "train_tokens_per_second": 136330.016 }, { "epoch": 0.15, "grad_norm": 0.6902542114257812, "learning_rate": 0.00028544102649379684, "loss": 1.2832, "num_input_tokens_seen": 983040000, "step": 15000, "train_runtime": 7209.6657, "train_tokens_per_second": 136350.29 }, { "epoch": 0.151, "grad_norm": 0.544683039188385, "learning_rate": 0.00028523577663136556, "loss": 1.2948, "num_input_tokens_seen": 989593600, "step": 15100, "train_runtime": 7261.0326, "train_tokens_per_second": 136288.275 }, { "epoch": 0.152, "grad_norm": 0.500091552734375, "learning_rate": 0.000285029164946165, "loss": 1.2746, "num_input_tokens_seen": 996147200, "step": 15200, "train_runtime": 7306.6445, "train_tokens_per_second": 136334.427 }, { "epoch": 0.153, "grad_norm": 0.4995329678058624, "learning_rate": 0.0002848211935187725, "loss": 1.2893, "num_input_tokens_seen": 1002700800, "step": 15300, "train_runtime": 7353.2711, "train_tokens_per_second": 136361.19 }, { "epoch": 0.154, "grad_norm": 0.42985284328460693, "learning_rate": 0.0002846118644434581, "loss": 1.3077, "num_input_tokens_seen": 1009254400, "step": 15400, "train_runtime": 7400.7889, "train_tokens_per_second": 136371.192 }, { "epoch": 0.155, "grad_norm": 0.4847468137741089, "learning_rate": 0.00028440117982816326, "loss": 1.2723, "num_input_tokens_seen": 1015808000, "step": 15500, "train_runtime": 7452.7433, "train_tokens_per_second": 136299.877 }, { "epoch": 0.156, "grad_norm": 0.47867411375045776, "learning_rate": 0.0002841891417944796, "loss": 1.2754, "num_input_tokens_seen": 1022361600, "step": 15600, "train_runtime": 7498.8195, "train_tokens_per_second": 136336.339 }, { "epoch": 0.157, "grad_norm": 0.43365904688835144, "learning_rate": 0.0002839757524776279, "loss": 1.2737, "num_input_tokens_seen": 1028915200, "step": 15700, "train_runtime": 7545.0284, "train_tokens_per_second": 136369.957 }, { "epoch": 0.158, "grad_norm": 0.5739541053771973, "learning_rate": 0.0002837610140264361, "loss": 1.286, "num_input_tokens_seen": 1035468800, "step": 15800, "train_runtime": 7597.8039, "train_tokens_per_second": 136285.275 }, { "epoch": 0.159, "grad_norm": 0.4836307168006897, "learning_rate": 0.0002835449286033182, "loss": 1.2779, "num_input_tokens_seen": 1042022400, "step": 15900, "train_runtime": 7643.6023, "train_tokens_per_second": 136326.088 }, { "epoch": 0.16, "grad_norm": 0.5712729692459106, "learning_rate": 0.0002833274983842518, "loss": 1.2702, "num_input_tokens_seen": 1048576000, "step": 16000, "train_runtime": 7691.0096, "train_tokens_per_second": 136337.887 }, { "epoch": 0.161, "grad_norm": 0.48568034172058105, "learning_rate": 0.0002831087255587569, "loss": 1.2696, "num_input_tokens_seen": 1055129600, "step": 16100, "train_runtime": 7737.6132, "train_tokens_per_second": 136363.705 }, { "epoch": 0.162, "grad_norm": 0.5240116715431213, "learning_rate": 0.0002828886123298734, "loss": 1.2636, "num_input_tokens_seen": 1061683200, "step": 16200, "train_runtime": 7790.0975, "train_tokens_per_second": 136286.253 }, { "epoch": 0.163, "grad_norm": 0.4505080580711365, "learning_rate": 0.00028266716091413906, "loss": 1.2679, "num_input_tokens_seen": 1068236800, "step": 16300, "train_runtime": 7837.0156, "train_tokens_per_second": 136306.581 }, { "epoch": 0.164, "grad_norm": 0.38184958696365356, "learning_rate": 0.0002824443735415673, "loss": 1.2801, "num_input_tokens_seen": 1074790400, "step": 16400, "train_runtime": 7884.0198, "train_tokens_per_second": 136325.178 }, { "epoch": 0.165, "grad_norm": 0.860382616519928, "learning_rate": 0.0002822202524556243, "loss": 1.2737, "num_input_tokens_seen": 1081344000, "step": 16500, "train_runtime": 7930.486, "train_tokens_per_second": 136352.803 }, { "epoch": 0.166, "grad_norm": 0.771594226360321, "learning_rate": 0.00028199479991320695, "loss": 1.2876, "num_input_tokens_seen": 1087897600, "step": 16600, "train_runtime": 7977.0943, "train_tokens_per_second": 136377.678 }, { "epoch": 0.167, "grad_norm": 0.4533759653568268, "learning_rate": 0.00028176801818461994, "loss": 1.2769, "num_input_tokens_seen": 1094451200, "step": 16700, "train_runtime": 8024.6165, "train_tokens_per_second": 136386.73 }, { "epoch": 0.168, "grad_norm": 0.548772394657135, "learning_rate": 0.00028153990955355273, "loss": 1.2647, "num_input_tokens_seen": 1101004800, "step": 16800, "train_runtime": 8077.0632, "train_tokens_per_second": 136312.515 }, { "epoch": 0.169, "grad_norm": 0.5390068888664246, "learning_rate": 0.00028131047631705665, "loss": 1.2799, "num_input_tokens_seen": 1107558400, "step": 16900, "train_runtime": 8123.3347, "train_tokens_per_second": 136342.824 }, { "epoch": 0.17, "grad_norm": 0.4429817795753479, "learning_rate": 0.00028107972078552187, "loss": 1.2727, "num_input_tokens_seen": 1114112000, "step": 17000, "train_runtime": 8169.0719, "train_tokens_per_second": 136381.71 }, { "epoch": 0.171, "grad_norm": 0.6212127208709717, "learning_rate": 0.0002808476452826541, "loss": 1.2743, "num_input_tokens_seen": 1120665600, "step": 17100, "train_runtime": 8217.1136, "train_tokens_per_second": 136381.904 }, { "epoch": 0.172, "grad_norm": 0.44569867849349976, "learning_rate": 0.00028061425214545094, "loss": 1.2628, "num_input_tokens_seen": 1127219200, "step": 17200, "train_runtime": 8268.2495, "train_tokens_per_second": 136331.057 }, { "epoch": 0.173, "grad_norm": 0.5025371313095093, "learning_rate": 0.00028037954372417883, "loss": 1.2651, "num_input_tokens_seen": 1133772800, "step": 17300, "train_runtime": 8315.4333, "train_tokens_per_second": 136345.607 }, { "epoch": 0.174, "grad_norm": 0.5257975459098816, "learning_rate": 0.0002801435223823488, "loss": 1.2701, "num_input_tokens_seen": 1140326400, "step": 17400, "train_runtime": 8361.8666, "train_tokens_per_second": 136372.23 }, { "epoch": 0.175, "grad_norm": 0.6858969926834106, "learning_rate": 0.00027990619049669336, "loss": 1.2759, "num_input_tokens_seen": 1146880000, "step": 17500, "train_runtime": 8408.7431, "train_tokens_per_second": 136391.371 }, { "epoch": 0.176, "grad_norm": 0.5586578845977783, "learning_rate": 0.00027966755045714177, "loss": 1.2782, "num_input_tokens_seen": 1153433600, "step": 17600, "train_runtime": 8455.5155, "train_tokens_per_second": 136411.978 }, { "epoch": 0.177, "grad_norm": 0.583242654800415, "learning_rate": 0.00027942760466679673, "loss": 1.287, "num_input_tokens_seen": 1159987200, "step": 17700, "train_runtime": 8508.2754, "train_tokens_per_second": 136336.349 }, { "epoch": 0.178, "grad_norm": 0.5521747469902039, "learning_rate": 0.00027918635554190956, "loss": 1.2704, "num_input_tokens_seen": 1166540800, "step": 17800, "train_runtime": 8555.5497, "train_tokens_per_second": 136349.018 }, { "epoch": 0.179, "grad_norm": 0.6325215697288513, "learning_rate": 0.00027894380551185636, "loss": 1.2912, "num_input_tokens_seen": 1173094400, "step": 17900, "train_runtime": 8602.3857, "train_tokens_per_second": 136368.495 }, { "epoch": 0.18, "grad_norm": 0.44643789529800415, "learning_rate": 0.00027869995701911314, "loss": 1.2762, "num_input_tokens_seen": 1179648000, "step": 18000, "train_runtime": 8649.7648, "train_tokens_per_second": 136379.2 }, { "epoch": 0.181, "grad_norm": 0.49556615948677063, "learning_rate": 0.0002784548125192316, "loss": 1.2577, "num_input_tokens_seen": 1186201600, "step": 18100, "train_runtime": 8701.0558, "train_tokens_per_second": 136328.467 }, { "epoch": 0.182, "grad_norm": 0.5336231589317322, "learning_rate": 0.0002782083744808141, "loss": 1.2629, "num_input_tokens_seen": 1192755200, "step": 18200, "train_runtime": 8748.3794, "train_tokens_per_second": 136340.131 }, { "epoch": 0.183, "grad_norm": 0.3993295431137085, "learning_rate": 0.000277960645385489, "loss": 1.2621, "num_input_tokens_seen": 1199308800, "step": 18300, "train_runtime": 8795.9903, "train_tokens_per_second": 136347.217 }, { "epoch": 0.184, "grad_norm": 0.5608197450637817, "learning_rate": 0.00027771162772788544, "loss": 1.2746, "num_input_tokens_seen": 1205862400, "step": 18400, "train_runtime": 8844.0918, "train_tokens_per_second": 136346.663 }, { "epoch": 0.185, "grad_norm": 0.5299677848815918, "learning_rate": 0.00027746132401560857, "loss": 1.2608, "num_input_tokens_seen": 1212416000, "step": 18500, "train_runtime": 8890.974, "train_tokens_per_second": 136364.812 }, { "epoch": 0.186, "grad_norm": 0.5247559547424316, "learning_rate": 0.0002772097367692139, "loss": 1.2628, "num_input_tokens_seen": 1218969600, "step": 18600, "train_runtime": 8937.3092, "train_tokens_per_second": 136391.119 }, { "epoch": 0.187, "grad_norm": 0.4991471469402313, "learning_rate": 0.00027695686852218226, "loss": 1.2617, "num_input_tokens_seen": 1225523200, "step": 18700, "train_runtime": 8984.1463, "train_tokens_per_second": 136409.532 }, { "epoch": 0.188, "grad_norm": 0.4922790229320526, "learning_rate": 0.00027670272182089416, "loss": 1.277, "num_input_tokens_seen": 1232076800, "step": 18800, "train_runtime": 9036.4876, "train_tokens_per_second": 136344.656 }, { "epoch": 0.189, "grad_norm": 0.49377188086509705, "learning_rate": 0.0002764472992246039, "loss": 1.2767, "num_input_tokens_seen": 1238630400, "step": 18900, "train_runtime": 9084.3866, "train_tokens_per_second": 136347.169 }, { "epoch": 0.19, "grad_norm": 0.6417357921600342, "learning_rate": 0.0002761906033054143, "loss": 1.2616, "num_input_tokens_seen": 1245184000, "step": 19000, "train_runtime": 9130.7221, "train_tokens_per_second": 136373.004 }, { "epoch": 0.191, "grad_norm": 0.44580140709877014, "learning_rate": 0.00027593263664825045, "loss": 1.2686, "num_input_tokens_seen": 1251737600, "step": 19100, "train_runtime": 9176.6051, "train_tokens_per_second": 136405.303 }, { "epoch": 0.192, "grad_norm": 0.5867856740951538, "learning_rate": 0.00027567340185083363, "loss": 1.2638, "num_input_tokens_seen": 1258291200, "step": 19200, "train_runtime": 9229.719, "train_tokens_per_second": 136330.391 }, { "epoch": 0.193, "grad_norm": 0.4900195896625519, "learning_rate": 0.00027541290152365537, "loss": 1.263, "num_input_tokens_seen": 1264844800, "step": 19300, "train_runtime": 9276.2421, "train_tokens_per_second": 136353.147 }, { "epoch": 0.194, "grad_norm": 0.49572521448135376, "learning_rate": 0.00027515113828995117, "loss": 1.273, "num_input_tokens_seen": 1271398400, "step": 19400, "train_runtime": 9323.5363, "train_tokens_per_second": 136364.396 }, { "epoch": 0.195, "grad_norm": 0.440213680267334, "learning_rate": 0.00027488811478567374, "loss": 1.2657, "num_input_tokens_seen": 1277952000, "step": 19500, "train_runtime": 9371.4717, "train_tokens_per_second": 136366.201 }, { "epoch": 0.196, "grad_norm": 0.5604475736618042, "learning_rate": 0.0002746238336594671, "loss": 1.2619, "num_input_tokens_seen": 1284505600, "step": 19600, "train_runtime": 9417.129, "train_tokens_per_second": 136400.978 }, { "epoch": 0.197, "grad_norm": 0.45344123244285583, "learning_rate": 0.00027435829757263894, "loss": 1.2573, "num_input_tokens_seen": 1291059200, "step": 19700, "train_runtime": 9468.5748, "train_tokens_per_second": 136352.009 }, { "epoch": 0.198, "grad_norm": 0.7260287404060364, "learning_rate": 0.0002740915091991349, "loss": 1.2668, "num_input_tokens_seen": 1297612800, "step": 19800, "train_runtime": 9515.3702, "train_tokens_per_second": 136370.186 }, { "epoch": 0.199, "grad_norm": 0.47865310311317444, "learning_rate": 0.0002738234712255109, "loss": 1.2674, "num_input_tokens_seen": 1304166400, "step": 19900, "train_runtime": 9562.0606, "train_tokens_per_second": 136389.682 }, { "epoch": 0.2, "grad_norm": 0.8422930240631104, "learning_rate": 0.00027355418635090635, "loss": 1.2671, "num_input_tokens_seen": 1310720000, "step": 20000, "train_runtime": 9614.8867, "train_tokens_per_second": 136321.939 }, { "epoch": 0.201, "grad_norm": 0.8500565886497498, "learning_rate": 0.000273283657287017, "loss": 1.2722, "num_input_tokens_seen": 1317273600, "step": 20100, "train_runtime": 9662.5316, "train_tokens_per_second": 136327.999 }, { "epoch": 0.202, "grad_norm": 0.4511219263076782, "learning_rate": 0.00027301188675806745, "loss": 1.257, "num_input_tokens_seen": 1323827200, "step": 20200, "train_runtime": 9710.3614, "train_tokens_per_second": 136331.404 }, { "epoch": 0.203, "grad_norm": 0.6040441393852234, "learning_rate": 0.0002727388775007839, "loss": 1.2787, "num_input_tokens_seen": 1330380800, "step": 20300, "train_runtime": 9757.2415, "train_tokens_per_second": 136348.045 }, { "epoch": 0.204, "grad_norm": 0.531548798084259, "learning_rate": 0.0002724646322643666, "loss": 1.2567, "num_input_tokens_seen": 1336934400, "step": 20400, "train_runtime": 9803.907, "train_tokens_per_second": 136367.512 }, { "epoch": 0.205, "grad_norm": 0.5128377079963684, "learning_rate": 0.000272189153810462, "loss": 1.2634, "num_input_tokens_seen": 1343488000, "step": 20500, "train_runtime": 9849.6975, "train_tokens_per_second": 136398.909 }, { "epoch": 0.206, "grad_norm": 0.5763120651245117, "learning_rate": 0.0002719124449131351, "loss": 1.2708, "num_input_tokens_seen": 1350041600, "step": 20600, "train_runtime": 9902.5747, "train_tokens_per_second": 136332.382 }, { "epoch": 0.207, "grad_norm": 0.5266316533088684, "learning_rate": 0.00027163450835884144, "loss": 1.2579, "num_input_tokens_seen": 1356595200, "step": 20700, "train_runtime": 9950.4471, "train_tokens_per_second": 136335.1 }, { "epoch": 0.208, "grad_norm": 0.6279749274253845, "learning_rate": 0.00027135534694639894, "loss": 1.2566, "num_input_tokens_seen": 1363148800, "step": 20800, "train_runtime": 9997.0613, "train_tokens_per_second": 136354.951 }, { "epoch": 0.209, "grad_norm": 0.5421542525291443, "learning_rate": 0.00027107496348696003, "loss": 1.2687, "num_input_tokens_seen": 1369702400, "step": 20900, "train_runtime": 10044.3146, "train_tokens_per_second": 136365.939 }, { "epoch": 0.21, "grad_norm": 0.5376498699188232, "learning_rate": 0.00027079336080398296, "loss": 1.2772, "num_input_tokens_seen": 1376256000, "step": 21000, "train_runtime": 10090.6051, "train_tokens_per_second": 136389.839 }, { "epoch": 0.211, "grad_norm": 0.41719597578048706, "learning_rate": 0.00027051054173320366, "loss": 1.2502, "num_input_tokens_seen": 1382809600, "step": 21100, "train_runtime": 10143.3243, "train_tokens_per_second": 136327.063 }, { "epoch": 0.212, "grad_norm": 0.4714694321155548, "learning_rate": 0.000270226509122607, "loss": 1.2537, "num_input_tokens_seen": 1389363200, "step": 21200, "train_runtime": 10188.8874, "train_tokens_per_second": 136360.639 }, { "epoch": 0.213, "grad_norm": 0.4616274833679199, "learning_rate": 0.0002699412658323983, "loss": 1.2571, "num_input_tokens_seen": 1395916800, "step": 21300, "train_runtime": 10236.5378, "train_tokens_per_second": 136366.107 }, { "epoch": 0.214, "grad_norm": 0.4215717911720276, "learning_rate": 0.00026965481473497423, "loss": 1.2687, "num_input_tokens_seen": 1402470400, "step": 21400, "train_runtime": 10282.9404, "train_tokens_per_second": 136388.071 }, { "epoch": 0.215, "grad_norm": 0.5976271033287048, "learning_rate": 0.0002693671587148942, "loss": 1.2573, "num_input_tokens_seen": 1409024000, "step": 21500, "train_runtime": 10329.955, "train_tokens_per_second": 136401.756 }, { "epoch": 0.216, "grad_norm": 0.5200098752975464, "learning_rate": 0.0002690783006688511, "loss": 1.247, "num_input_tokens_seen": 1415577600, "step": 21600, "train_runtime": 10382.0767, "train_tokens_per_second": 136348.213 }, { "epoch": 0.217, "grad_norm": 0.8170623779296875, "learning_rate": 0.0002687882435056423, "loss": 1.2562, "num_input_tokens_seen": 1422131200, "step": 21700, "train_runtime": 10429.827, "train_tokens_per_second": 136352.329 }, { "epoch": 0.218, "grad_norm": 0.52497398853302, "learning_rate": 0.0002684969901461402, "loss": 1.2533, "num_input_tokens_seen": 1428684800, "step": 21800, "train_runtime": 10476.8104, "train_tokens_per_second": 136366.388 }, { "epoch": 0.219, "grad_norm": 0.4417087137699127, "learning_rate": 0.000268204543523263, "loss": 1.2721, "num_input_tokens_seen": 1435238400, "step": 21900, "train_runtime": 10524.1028, "train_tokens_per_second": 136376.319 }, { "epoch": 0.22, "grad_norm": 0.5729189515113831, "learning_rate": 0.0002679109065819447, "loss": 1.2654, "num_input_tokens_seen": 1441792000, "step": 22000, "train_runtime": 10572.3447, "train_tokens_per_second": 136373.911 }, { "epoch": 0.221, "grad_norm": 0.5111753940582275, "learning_rate": 0.0002676160822791062, "loss": 1.2581, "num_input_tokens_seen": 1448345600, "step": 22100, "train_runtime": 10619.3771, "train_tokens_per_second": 136387.057 }, { "epoch": 0.222, "grad_norm": 0.4302677512168884, "learning_rate": 0.00026732007358362496, "loss": 1.2581, "num_input_tokens_seen": 1454899200, "step": 22200, "train_runtime": 10666.0714, "train_tokens_per_second": 136404.413 }, { "epoch": 0.223, "grad_norm": 3.9242477416992188, "learning_rate": 0.0002670228834763052, "loss": 1.2872, "num_input_tokens_seen": 1461452800, "step": 22300, "train_runtime": 10719.3985, "train_tokens_per_second": 136337.203 }, { "epoch": 0.224, "grad_norm": 0.7662601470947266, "learning_rate": 0.00026672451494984804, "loss": 1.2602, "num_input_tokens_seen": 1468006400, "step": 22400, "train_runtime": 10767.2807, "train_tokens_per_second": 136339.568 }, { "epoch": 0.225, "grad_norm": 0.48544740676879883, "learning_rate": 0.0002664249710088213, "loss": 1.257, "num_input_tokens_seen": 1474560000, "step": 22500, "train_runtime": 10813.982, "train_tokens_per_second": 136356.802 }, { "epoch": 0.226, "grad_norm": 0.4495686888694763, "learning_rate": 0.00026612425466962893, "loss": 1.2552, "num_input_tokens_seen": 1481113600, "step": 22600, "train_runtime": 10860.2948, "train_tokens_per_second": 136378.766 }, { "epoch": 0.227, "grad_norm": 0.5733143091201782, "learning_rate": 0.00026582236896048134, "loss": 1.2403, "num_input_tokens_seen": 1487667200, "step": 22700, "train_runtime": 10907.2107, "train_tokens_per_second": 136393.001 }, { "epoch": 0.228, "grad_norm": 0.7318263649940491, "learning_rate": 0.00026551931692136413, "loss": 1.2468, "num_input_tokens_seen": 1494220800, "step": 22800, "train_runtime": 10953.9499, "train_tokens_per_second": 136409.315 }, { "epoch": 0.229, "grad_norm": 0.5192084312438965, "learning_rate": 0.00026521510160400804, "loss": 1.2458, "num_input_tokens_seen": 1500774400, "step": 22900, "train_runtime": 11006.6198, "train_tokens_per_second": 136351.98 }, { "epoch": 0.23, "grad_norm": 0.4651305079460144, "learning_rate": 0.00026490972607185793, "loss": 1.2601, "num_input_tokens_seen": 1507328000, "step": 23000, "train_runtime": 11053.8305, "train_tokens_per_second": 136362.504 }, { "epoch": 0.231, "grad_norm": 0.5470275282859802, "learning_rate": 0.0002646031934000421, "loss": 1.2405, "num_input_tokens_seen": 1513881600, "step": 23100, "train_runtime": 11099.6418, "train_tokens_per_second": 136390.132 }, { "epoch": 0.232, "grad_norm": 0.519235372543335, "learning_rate": 0.00026429550667534095, "loss": 1.2586, "num_input_tokens_seen": 1520435200, "step": 23200, "train_runtime": 11152.1986, "train_tokens_per_second": 136335.018 }, { "epoch": 0.233, "grad_norm": 0.4892626404762268, "learning_rate": 0.0002639866689961565, "loss": 1.2595, "num_input_tokens_seen": 1526988800, "step": 23300, "train_runtime": 11199.2653, "train_tokens_per_second": 136347.23 }, { "epoch": 0.234, "grad_norm": 0.4089221656322479, "learning_rate": 0.00026367668347248083, "loss": 1.2393, "num_input_tokens_seen": 1533542400, "step": 23400, "train_runtime": 11247.6635, "train_tokens_per_second": 136343.196 }, { "epoch": 0.235, "grad_norm": 0.467582106590271, "learning_rate": 0.0002633655532258646, "loss": 1.2534, "num_input_tokens_seen": 1540096000, "step": 23500, "train_runtime": 11294.1646, "train_tokens_per_second": 136362.099 }, { "epoch": 0.236, "grad_norm": 0.48117080330848694, "learning_rate": 0.000263053281389386, "loss": 1.2644, "num_input_tokens_seen": 1546649600, "step": 23600, "train_runtime": 11340.9021, "train_tokens_per_second": 136378.004 }, { "epoch": 0.237, "grad_norm": 0.4495629072189331, "learning_rate": 0.0002627398711076189, "loss": 1.2442, "num_input_tokens_seen": 1553203200, "step": 23700, "train_runtime": 11387.7566, "train_tokens_per_second": 136392.377 }, { "epoch": 0.238, "grad_norm": 0.4376384913921356, "learning_rate": 0.0002624253255366014, "loss": 1.2489, "num_input_tokens_seen": 1559756800, "step": 23800, "train_runtime": 11439.8893, "train_tokens_per_second": 136343.696 }, { "epoch": 0.239, "grad_norm": 0.4419648349285126, "learning_rate": 0.0002621096478438039, "loss": 1.2353, "num_input_tokens_seen": 1566310400, "step": 23900, "train_runtime": 11486.001, "train_tokens_per_second": 136366.904 }, { "epoch": 0.24, "grad_norm": 0.669739305973053, "learning_rate": 0.00026179284120809727, "loss": 1.2528, "num_input_tokens_seen": 1572864000, "step": 24000, "train_runtime": 11533.9608, "train_tokens_per_second": 136368.072 }, { "epoch": 0.241, "grad_norm": 0.4047415554523468, "learning_rate": 0.0002614749088197208, "loss": 1.2679, "num_input_tokens_seen": 1579417600, "step": 24100, "train_runtime": 11582.9583, "train_tokens_per_second": 136357.013 }, { "epoch": 0.242, "grad_norm": 0.5224933624267578, "learning_rate": 0.00026115585388025015, "loss": 1.2425, "num_input_tokens_seen": 1585971200, "step": 24200, "train_runtime": 11630.022, "train_tokens_per_second": 136368.719 }, { "epoch": 0.243, "grad_norm": 0.5125856399536133, "learning_rate": 0.00026083567960256493, "loss": 1.2423, "num_input_tokens_seen": 1592524800, "step": 24300, "train_runtime": 11677.13, "train_tokens_per_second": 136379.813 }, { "epoch": 0.244, "grad_norm": 0.5344144701957703, "learning_rate": 0.00026051438921081667, "loss": 1.2431, "num_input_tokens_seen": 1599078400, "step": 24400, "train_runtime": 11723.5349, "train_tokens_per_second": 136398.997 }, { "epoch": 0.245, "grad_norm": 0.4386890232563019, "learning_rate": 0.00026019198594039595, "loss": 1.2426, "num_input_tokens_seen": 1605632000, "step": 24500, "train_runtime": 11773.1296, "train_tokens_per_second": 136381.069 }, { "epoch": 0.246, "grad_norm": 0.4986630082130432, "learning_rate": 0.00025986847303790026, "loss": 1.2531, "num_input_tokens_seen": 1612185600, "step": 24600, "train_runtime": 11820.6579, "train_tokens_per_second": 136387.13 }, { "epoch": 0.247, "grad_norm": 0.5271715521812439, "learning_rate": 0.00025954385376110076, "loss": 1.249, "num_input_tokens_seen": 1618739200, "step": 24700, "train_runtime": 11867.4874, "train_tokens_per_second": 136401.172 }, { "epoch": 0.248, "grad_norm": 0.45263609290122986, "learning_rate": 0.00025921813137891005, "loss": 1.2507, "num_input_tokens_seen": 1625292800, "step": 24800, "train_runtime": 11919.9131, "train_tokens_per_second": 136351.061 }, { "epoch": 0.249, "grad_norm": 0.5932081937789917, "learning_rate": 0.000258891309171349, "loss": 1.2438, "num_input_tokens_seen": 1631846400, "step": 24900, "train_runtime": 11962.6395, "train_tokens_per_second": 136411.902 }, { "epoch": 0.25, "grad_norm": 0.5539859533309937, "learning_rate": 0.00025856339042951344, "loss": 1.2548, "num_input_tokens_seen": 1638400000, "step": 25000, "train_runtime": 12014.9411, "train_tokens_per_second": 136363.548 }, { "epoch": 0.251, "grad_norm": 0.5236772298812866, "learning_rate": 0.0002582343784555415, "loss": 1.2386, "num_input_tokens_seen": 1644953600, "step": 25100, "train_runtime": 12062.3997, "train_tokens_per_second": 136370.344 }, { "epoch": 0.252, "grad_norm": 0.5913048982620239, "learning_rate": 0.00025790427656258017, "loss": 1.2354, "num_input_tokens_seen": 1651507200, "step": 25200, "train_runtime": 12108.5333, "train_tokens_per_second": 136392.01 }, { "epoch": 0.253, "grad_norm": 0.5929732322692871, "learning_rate": 0.00025757308807475185, "loss": 1.2582, "num_input_tokens_seen": 1658060800, "step": 25300, "train_runtime": 12154.8252, "train_tokens_per_second": 136411.736 }, { "epoch": 0.254, "grad_norm": 0.4542764723300934, "learning_rate": 0.00025724081632712086, "loss": 1.2488, "num_input_tokens_seen": 1664614400, "step": 25400, "train_runtime": 12207.8935, "train_tokens_per_second": 136355.58 }, { "epoch": 0.255, "grad_norm": 1.0848513841629028, "learning_rate": 0.0002569074646656601, "loss": 1.2375, "num_input_tokens_seen": 1671168000, "step": 25500, "train_runtime": 12254.3162, "train_tokens_per_second": 136373.827 }, { "epoch": 0.256, "grad_norm": 0.5190780162811279, "learning_rate": 0.00025657303644721695, "loss": 1.236, "num_input_tokens_seen": 1677721600, "step": 25600, "train_runtime": 12301.2378, "train_tokens_per_second": 136386.405 }, { "epoch": 0.257, "grad_norm": 0.43418362736701965, "learning_rate": 0.00025623753503948004, "loss": 1.2484, "num_input_tokens_seen": 1684275200, "step": 25700, "train_runtime": 12347.684, "train_tokens_per_second": 136404.138 }, { "epoch": 0.258, "grad_norm": 0.4586409032344818, "learning_rate": 0.00025590096382094475, "loss": 1.2674, "num_input_tokens_seen": 1690828800, "step": 25800, "train_runtime": 12394.5809, "train_tokens_per_second": 136416.778 }, { "epoch": 0.259, "grad_norm": 0.5069702863693237, "learning_rate": 0.00025556332618087945, "loss": 1.2428, "num_input_tokens_seen": 1697382400, "step": 25900, "train_runtime": 12447.2116, "train_tokens_per_second": 136366.478 }, { "epoch": 0.26, "grad_norm": 0.591788649559021, "learning_rate": 0.00025522462551929155, "loss": 1.2417, "num_input_tokens_seen": 1703936000, "step": 26000, "train_runtime": 12492.8891, "train_tokens_per_second": 136392.47 }, { "epoch": 0.261, "grad_norm": 0.6001791954040527, "learning_rate": 0.00025488486524689283, "loss": 1.2407, "num_input_tokens_seen": 1710489600, "step": 26100, "train_runtime": 12539.4548, "train_tokens_per_second": 136408.61 }, { "epoch": 0.262, "grad_norm": 0.47005897760391235, "learning_rate": 0.00025454404878506555, "loss": 1.2558, "num_input_tokens_seen": 1717043200, "step": 26200, "train_runtime": 12587.1655, "train_tokens_per_second": 136412.221 }, { "epoch": 0.263, "grad_norm": 0.42708972096443176, "learning_rate": 0.0002542021795658276, "loss": 1.2445, "num_input_tokens_seen": 1723596800, "step": 26300, "train_runtime": 12634.1294, "train_tokens_per_second": 136423.868 }, { "epoch": 0.264, "grad_norm": 0.48100486397743225, "learning_rate": 0.0002538592610317984, "loss": 1.2416, "num_input_tokens_seen": 1730150400, "step": 26400, "train_runtime": 12686.5075, "train_tokens_per_second": 136377.202 }, { "epoch": 0.265, "grad_norm": 0.5689502954483032, "learning_rate": 0.00025351529663616355, "loss": 1.2476, "num_input_tokens_seen": 1736704000, "step": 26500, "train_runtime": 12733.1403, "train_tokens_per_second": 136392.435 }, { "epoch": 0.266, "grad_norm": 0.3999510705471039, "learning_rate": 0.00025317028984264087, "loss": 1.2507, "num_input_tokens_seen": 1743257600, "step": 26600, "train_runtime": 12780.4326, "train_tokens_per_second": 136400.515 }, { "epoch": 0.267, "grad_norm": 0.4349440336227417, "learning_rate": 0.0002528242441254448, "loss": 1.2359, "num_input_tokens_seen": 1749811200, "step": 26700, "train_runtime": 12826.6298, "train_tokens_per_second": 136420.184 }, { "epoch": 0.268, "grad_norm": 0.40468648076057434, "learning_rate": 0.000252477162969252, "loss": 1.2463, "num_input_tokens_seen": 1756364800, "step": 26800, "train_runtime": 12873.4848, "train_tokens_per_second": 136432.739 }, { "epoch": 0.269, "grad_norm": 0.5858653783798218, "learning_rate": 0.00025212904986916584, "loss": 1.2385, "num_input_tokens_seen": 1762918400, "step": 26900, "train_runtime": 12926.2009, "train_tokens_per_second": 136383.336 }, { "epoch": 0.27, "grad_norm": 0.4621046483516693, "learning_rate": 0.00025177990833068133, "loss": 1.2366, "num_input_tokens_seen": 1769472000, "step": 27000, "train_runtime": 12973.4952, "train_tokens_per_second": 136391.31 }, { "epoch": 0.271, "grad_norm": 0.4884892404079437, "learning_rate": 0.0002514297418696499, "loss": 1.2436, "num_input_tokens_seen": 1776025600, "step": 27100, "train_runtime": 13021.2871, "train_tokens_per_second": 136394.013 }, { "epoch": 0.272, "grad_norm": 0.5108981132507324, "learning_rate": 0.0002510785540122439, "loss": 1.2423, "num_input_tokens_seen": 1782579200, "step": 27200, "train_runtime": 13068.0423, "train_tokens_per_second": 136407.517 }, { "epoch": 0.273, "grad_norm": 0.3898067772388458, "learning_rate": 0.0002507263482949212, "loss": 1.2415, "num_input_tokens_seen": 1789132800, "step": 27300, "train_runtime": 13113.8421, "train_tokens_per_second": 136430.864 }, { "epoch": 0.274, "grad_norm": 0.5622383952140808, "learning_rate": 0.0002503731282643894, "loss": 1.2378, "num_input_tokens_seen": 1795686400, "step": 27400, "train_runtime": 13161.1635, "train_tokens_per_second": 136438.272 }, { "epoch": 0.275, "grad_norm": 0.7748796343803406, "learning_rate": 0.0002500188974775704, "loss": 1.248, "num_input_tokens_seen": 1802240000, "step": 27500, "train_runtime": 13209.4471, "train_tokens_per_second": 136435.688 }, { "epoch": 0.276, "grad_norm": 0.8867826461791992, "learning_rate": 0.00024966365950156416, "loss": 1.2409, "num_input_tokens_seen": 1808793600, "step": 27600, "train_runtime": 13256.4066, "train_tokens_per_second": 136446.751 }, { "epoch": 0.277, "grad_norm": 0.49997836351394653, "learning_rate": 0.00024930741791361326, "loss": 1.2382, "num_input_tokens_seen": 1815347200, "step": 27700, "train_runtime": 13309.6196, "train_tokens_per_second": 136393.62 }, { "epoch": 0.278, "grad_norm": 0.5048521161079407, "learning_rate": 0.0002489501763010664, "loss": 1.2351, "num_input_tokens_seen": 1821900800, "step": 27800, "train_runtime": 13356.706, "train_tokens_per_second": 136403.451 }, { "epoch": 0.279, "grad_norm": 0.5528578162193298, "learning_rate": 0.00024859193826134285, "loss": 1.2404, "num_input_tokens_seen": 1828454400, "step": 27900, "train_runtime": 13405.5813, "train_tokens_per_second": 136395.01 }, { "epoch": 0.28, "grad_norm": 0.44376805424690247, "learning_rate": 0.00024823270740189556, "loss": 1.2461, "num_input_tokens_seen": 1835008000, "step": 28000, "train_runtime": 13452.7686, "train_tokens_per_second": 136403.743 }, { "epoch": 0.281, "grad_norm": 0.5072674751281738, "learning_rate": 0.00024787248734017527, "loss": 1.2301, "num_input_tokens_seen": 1841561600, "step": 28100, "train_runtime": 13501.0413, "train_tokens_per_second": 136401.449 }, { "epoch": 0.282, "grad_norm": 0.46835577487945557, "learning_rate": 0.0002475112817035941, "loss": 1.237, "num_input_tokens_seen": 1848115200, "step": 28200, "train_runtime": 13547.4814, "train_tokens_per_second": 136417.622 }, { "epoch": 0.283, "grad_norm": 0.4893036186695099, "learning_rate": 0.0002471490941294887, "loss": 1.2612, "num_input_tokens_seen": 1854668800, "step": 28300, "train_runtime": 13593.9904, "train_tokens_per_second": 136432.993 }, { "epoch": 0.284, "grad_norm": 0.66542649269104, "learning_rate": 0.000246785928265084, "loss": 1.2405, "num_input_tokens_seen": 1861222400, "step": 28400, "train_runtime": 13646.3147, "train_tokens_per_second": 136390.113 }, { "epoch": 0.285, "grad_norm": 0.669306755065918, "learning_rate": 0.0002464217877674562, "loss": 1.2409, "num_input_tokens_seen": 1867776000, "step": 28500, "train_runtime": 13692.502, "train_tokens_per_second": 136408.671 }, { "epoch": 0.286, "grad_norm": 0.43464845418930054, "learning_rate": 0.0002460566763034961, "loss": 1.2435, "num_input_tokens_seen": 1874329600, "step": 28600, "train_runtime": 13738.7564, "train_tokens_per_second": 136426.438 }, { "epoch": 0.287, "grad_norm": 0.5084187388420105, "learning_rate": 0.00024569059754987196, "loss": 1.2572, "num_input_tokens_seen": 1880883200, "step": 28700, "train_runtime": 13785.6191, "train_tokens_per_second": 136438.065 }, { "epoch": 0.288, "grad_norm": 0.473603755235672, "learning_rate": 0.00024532355519299296, "loss": 1.2459, "num_input_tokens_seen": 1887436800, "step": 28800, "train_runtime": 13838.5181, "train_tokens_per_second": 136390.095 }, { "epoch": 0.289, "grad_norm": 0.493012011051178, "learning_rate": 0.0002449555529289714, "loss": 1.243, "num_input_tokens_seen": 1893990400, "step": 28900, "train_runtime": 13886.1283, "train_tokens_per_second": 136394.419 }, { "epoch": 0.29, "grad_norm": 0.7421333193778992, "learning_rate": 0.0002445865944635861, "loss": 1.2455, "num_input_tokens_seen": 1900544000, "step": 29000, "train_runtime": 13931.9406, "train_tokens_per_second": 136416.315 }, { "epoch": 0.291, "grad_norm": 0.5027185678482056, "learning_rate": 0.0002442166835122446, "loss": 1.2686, "num_input_tokens_seen": 1907097600, "step": 29100, "train_runtime": 13980.446, "train_tokens_per_second": 136411.785 }, { "epoch": 0.292, "grad_norm": 0.48427557945251465, "learning_rate": 0.00024384582379994614, "loss": 1.2369, "num_input_tokens_seen": 1913651200, "step": 29200, "train_runtime": 14028.0456, "train_tokens_per_second": 136416.095 }, { "epoch": 0.293, "grad_norm": 0.6620755195617676, "learning_rate": 0.00024347401906124388, "loss": 1.2317, "num_input_tokens_seen": 1920204800, "step": 29300, "train_runtime": 14074.3372, "train_tokens_per_second": 136433.054 }, { "epoch": 0.294, "grad_norm": 0.5745883584022522, "learning_rate": 0.0002431012730402075, "loss": 1.2443, "num_input_tokens_seen": 1926758400, "step": 29400, "train_runtime": 14125.645, "train_tokens_per_second": 136401.446 }, { "epoch": 0.295, "grad_norm": 0.441680908203125, "learning_rate": 0.00024272758949038517, "loss": 1.2393, "num_input_tokens_seen": 1933312000, "step": 29500, "train_runtime": 14172.5336, "train_tokens_per_second": 136412.588 }, { "epoch": 0.296, "grad_norm": 0.4417046904563904, "learning_rate": 0.00024235297217476616, "loss": 1.2371, "num_input_tokens_seen": 1939865600, "step": 29600, "train_runtime": 14220.1572, "train_tokens_per_second": 136416.608 }, { "epoch": 0.297, "grad_norm": 0.5888639688491821, "learning_rate": 0.00024197742486574268, "loss": 1.2344, "num_input_tokens_seen": 1946419200, "step": 29700, "train_runtime": 14267.366, "train_tokens_per_second": 136424.565 }, { "epoch": 0.298, "grad_norm": 0.4625283479690552, "learning_rate": 0.0002416009513450719, "loss": 1.2373, "num_input_tokens_seen": 1952972800, "step": 29800, "train_runtime": 14318.8989, "train_tokens_per_second": 136391.27 }, { "epoch": 0.299, "grad_norm": 0.47661375999450684, "learning_rate": 0.00024122355540383806, "loss": 1.2454, "num_input_tokens_seen": 1959526400, "step": 29900, "train_runtime": 14365.8797, "train_tokens_per_second": 136401.42 }, { "epoch": 0.3, "grad_norm": 0.727032482624054, "learning_rate": 0.00024084524084241405, "loss": 1.2379, "num_input_tokens_seen": 1966080000, "step": 30000, "train_runtime": 14415.1273, "train_tokens_per_second": 136390.055 }, { "epoch": 0.301, "grad_norm": 0.45500555634498596, "learning_rate": 0.00024046601147042332, "loss": 1.2358, "num_input_tokens_seen": 1972633600, "step": 30100, "train_runtime": 14461.5845, "train_tokens_per_second": 136405.08 }, { "epoch": 0.302, "grad_norm": 0.44596830010414124, "learning_rate": 0.0002400858711067015, "loss": 1.2301, "num_input_tokens_seen": 1979187200, "step": 30200, "train_runtime": 14508.0707, "train_tokens_per_second": 136419.737 }, { "epoch": 0.303, "grad_norm": 0.4207491874694824, "learning_rate": 0.00023970482357925772, "loss": 1.2441, "num_input_tokens_seen": 1985740800, "step": 30300, "train_runtime": 14555.5751, "train_tokens_per_second": 136424.757 }, { "epoch": 0.304, "grad_norm": 0.4833202064037323, "learning_rate": 0.00023932287272523646, "loss": 1.2351, "num_input_tokens_seen": 1992294400, "step": 30400, "train_runtime": 14601.9546, "train_tokens_per_second": 136440.255 }, { "epoch": 0.305, "grad_norm": 0.5268282294273376, "learning_rate": 0.00023894002239087847, "loss": 1.2384, "num_input_tokens_seen": 1998848000, "step": 30500, "train_runtime": 14654.2539, "train_tokens_per_second": 136400.53 }, { "epoch": 0.306, "grad_norm": 0.4639832377433777, "learning_rate": 0.0002385562764314825, "loss": 1.3007, "num_input_tokens_seen": 2005401600, "step": 30600, "train_runtime": 14702.026, "train_tokens_per_second": 136403.078 }, { "epoch": 0.307, "grad_norm": 0.526703953742981, "learning_rate": 0.00023817163871136596, "loss": 1.2481, "num_input_tokens_seen": 2011955200, "step": 30700, "train_runtime": 14749.4458, "train_tokens_per_second": 136408.868 }, { "epoch": 0.308, "grad_norm": 0.43404075503349304, "learning_rate": 0.00023778611310382652, "loss": 1.2273, "num_input_tokens_seen": 2018508800, "step": 30800, "train_runtime": 14796.5936, "train_tokens_per_second": 136417.128 }, { "epoch": 0.309, "grad_norm": 0.39956456422805786, "learning_rate": 0.0002373997034911027, "loss": 1.2275, "num_input_tokens_seen": 2025062400, "step": 30900, "train_runtime": 14843.3887, "train_tokens_per_second": 136428.578 }, { "epoch": 0.31, "grad_norm": 0.46024298667907715, "learning_rate": 0.00023701241376433506, "loss": 1.2353, "num_input_tokens_seen": 2031616000, "step": 31000, "train_runtime": 14890.8282, "train_tokens_per_second": 136434.05 }, { "epoch": 0.311, "grad_norm": 0.38429203629493713, "learning_rate": 0.0002366242478235268, "loss": 1.2403, "num_input_tokens_seen": 2038169600, "step": 31100, "train_runtime": 14937.8781, "train_tokens_per_second": 136443.047 }, { "epoch": 0.312, "grad_norm": 0.5401485562324524, "learning_rate": 0.00023623520957750471, "loss": 1.2273, "num_input_tokens_seen": 2044723200, "step": 31200, "train_runtime": 14990.0842, "train_tokens_per_second": 136405.051 }, { "epoch": 0.313, "grad_norm": 0.5360187888145447, "learning_rate": 0.00023584530294387953, "loss": 1.2312, "num_input_tokens_seen": 2051276800, "step": 31300, "train_runtime": 15037.4257, "train_tokens_per_second": 136411.434 }, { "epoch": 0.314, "grad_norm": 0.4468795359134674, "learning_rate": 0.00023545453184900682, "loss": 1.2383, "num_input_tokens_seen": 2057830400, "step": 31400, "train_runtime": 15083.4771, "train_tokens_per_second": 136429.444 }, { "epoch": 0.315, "grad_norm": 0.4575517177581787, "learning_rate": 0.00023506290022794706, "loss": 1.2354, "num_input_tokens_seen": 2064384000, "step": 31500, "train_runtime": 15131.2692, "train_tokens_per_second": 136431.648 }, { "epoch": 0.316, "grad_norm": 0.7983475923538208, "learning_rate": 0.00023467041202442643, "loss": 1.2309, "num_input_tokens_seen": 2070937600, "step": 31600, "train_runtime": 15178.6218, "train_tokens_per_second": 136437.789 }, { "epoch": 0.317, "grad_norm": 0.4316498339176178, "learning_rate": 0.00023427707119079669, "loss": 1.2462, "num_input_tokens_seen": 2077491200, "step": 31700, "train_runtime": 15225.1881, "train_tokens_per_second": 136450.938 }, { "epoch": 0.318, "grad_norm": 0.5765666365623474, "learning_rate": 0.0002338828816879957, "loss": 1.2367, "num_input_tokens_seen": 2084044800, "step": 31800, "train_runtime": 15277.5735, "train_tokens_per_second": 136412.029 }, { "epoch": 0.319, "grad_norm": 0.44825831055641174, "learning_rate": 0.00023348784748550744, "loss": 1.2354, "num_input_tokens_seen": 2090598400, "step": 31900, "train_runtime": 15324.8285, "train_tokens_per_second": 136419.04 }, { "epoch": 0.32, "grad_norm": 0.5602436661720276, "learning_rate": 0.00023309197256132184, "loss": 1.2324, "num_input_tokens_seen": 2097152000, "step": 32000, "train_runtime": 15371.4775, "train_tokens_per_second": 136431.387 }, { "epoch": 0.321, "grad_norm": 0.4002476930618286, "learning_rate": 0.00023269526090189505, "loss": 1.2396, "num_input_tokens_seen": 2103705600, "step": 32100, "train_runtime": 15419.2672, "train_tokens_per_second": 136433.565 }, { "epoch": 0.322, "grad_norm": 0.4306688606739044, "learning_rate": 0.00023229771650210907, "loss": 1.2468, "num_input_tokens_seen": 2110259200, "step": 32200, "train_runtime": 15466.1068, "train_tokens_per_second": 136444.111 }, { "epoch": 0.323, "grad_norm": 0.584658145904541, "learning_rate": 0.00023189934336523163, "loss": 1.2459, "num_input_tokens_seen": 2116812800, "step": 32300, "train_runtime": 15513.277, "train_tokens_per_second": 136451.686 }, { "epoch": 0.324, "grad_norm": 0.4049496352672577, "learning_rate": 0.00023150014550287574, "loss": 1.2455, "num_input_tokens_seen": 2123366400, "step": 32400, "train_runtime": 15565.7808, "train_tokens_per_second": 136412.456 }, { "epoch": 0.325, "grad_norm": 0.45713433623313904, "learning_rate": 0.00023110012693495943, "loss": 1.2308, "num_input_tokens_seen": 2129920000, "step": 32500, "train_runtime": 15610.6324, "train_tokens_per_second": 136440.341 }, { "epoch": 0.326, "grad_norm": 0.5710960030555725, "learning_rate": 0.00023069929168966527, "loss": 1.2434, "num_input_tokens_seen": 2136473600, "step": 32600, "train_runtime": 15657.7335, "train_tokens_per_second": 136448.458 }, { "epoch": 0.327, "grad_norm": 0.5807371735572815, "learning_rate": 0.0002302976438033997, "loss": 1.2292, "num_input_tokens_seen": 2143027200, "step": 32700, "train_runtime": 15710.1819, "train_tokens_per_second": 136410.082 }, { "epoch": 0.328, "grad_norm": 0.4462313652038574, "learning_rate": 0.0002298951873207525, "loss": 1.2427, "num_input_tokens_seen": 2149580800, "step": 32800, "train_runtime": 15757.3708, "train_tokens_per_second": 136417.479 }, { "epoch": 0.329, "grad_norm": 0.6099971532821655, "learning_rate": 0.00022949192629445606, "loss": 1.2313, "num_input_tokens_seen": 2156134400, "step": 32900, "train_runtime": 15804.1823, "train_tokens_per_second": 136428.089 }, { "epoch": 0.33, "grad_norm": 0.8630947470664978, "learning_rate": 0.0002290878647853443, "loss": 1.247, "num_input_tokens_seen": 2162688000, "step": 33000, "train_runtime": 15852.2039, "train_tokens_per_second": 136428.223 }, { "epoch": 0.331, "grad_norm": 0.5154317021369934, "learning_rate": 0.00022868300686231224, "loss": 1.2246, "num_input_tokens_seen": 2169241600, "step": 33100, "train_runtime": 15899.5617, "train_tokens_per_second": 136434.05 }, { "epoch": 0.332, "grad_norm": 0.5033185482025146, "learning_rate": 0.00022827735660227457, "loss": 1.2271, "num_input_tokens_seen": 2175795200, "step": 33200, "train_runtime": 15947.1716, "train_tokens_per_second": 136437.686 }, { "epoch": 0.333, "grad_norm": 0.7760284543037415, "learning_rate": 0.000227870918090125, "loss": 1.2445, "num_input_tokens_seen": 2182348800, "step": 33300, "train_runtime": 16000.1889, "train_tokens_per_second": 136395.189 }, { "epoch": 0.334, "grad_norm": 0.5042400360107422, "learning_rate": 0.00022746369541869476, "loss": 1.223, "num_input_tokens_seen": 2188902400, "step": 33400, "train_runtime": 16047.8873, "train_tokens_per_second": 136398.166 }, { "epoch": 0.335, "grad_norm": 0.421273410320282, "learning_rate": 0.00022705569268871163, "loss": 1.2222, "num_input_tokens_seen": 2195456000, "step": 33500, "train_runtime": 16094.6711, "train_tokens_per_second": 136408.876 }, { "epoch": 0.336, "grad_norm": 0.48292359709739685, "learning_rate": 0.00022664691400875865, "loss": 1.222, "num_input_tokens_seen": 2202009600, "step": 33600, "train_runtime": 16143.6943, "train_tokens_per_second": 136400.601 }, { "epoch": 0.337, "grad_norm": 0.4301004409790039, "learning_rate": 0.00022623736349523254, "loss": 1.2308, "num_input_tokens_seen": 2208563200, "step": 33700, "train_runtime": 16189.7469, "train_tokens_per_second": 136417.401 }, { "epoch": 0.338, "grad_norm": 0.6592893600463867, "learning_rate": 0.00022582704527230238, "loss": 1.2401, "num_input_tokens_seen": 2215116800, "step": 33800, "train_runtime": 16235.6512, "train_tokens_per_second": 136435.353 }, { "epoch": 0.339, "grad_norm": 0.6183221340179443, "learning_rate": 0.0002254159634718682, "loss": 1.2364, "num_input_tokens_seen": 2221670400, "step": 33900, "train_runtime": 16283.1306, "train_tokens_per_second": 136440.003 }, { "epoch": 0.34, "grad_norm": 0.529971182346344, "learning_rate": 0.00022500412223351915, "loss": 1.2222, "num_input_tokens_seen": 2228224000, "step": 34000, "train_runtime": 16330.1955, "train_tokens_per_second": 136448.091 }, { "epoch": 0.341, "grad_norm": 0.41906896233558655, "learning_rate": 0.0002245915257044919, "loss": 1.2261, "num_input_tokens_seen": 2234777600, "step": 34100, "train_runtime": 16381.7912, "train_tokens_per_second": 136418.391 }, { "epoch": 0.342, "grad_norm": 0.4326164722442627, "learning_rate": 0.00022417817803962892, "loss": 1.2452, "num_input_tokens_seen": 2241331200, "step": 34200, "train_runtime": 16429.3997, "train_tokens_per_second": 136421.978 }, { "epoch": 0.343, "grad_norm": 0.8329346179962158, "learning_rate": 0.0002237640834013366, "loss": 1.2197, "num_input_tokens_seen": 2247884800, "step": 34300, "train_runtime": 16476.2139, "train_tokens_per_second": 136432.121 }, { "epoch": 0.344, "grad_norm": 0.4649752378463745, "learning_rate": 0.0002233492459595434, "loss": 1.2255, "num_input_tokens_seen": 2254438400, "step": 34400, "train_runtime": 16523.092, "train_tokens_per_second": 136441.678 }, { "epoch": 0.345, "grad_norm": 0.5218563675880432, "learning_rate": 0.00022293366989165772, "loss": 1.2365, "num_input_tokens_seen": 2260992000, "step": 34500, "train_runtime": 16575.1624, "train_tokens_per_second": 136408.437 }, { "epoch": 0.346, "grad_norm": 0.8002403974533081, "learning_rate": 0.00022251735938252587, "loss": 1.2179, "num_input_tokens_seen": 2267545600, "step": 34600, "train_runtime": 16622.274, "train_tokens_per_second": 136416.088 }, { "epoch": 0.347, "grad_norm": 0.5648475289344788, "learning_rate": 0.0002221003186243902, "loss": 1.2301, "num_input_tokens_seen": 2274099200, "step": 34700, "train_runtime": 16668.9107, "train_tokens_per_second": 136427.583 }, { "epoch": 0.348, "grad_norm": 0.4631340801715851, "learning_rate": 0.00022168255181684643, "loss": 1.2292, "num_input_tokens_seen": 2280652800, "step": 34800, "train_runtime": 16715.4649, "train_tokens_per_second": 136439.687 }, { "epoch": 0.349, "grad_norm": 0.4492770731449127, "learning_rate": 0.00022126406316680172, "loss": 1.226, "num_input_tokens_seen": 2287206400, "step": 34900, "train_runtime": 16761.744, "train_tokens_per_second": 136453.963 }, { "epoch": 0.35, "grad_norm": 0.5984812378883362, "learning_rate": 0.00022084485688843208, "loss": 1.2332, "num_input_tokens_seen": 2293760000, "step": 35000, "train_runtime": 16816.4332, "train_tokens_per_second": 136399.912 }, { "epoch": 0.351, "grad_norm": 0.6245887875556946, "learning_rate": 0.00022042493720314003, "loss": 1.2324, "num_input_tokens_seen": 2300313600, "step": 35100, "train_runtime": 16864.2018, "train_tokens_per_second": 136402.163 }, { "epoch": 0.352, "grad_norm": 0.6719664335250854, "learning_rate": 0.00022000430833951228, "loss": 1.2272, "num_input_tokens_seen": 2306867200, "step": 35200, "train_runtime": 16910.313, "train_tokens_per_second": 136417.77 }, { "epoch": 0.353, "grad_norm": 0.43880173563957214, "learning_rate": 0.00021958297453327673, "loss": 1.2572, "num_input_tokens_seen": 2313420800, "step": 35300, "train_runtime": 16958.9376, "train_tokens_per_second": 136413.073 }, { "epoch": 0.354, "grad_norm": 0.6195557713508606, "learning_rate": 0.00021916094002726012, "loss": 1.2299, "num_input_tokens_seen": 2319974400, "step": 35400, "train_runtime": 17005.9814, "train_tokens_per_second": 136421.083 }, { "epoch": 0.355, "grad_norm": 0.5288188457489014, "learning_rate": 0.00021873820907134534, "loss": 1.2157, "num_input_tokens_seen": 2326528000, "step": 35500, "train_runtime": 17053.3579, "train_tokens_per_second": 136426.387 }, { "epoch": 0.356, "grad_norm": 0.4962466061115265, "learning_rate": 0.0002183147859224283, "loss": 1.2282, "num_input_tokens_seen": 2333081600, "step": 35600, "train_runtime": 17099.0541, "train_tokens_per_second": 136445.068 }, { "epoch": 0.357, "grad_norm": 0.4940129518508911, "learning_rate": 0.00021789067484437544, "loss": 1.2349, "num_input_tokens_seen": 2339635200, "step": 35700, "train_runtime": 17146.892, "train_tokens_per_second": 136446.605 }, { "epoch": 0.358, "grad_norm": 0.5929033160209656, "learning_rate": 0.00021746588010798068, "loss": 1.2368, "num_input_tokens_seen": 2346188800, "step": 35800, "train_runtime": 17199.6266, "train_tokens_per_second": 136409.287 }, { "epoch": 0.359, "grad_norm": 0.4825666546821594, "learning_rate": 0.00021704040599092216, "loss": 1.2215, "num_input_tokens_seen": 2352742400, "step": 35900, "train_runtime": 17246.2748, "train_tokens_per_second": 136420.324 }, { "epoch": 0.36, "grad_norm": 0.4572449028491974, "learning_rate": 0.00021661425677771965, "loss": 1.2291, "num_input_tokens_seen": 2359296000, "step": 36000, "train_runtime": 17292.1332, "train_tokens_per_second": 136437.533 }, { "epoch": 0.361, "grad_norm": 0.467132568359375, "learning_rate": 0.00021618743675969095, "loss": 1.2295, "num_input_tokens_seen": 2365849600, "step": 36100, "train_runtime": 17339.1599, "train_tokens_per_second": 136445.457 }, { "epoch": 0.362, "grad_norm": 0.4863705635070801, "learning_rate": 0.0002157599502349089, "loss": 1.2154, "num_input_tokens_seen": 2372403200, "step": 36200, "train_runtime": 17386.7454, "train_tokens_per_second": 136448.952 }, { "epoch": 0.363, "grad_norm": 0.43923652172088623, "learning_rate": 0.00021533180150815802, "loss": 1.2268, "num_input_tokens_seen": 2378956800, "step": 36300, "train_runtime": 17439.0785, "train_tokens_per_second": 136415.282 }, { "epoch": 0.364, "grad_norm": 0.5028465390205383, "learning_rate": 0.00021490299489089132, "loss": 1.2293, "num_input_tokens_seen": 2385510400, "step": 36400, "train_runtime": 17485.9662, "train_tokens_per_second": 136424.283 }, { "epoch": 0.365, "grad_norm": 0.4366530478000641, "learning_rate": 0.00021447353470118656, "loss": 1.2276, "num_input_tokens_seen": 2392064000, "step": 36500, "train_runtime": 17533.3809, "train_tokens_per_second": 136429.136 }, { "epoch": 0.366, "grad_norm": 0.46415793895721436, "learning_rate": 0.00021404342526370326, "loss": 1.2227, "num_input_tokens_seen": 2398617600, "step": 36600, "train_runtime": 17580.8443, "train_tokens_per_second": 136433.584 }, { "epoch": 0.367, "grad_norm": 0.6382859349250793, "learning_rate": 0.00021361267090963846, "loss": 1.2212, "num_input_tokens_seen": 2405171200, "step": 36700, "train_runtime": 17626.7905, "train_tokens_per_second": 136449.753 }, { "epoch": 0.368, "grad_norm": 0.6642177700996399, "learning_rate": 0.0002131812759766839, "loss": 1.2317, "num_input_tokens_seen": 2411724800, "step": 36800, "train_runtime": 17679.381, "train_tokens_per_second": 136414.55 }, { "epoch": 0.369, "grad_norm": 0.4071521461009979, "learning_rate": 0.00021274924480898169, "loss": 1.2262, "num_input_tokens_seen": 2418278400, "step": 36900, "train_runtime": 17726.5473, "train_tokens_per_second": 136421.288 }, { "epoch": 0.37, "grad_norm": 0.5301467776298523, "learning_rate": 0.00021231658175708087, "loss": 1.2192, "num_input_tokens_seen": 2424832000, "step": 37000, "train_runtime": 17772.7667, "train_tokens_per_second": 136435.258 }, { "epoch": 0.371, "grad_norm": 0.5216257572174072, "learning_rate": 0.00021188329117789357, "loss": 1.213, "num_input_tokens_seen": 2431385600, "step": 37100, "train_runtime": 17824.6083, "train_tokens_per_second": 136406.116 }, { "epoch": 0.372, "grad_norm": 0.5098195672035217, "learning_rate": 0.0002114493774346512, "loss": 1.2311, "num_input_tokens_seen": 2437939200, "step": 37200, "train_runtime": 17870.9901, "train_tokens_per_second": 136418.81 }, { "epoch": 0.373, "grad_norm": 0.47295039892196655, "learning_rate": 0.00021101484489686025, "loss": 1.2211, "num_input_tokens_seen": 2444492800, "step": 37300, "train_runtime": 17918.4906, "train_tokens_per_second": 136422.919 }, { "epoch": 0.374, "grad_norm": 0.49752944707870483, "learning_rate": 0.00021057969794025866, "loss": 1.2292, "num_input_tokens_seen": 2451046400, "step": 37400, "train_runtime": 17965.5373, "train_tokens_per_second": 136430.453 }, { "epoch": 0.375, "grad_norm": 0.9500930905342102, "learning_rate": 0.00021014394094677128, "loss": 1.2187, "num_input_tokens_seen": 2457600000, "step": 37500, "train_runtime": 18012.267, "train_tokens_per_second": 136440.349 }, { "epoch": 0.376, "grad_norm": 0.4800110459327698, "learning_rate": 0.00020970757830446633, "loss": 1.2336, "num_input_tokens_seen": 2464153600, "step": 37600, "train_runtime": 18059.6653, "train_tokens_per_second": 136445.143 }, { "epoch": 0.377, "grad_norm": 0.48905813694000244, "learning_rate": 0.00020927061440751072, "loss": 1.2189, "num_input_tokens_seen": 2470707200, "step": 37700, "train_runtime": 18111.7548, "train_tokens_per_second": 136414.567 }, { "epoch": 0.378, "grad_norm": 0.593604564666748, "learning_rate": 0.00020883305365612602, "loss": 1.2178, "num_input_tokens_seen": 2477260800, "step": 37800, "train_runtime": 18157.6424, "train_tokens_per_second": 136430.751 }, { "epoch": 0.379, "grad_norm": 0.46399399638175964, "learning_rate": 0.00020839490045654425, "loss": 1.2141, "num_input_tokens_seen": 2483814400, "step": 37900, "train_runtime": 18204.4326, "train_tokens_per_second": 136440.089 }, { "epoch": 0.38, "grad_norm": 0.5679593086242676, "learning_rate": 0.00020795615922096313, "loss": 1.2332, "num_input_tokens_seen": 2490368000, "step": 38000, "train_runtime": 18252.6627, "train_tokens_per_second": 136438.614 }, { "epoch": 0.381, "grad_norm": 0.48073315620422363, "learning_rate": 0.00020751683436750207, "loss": 1.2369, "num_input_tokens_seen": 2496921600, "step": 38100, "train_runtime": 18300.6025, "train_tokens_per_second": 136439.311 }, { "epoch": 0.382, "grad_norm": 0.4134567677974701, "learning_rate": 0.00020707693032015752, "loss": 1.2168, "num_input_tokens_seen": 2503475200, "step": 38200, "train_runtime": 18351.6848, "train_tokens_per_second": 136416.641 }, { "epoch": 0.383, "grad_norm": 0.4675845503807068, "learning_rate": 0.00020663645150875834, "loss": 1.2272, "num_input_tokens_seen": 2510028800, "step": 38300, "train_runtime": 18398.2852, "train_tokens_per_second": 136427.323 }, { "epoch": 0.384, "grad_norm": 0.4632211923599243, "learning_rate": 0.00020619540236892125, "loss": 1.2444, "num_input_tokens_seen": 2516582400, "step": 38400, "train_runtime": 18445.2271, "train_tokens_per_second": 136435.425 }, { "epoch": 0.385, "grad_norm": 0.5543389916419983, "learning_rate": 0.00020575378734200616, "loss": 1.22, "num_input_tokens_seen": 2523136000, "step": 38500, "train_runtime": 18492.3307, "train_tokens_per_second": 136442.292 }, { "epoch": 0.386, "grad_norm": 0.5775281190872192, "learning_rate": 0.0002053116108750715, "loss": 1.2277, "num_input_tokens_seen": 2529689600, "step": 38600, "train_runtime": 18544.2017, "train_tokens_per_second": 136414.047 }, { "epoch": 0.387, "grad_norm": 0.5202789306640625, "learning_rate": 0.0002048688774208294, "loss": 1.2203, "num_input_tokens_seen": 2536243200, "step": 38700, "train_runtime": 18591.8641, "train_tokens_per_second": 136416.832 }, { "epoch": 0.388, "grad_norm": 0.44833704829216003, "learning_rate": 0.0002044255914376009, "loss": 1.2209, "num_input_tokens_seen": 2542796800, "step": 38800, "train_runtime": 18637.8905, "train_tokens_per_second": 136431.577 }, { "epoch": 0.389, "grad_norm": 0.5180789828300476, "learning_rate": 0.00020398175738927082, "loss": 1.2105, "num_input_tokens_seen": 2549350400, "step": 38900, "train_runtime": 18684.0663, "train_tokens_per_second": 136445.159 }, { "epoch": 0.39, "grad_norm": 0.6083468794822693, "learning_rate": 0.00020353737974524312, "loss": 1.2136, "num_input_tokens_seen": 2555904000, "step": 39000, "train_runtime": 18730.572, "train_tokens_per_second": 136456.27 }, { "epoch": 0.391, "grad_norm": 0.39693883061408997, "learning_rate": 0.00020309246298039584, "loss": 1.2285, "num_input_tokens_seen": 2562457600, "step": 39100, "train_runtime": 18784.1544, "train_tokens_per_second": 136415.914 }, { "epoch": 0.392, "grad_norm": 0.5166248679161072, "learning_rate": 0.0002026470115750357, "loss": 1.223, "num_input_tokens_seen": 2569011200, "step": 39200, "train_runtime": 18830.687, "train_tokens_per_second": 136426.844 }, { "epoch": 0.393, "grad_norm": 0.4967111051082611, "learning_rate": 0.0002022010300148535, "loss": 1.2163, "num_input_tokens_seen": 2575564800, "step": 39300, "train_runtime": 18876.8963, "train_tokens_per_second": 136440.057 }, { "epoch": 0.394, "grad_norm": 0.627816915512085, "learning_rate": 0.0002017545227908786, "loss": 1.2328, "num_input_tokens_seen": 2582118400, "step": 39400, "train_runtime": 18923.6736, "train_tokens_per_second": 136449.109 }, { "epoch": 0.395, "grad_norm": 0.489969938993454, "learning_rate": 0.00020130749439943376, "loss": 1.224, "num_input_tokens_seen": 2588672000, "step": 39500, "train_runtime": 18970.0964, "train_tokens_per_second": 136460.666 }, { "epoch": 0.396, "grad_norm": 0.6713995933532715, "learning_rate": 0.00020085994934208998, "loss": 1.2156, "num_input_tokens_seen": 2595225600, "step": 39600, "train_runtime": 19023.1241, "train_tokens_per_second": 136424.784 }, { "epoch": 0.397, "grad_norm": 0.4549367427825928, "learning_rate": 0.00020041189212562094, "loss": 1.2094, "num_input_tokens_seen": 2601779200, "step": 39700, "train_runtime": 19070.6234, "train_tokens_per_second": 136428.639 }, { "epoch": 0.398, "grad_norm": 0.47548773884773254, "learning_rate": 0.0001999633272619579, "loss": 1.2244, "num_input_tokens_seen": 2608332800, "step": 39800, "train_runtime": 19117.4992, "train_tokens_per_second": 136436.925 }, { "epoch": 0.399, "grad_norm": 0.46569159626960754, "learning_rate": 0.00019951425926814404, "loss": 1.2189, "num_input_tokens_seen": 2614886400, "step": 39900, "train_runtime": 19164.3173, "train_tokens_per_second": 136445.581 }, { "epoch": 0.4, "grad_norm": 0.5518438220024109, "learning_rate": 0.00019906469266628904, "loss": 1.2097, "num_input_tokens_seen": 2621440000, "step": 40000, "train_runtime": 19211.1586, "train_tokens_per_second": 136454.029 }, { "epoch": 0.401, "grad_norm": 0.4615115821361542, "learning_rate": 0.0001986146319835236, "loss": 1.2177, "num_input_tokens_seen": 2627993600, "step": 40100, "train_runtime": 19263.5816, "train_tokens_per_second": 136422.897 }, { "epoch": 0.402, "grad_norm": 0.4154411554336548, "learning_rate": 0.00019816408175195383, "loss": 1.2262, "num_input_tokens_seen": 2634547200, "step": 40200, "train_runtime": 19310.6242, "train_tokens_per_second": 136429.935 }, { "epoch": 0.403, "grad_norm": 0.48504838347435, "learning_rate": 0.0001977130465086155, "loss": 1.2205, "num_input_tokens_seen": 2641100800, "step": 40300, "train_runtime": 19356.9428, "train_tokens_per_second": 136442.042 }, { "epoch": 0.404, "grad_norm": 0.477006196975708, "learning_rate": 0.0001972615307954286, "loss": 1.2099, "num_input_tokens_seen": 2647654400, "step": 40400, "train_runtime": 19403.4467, "train_tokens_per_second": 136452.788 }, { "epoch": 0.405, "grad_norm": 0.46401214599609375, "learning_rate": 0.00019680953915915124, "loss": 1.2142, "num_input_tokens_seen": 2654208000, "step": 40500, "train_runtime": 19456.0604, "train_tokens_per_second": 136420.629 }, { "epoch": 0.406, "grad_norm": 0.4205267131328583, "learning_rate": 0.00019635707615133427, "loss": 1.2233, "num_input_tokens_seen": 2660761600, "step": 40600, "train_runtime": 19503.129, "train_tokens_per_second": 136427.422 }, { "epoch": 0.407, "grad_norm": 0.7298253178596497, "learning_rate": 0.00019590414632827513, "loss": 1.2143, "num_input_tokens_seen": 2667315200, "step": 40700, "train_runtime": 19550.1113, "train_tokens_per_second": 136434.783 }, { "epoch": 0.408, "grad_norm": 0.47734642028808594, "learning_rate": 0.00019545075425097204, "loss": 1.222, "num_input_tokens_seen": 2673868800, "step": 40800, "train_runtime": 19596.9887, "train_tokens_per_second": 136442.84 }, { "epoch": 0.409, "grad_norm": 0.4535351097583771, "learning_rate": 0.00019499690448507827, "loss": 1.2373, "num_input_tokens_seen": 2680422400, "step": 40900, "train_runtime": 19649.1805, "train_tokens_per_second": 136413.954 }, { "epoch": 0.41, "grad_norm": 0.572079062461853, "learning_rate": 0.00019454260160085588, "loss": 1.2125, "num_input_tokens_seen": 2686976000, "step": 41000, "train_runtime": 19697.7854, "train_tokens_per_second": 136410.056 }, { "epoch": 0.411, "grad_norm": 0.4487378001213074, "learning_rate": 0.0001940878501731299, "loss": 1.2124, "num_input_tokens_seen": 2693529600, "step": 41100, "train_runtime": 19744.9135, "train_tokens_per_second": 136416.379 }, { "epoch": 0.412, "grad_norm": 0.47419917583465576, "learning_rate": 0.00019363265478124214, "loss": 1.2037, "num_input_tokens_seen": 2700083200, "step": 41200, "train_runtime": 19791.8314, "train_tokens_per_second": 136424.121 }, { "epoch": 0.413, "grad_norm": 0.6295040845870972, "learning_rate": 0.00019317702000900516, "loss": 1.2246, "num_input_tokens_seen": 2706636800, "step": 41300, "train_runtime": 19838.5236, "train_tokens_per_second": 136433.379 }, { "epoch": 0.414, "grad_norm": 0.53326016664505, "learning_rate": 0.000192720950444656, "loss": 1.2192, "num_input_tokens_seen": 2713190400, "step": 41400, "train_runtime": 19885.4264, "train_tokens_per_second": 136441.147 }, { "epoch": 0.415, "grad_norm": 0.49727046489715576, "learning_rate": 0.00019226445068081018, "loss": 1.2279, "num_input_tokens_seen": 2719744000, "step": 41500, "train_runtime": 19937.4737, "train_tokens_per_second": 136413.672 }, { "epoch": 0.416, "grad_norm": 0.47963398694992065, "learning_rate": 0.00019180752531441523, "loss": 1.2226, "num_input_tokens_seen": 2726297600, "step": 41600, "train_runtime": 19984.6667, "train_tokens_per_second": 136419.468 }, { "epoch": 0.417, "grad_norm": 0.4789304733276367, "learning_rate": 0.00019135017894670456, "loss": 1.2222, "num_input_tokens_seen": 2732851200, "step": 41700, "train_runtime": 20032.7071, "train_tokens_per_second": 136419.465 }, { "epoch": 0.418, "grad_norm": 0.6693325638771057, "learning_rate": 0.0001908924161831509, "loss": 1.2366, "num_input_tokens_seen": 2739404800, "step": 41800, "train_runtime": 20078.7138, "train_tokens_per_second": 136433.281 }, { "epoch": 0.419, "grad_norm": 0.41989439725875854, "learning_rate": 0.0001904342416334203, "loss": 1.2212, "num_input_tokens_seen": 2745958400, "step": 41900, "train_runtime": 20125.0521, "train_tokens_per_second": 136444.785 }, { "epoch": 0.42, "grad_norm": 0.5444014072418213, "learning_rate": 0.00018997565991132532, "loss": 1.2164, "num_input_tokens_seen": 2752512000, "step": 42000, "train_runtime": 20177.4596, "train_tokens_per_second": 136415.191 }, { "epoch": 0.421, "grad_norm": 0.5790873169898987, "learning_rate": 0.0001895166756347789, "loss": 1.215, "num_input_tokens_seen": 2759065600, "step": 42100, "train_runtime": 20224.878, "train_tokens_per_second": 136419.394 }, { "epoch": 0.422, "grad_norm": 0.4666343927383423, "learning_rate": 0.0001890572934257475, "loss": 1.2229, "num_input_tokens_seen": 2765619200, "step": 42200, "train_runtime": 20270.922, "train_tokens_per_second": 136432.827 }, { "epoch": 0.423, "grad_norm": 0.4322357177734375, "learning_rate": 0.00018859751791020497, "loss": 1.2258, "num_input_tokens_seen": 2772172800, "step": 42300, "train_runtime": 20317.4494, "train_tokens_per_second": 136442.954 }, { "epoch": 0.424, "grad_norm": 0.6240208148956299, "learning_rate": 0.0001881373537180856, "loss": 1.221, "num_input_tokens_seen": 2778726400, "step": 42400, "train_runtime": 20364.5753, "train_tokens_per_second": 136449.023 }, { "epoch": 0.425, "grad_norm": 0.5865579843521118, "learning_rate": 0.00018767680548323766, "loss": 1.2244, "num_input_tokens_seen": 2785280000, "step": 42500, "train_runtime": 20417.9029, "train_tokens_per_second": 136413.617 }, { "epoch": 0.426, "grad_norm": 0.5201649069786072, "learning_rate": 0.0001872158778433768, "loss": 1.2076, "num_input_tokens_seen": 2791833600, "step": 42600, "train_runtime": 20464.7135, "train_tokens_per_second": 136421.827 }, { "epoch": 0.427, "grad_norm": 0.5092735290527344, "learning_rate": 0.0001867545754400392, "loss": 1.2057, "num_input_tokens_seen": 2798387200, "step": 42700, "train_runtime": 20511.0273, "train_tokens_per_second": 136433.303 }, { "epoch": 0.428, "grad_norm": 0.4439486265182495, "learning_rate": 0.000186292902918535, "loss": 1.209, "num_input_tokens_seen": 2804940800, "step": 42800, "train_runtime": 20558.3684, "train_tokens_per_second": 136437.909 }, { "epoch": 0.429, "grad_norm": 0.4466177225112915, "learning_rate": 0.00018583086492790136, "loss": 1.218, "num_input_tokens_seen": 2811494400, "step": 42900, "train_runtime": 20605.5543, "train_tokens_per_second": 136443.522 }, { "epoch": 0.43, "grad_norm": 0.5813594460487366, "learning_rate": 0.00018536846612085566, "loss": 1.2161, "num_input_tokens_seen": 2818048000, "step": 43000, "train_runtime": 20658.6134, "train_tokens_per_second": 136410.317 }, { "epoch": 0.431, "grad_norm": 0.49140629172325134, "learning_rate": 0.00018490571115374878, "loss": 1.227, "num_input_tokens_seen": 2824601600, "step": 43100, "train_runtime": 20705.6255, "train_tokens_per_second": 136417.11 }, { "epoch": 0.432, "grad_norm": 0.4938826858997345, "learning_rate": 0.00018444260468651816, "loss": 1.2252, "num_input_tokens_seen": 2831155200, "step": 43200, "train_runtime": 20752.3571, "train_tokens_per_second": 136425.717 }, { "epoch": 0.433, "grad_norm": 0.5228791832923889, "learning_rate": 0.00018397915138264068, "loss": 1.2274, "num_input_tokens_seen": 2837708800, "step": 43300, "train_runtime": 20799.4436, "train_tokens_per_second": 136431.957 }, { "epoch": 0.434, "grad_norm": 0.46896296739578247, "learning_rate": 0.00018351535590908606, "loss": 1.2043, "num_input_tokens_seen": 2844262400, "step": 43400, "train_runtime": 20845.6184, "train_tokens_per_second": 136444.137 }, { "epoch": 0.435, "grad_norm": 0.4269004464149475, "learning_rate": 0.00018305122293626948, "loss": 1.2213, "num_input_tokens_seen": 2850816000, "step": 43500, "train_runtime": 20897.7485, "train_tokens_per_second": 136417.375 }, { "epoch": 0.436, "grad_norm": 0.6213890314102173, "learning_rate": 0.00018258675713800492, "loss": 1.2096, "num_input_tokens_seen": 2857369600, "step": 43600, "train_runtime": 20944.9642, "train_tokens_per_second": 136422.749 }, { "epoch": 0.437, "grad_norm": 0.4281384348869324, "learning_rate": 0.00018212196319145773, "loss": 1.2111, "num_input_tokens_seen": 2863923200, "step": 43700, "train_runtime": 20992.0443, "train_tokens_per_second": 136428.98 }, { "epoch": 0.438, "grad_norm": 1.044310212135315, "learning_rate": 0.00018165684577709778, "loss": 1.2142, "num_input_tokens_seen": 2870476800, "step": 43800, "train_runtime": 21039.718, "train_tokens_per_second": 136431.334 }, { "epoch": 0.439, "grad_norm": 0.445425808429718, "learning_rate": 0.0001811914095786524, "loss": 1.218, "num_input_tokens_seen": 2877030400, "step": 43900, "train_runtime": 21088.215, "train_tokens_per_second": 136428.351 }, { "epoch": 0.44, "grad_norm": 0.43947216868400574, "learning_rate": 0.0001807256592830588, "loss": 1.2124, "num_input_tokens_seen": 2883584000, "step": 44000, "train_runtime": 21136.0286, "train_tokens_per_second": 136429.793 }, { "epoch": 0.441, "grad_norm": 0.5147203803062439, "learning_rate": 0.00018025959958041732, "loss": 1.2227, "num_input_tokens_seen": 2890137600, "step": 44100, "train_runtime": 21182.9913, "train_tokens_per_second": 136436.708 }, { "epoch": 0.442, "grad_norm": 0.473652184009552, "learning_rate": 0.00017979323516394407, "loss": 1.2277, "num_input_tokens_seen": 2896691200, "step": 44200, "train_runtime": 21236.5796, "train_tokens_per_second": 136401.024 }, { "epoch": 0.443, "grad_norm": 0.4356568157672882, "learning_rate": 0.00017932657072992344, "loss": 1.2018, "num_input_tokens_seen": 2903244800, "step": 44300, "train_runtime": 21282.9387, "train_tokens_per_second": 136411.838 }, { "epoch": 0.444, "grad_norm": 0.4458017647266388, "learning_rate": 0.00017885961097766117, "loss": 1.2124, "num_input_tokens_seen": 2909798400, "step": 44400, "train_runtime": 21331.1223, "train_tokens_per_second": 136410.938 }, { "epoch": 0.445, "grad_norm": 0.5065773725509644, "learning_rate": 0.00017839236060943674, "loss": 1.2262, "num_input_tokens_seen": 2916352000, "step": 44500, "train_runtime": 21377.5493, "train_tokens_per_second": 136421.25 }, { "epoch": 0.446, "grad_norm": 0.5424425601959229, "learning_rate": 0.0001779248243304562, "loss": 1.2171, "num_input_tokens_seen": 2922905600, "step": 44600, "train_runtime": 21424.9021, "train_tokens_per_second": 136425.622 }, { "epoch": 0.447, "grad_norm": 0.4595748484134674, "learning_rate": 0.00017745700684880465, "loss": 1.2039, "num_input_tokens_seen": 2929459200, "step": 44700, "train_runtime": 21472.2167, "train_tokens_per_second": 136430.218 }, { "epoch": 0.448, "grad_norm": 0.5353960990905762, "learning_rate": 0.000176988912875399, "loss": 1.2075, "num_input_tokens_seen": 2936012800, "step": 44800, "train_runtime": 21524.5148, "train_tokens_per_second": 136403.205 }, { "epoch": 0.449, "grad_norm": 0.4949302673339844, "learning_rate": 0.00017652054712394028, "loss": 1.2174, "num_input_tokens_seen": 2942566400, "step": 44900, "train_runtime": 21571.6626, "train_tokens_per_second": 136408.883 }, { "epoch": 0.45, "grad_norm": 0.5596060752868652, "learning_rate": 0.0001760519143108665, "loss": 1.2178, "num_input_tokens_seen": 2949120000, "step": 45000, "train_runtime": 21618.3195, "train_tokens_per_second": 136417.634 }, { "epoch": 0.451, "grad_norm": 0.5348083972930908, "learning_rate": 0.00017558301915530483, "loss": 1.215, "num_input_tokens_seen": 2955673600, "step": 45100, "train_runtime": 21666.1069, "train_tokens_per_second": 136419.229 }, { "epoch": 0.452, "grad_norm": 0.46748441457748413, "learning_rate": 0.00017511386637902428, "loss": 1.2104, "num_input_tokens_seen": 2962227200, "step": 45200, "train_runtime": 21713.1957, "train_tokens_per_second": 136425.206 }, { "epoch": 0.453, "grad_norm": 0.47188806533813477, "learning_rate": 0.00017464446070638814, "loss": 1.213, "num_input_tokens_seen": 2968780800, "step": 45300, "train_runtime": 21760.1393, "train_tokens_per_second": 136432.068 }, { "epoch": 0.454, "grad_norm": 0.5225762128829956, "learning_rate": 0.00017417480686430622, "loss": 1.2152, "num_input_tokens_seen": 2975334400, "step": 45400, "train_runtime": 21812.7666, "train_tokens_per_second": 136403.348 }, { "epoch": 0.455, "grad_norm": 0.5889186263084412, "learning_rate": 0.00017370490958218765, "loss": 1.2214, "num_input_tokens_seen": 2981888000, "step": 45500, "train_runtime": 21859.0263, "train_tokens_per_second": 136414.493 }, { "epoch": 0.456, "grad_norm": 0.6613258719444275, "learning_rate": 0.00017323477359189272, "loss": 1.2334, "num_input_tokens_seen": 2988441600, "step": 45600, "train_runtime": 21905.9003, "train_tokens_per_second": 136421.766 }, { "epoch": 0.457, "grad_norm": 0.4657646715641022, "learning_rate": 0.00017276440362768564, "loss": 1.2132, "num_input_tokens_seen": 2994995200, "step": 45700, "train_runtime": 21952.9851, "train_tokens_per_second": 136427.697 }, { "epoch": 0.458, "grad_norm": 0.8410550355911255, "learning_rate": 0.0001722938044261868, "loss": 1.2073, "num_input_tokens_seen": 3001548800, "step": 45800, "train_runtime": 22005.352, "train_tokens_per_second": 136400.854 }, { "epoch": 0.459, "grad_norm": 0.7687750458717346, "learning_rate": 0.0001718229807263249, "loss": 1.2116, "num_input_tokens_seen": 3008102400, "step": 45900, "train_runtime": 22051.2762, "train_tokens_per_second": 136413.982 }, { "epoch": 0.46, "grad_norm": 0.40700653195381165, "learning_rate": 0.0001713519372692894, "loss": 1.2082, "num_input_tokens_seen": 3014656000, "step": 46000, "train_runtime": 22102.8898, "train_tokens_per_second": 136391.939 }, { "epoch": 0.461, "grad_norm": 0.44239944219589233, "learning_rate": 0.0001708806787984826, "loss": 1.2177, "num_input_tokens_seen": 3021209600, "step": 46100, "train_runtime": 22149.1222, "train_tokens_per_second": 136403.13 }, { "epoch": 0.462, "grad_norm": 0.4981868267059326, "learning_rate": 0.00017040921005947212, "loss": 1.2073, "num_input_tokens_seen": 3027763200, "step": 46200, "train_runtime": 22195.5009, "train_tokens_per_second": 136413.376 }, { "epoch": 0.463, "grad_norm": 0.5651112198829651, "learning_rate": 0.0001699375357999429, "loss": 1.2098, "num_input_tokens_seen": 3034316800, "step": 46300, "train_runtime": 22241.367, "train_tokens_per_second": 136426.722 }, { "epoch": 0.464, "grad_norm": 1.1314237117767334, "learning_rate": 0.0001694656607696496, "loss": 1.2335, "num_input_tokens_seen": 3040870400, "step": 46400, "train_runtime": 22294.8896, "train_tokens_per_second": 136393.158 }, { "epoch": 0.465, "grad_norm": 0.568980872631073, "learning_rate": 0.0001689935897203684, "loss": 1.2096, "num_input_tokens_seen": 3047424000, "step": 46500, "train_runtime": 22342.7849, "train_tokens_per_second": 136394.098 }, { "epoch": 0.466, "grad_norm": 0.7110226154327393, "learning_rate": 0.0001685213274058496, "loss": 1.2136, "num_input_tokens_seen": 3053977600, "step": 46600, "train_runtime": 22393.3193, "train_tokens_per_second": 136378.96 }, { "epoch": 0.467, "grad_norm": 0.5052018761634827, "learning_rate": 0.00016804887858176944, "loss": 1.2237, "num_input_tokens_seen": 3060531200, "step": 46700, "train_runtime": 22441.2606, "train_tokens_per_second": 136379.647 }, { "epoch": 0.468, "grad_norm": 0.4663156270980835, "learning_rate": 0.00016757624800568238, "loss": 1.2071, "num_input_tokens_seen": 3067084800, "step": 46800, "train_runtime": 22487.9084, "train_tokens_per_second": 136388.175 }, { "epoch": 0.469, "grad_norm": 0.5441033840179443, "learning_rate": 0.00016710344043697301, "loss": 1.2078, "num_input_tokens_seen": 3073638400, "step": 46900, "train_runtime": 22534.6023, "train_tokens_per_second": 136396.39 }, { "epoch": 0.47, "grad_norm": 0.4578142464160919, "learning_rate": 0.0001666304606368083, "loss": 1.1956, "num_input_tokens_seen": 3080192000, "step": 47000, "train_runtime": 22587.0441, "train_tokens_per_second": 136369.858 }, { "epoch": 0.471, "grad_norm": 0.6252749562263489, "learning_rate": 0.00016615731336808962, "loss": 1.1911, "num_input_tokens_seen": 3086745600, "step": 47100, "train_runtime": 22634.7186, "train_tokens_per_second": 136372.166 }, { "epoch": 0.472, "grad_norm": 0.45418813824653625, "learning_rate": 0.0001656840033954047, "loss": 1.22, "num_input_tokens_seen": 3093299200, "step": 47200, "train_runtime": 22681.221, "train_tokens_per_second": 136381.511 }, { "epoch": 0.473, "grad_norm": 0.55946284532547, "learning_rate": 0.00016521053548497973, "loss": 1.2073, "num_input_tokens_seen": 3099852800, "step": 47300, "train_runtime": 22728.7635, "train_tokens_per_second": 136384.577 }, { "epoch": 0.474, "grad_norm": 0.508859395980835, "learning_rate": 0.0001647369144046313, "loss": 1.1957, "num_input_tokens_seen": 3106406400, "step": 47400, "train_runtime": 22775.8652, "train_tokens_per_second": 136390.27 }, { "epoch": 0.475, "grad_norm": 0.5557622313499451, "learning_rate": 0.00016426314492371842, "loss": 1.1996, "num_input_tokens_seen": 3112960000, "step": 47500, "train_runtime": 22823.5391, "train_tokens_per_second": 136392.519 }, { "epoch": 0.476, "grad_norm": 0.5686858296394348, "learning_rate": 0.0001637892318130945, "loss": 1.201, "num_input_tokens_seen": 3119513600, "step": 47600, "train_runtime": 22875.0526, "train_tokens_per_second": 136371.866 }, { "epoch": 0.477, "grad_norm": 0.47568413615226746, "learning_rate": 0.00016331517984505934, "loss": 1.2132, "num_input_tokens_seen": 3126067200, "step": 47700, "train_runtime": 22923.2754, "train_tokens_per_second": 136370.878 }, { "epoch": 0.478, "grad_norm": 0.40612325072288513, "learning_rate": 0.00016284099379331092, "loss": 1.2085, "num_input_tokens_seen": 3132620800, "step": 47800, "train_runtime": 22970.2831, "train_tokens_per_second": 136377.109 }, { "epoch": 0.479, "grad_norm": 0.491755872964859, "learning_rate": 0.00016236667843289759, "loss": 1.206, "num_input_tokens_seen": 3139174400, "step": 47900, "train_runtime": 23016.8676, "train_tokens_per_second": 136385.821 }, { "epoch": 0.48, "grad_norm": 1.2421867847442627, "learning_rate": 0.00016189223854016973, "loss": 1.1991, "num_input_tokens_seen": 3145728000, "step": 48000, "train_runtime": 23070.1067, "train_tokens_per_second": 136355.156 }, { "epoch": 0.481, "grad_norm": 0.44709935784339905, "learning_rate": 0.00016141767889273182, "loss": 1.1987, "num_input_tokens_seen": 3152281600, "step": 48100, "train_runtime": 23117.6704, "train_tokens_per_second": 136358.1 }, { "epoch": 0.482, "grad_norm": 0.6956078410148621, "learning_rate": 0.00016094300426939417, "loss": 1.206, "num_input_tokens_seen": 3158835200, "step": 48200, "train_runtime": 23164.6084, "train_tokens_per_second": 136364.714 }, { "epoch": 0.483, "grad_norm": 0.4756148159503937, "learning_rate": 0.00016046821945012505, "loss": 1.213, "num_input_tokens_seen": 3165388800, "step": 48300, "train_runtime": 23212.4256, "train_tokens_per_second": 136366.137 }, { "epoch": 0.484, "grad_norm": 0.4668136239051819, "learning_rate": 0.00015999332921600226, "loss": 1.2027, "num_input_tokens_seen": 3171942400, "step": 48400, "train_runtime": 23260.1957, "train_tokens_per_second": 136367.829 }, { "epoch": 0.485, "grad_norm": 0.48166415095329285, "learning_rate": 0.00015951833834916532, "loss": 1.1885, "num_input_tokens_seen": 3178496000, "step": 48500, "train_runtime": 23308.4042, "train_tokens_per_second": 136366.951 }, { "epoch": 0.486, "grad_norm": 1.4835230112075806, "learning_rate": 0.00015904325163276672, "loss": 1.2144, "num_input_tokens_seen": 3185049600, "step": 48600, "train_runtime": 23355.0119, "train_tokens_per_second": 136375.422 }, { "epoch": 0.487, "grad_norm": 0.47993043065071106, "learning_rate": 0.00015856807385092466, "loss": 1.2092, "num_input_tokens_seen": 3191603200, "step": 48700, "train_runtime": 23408.2289, "train_tokens_per_second": 136345.352 }, { "epoch": 0.488, "grad_norm": 0.4617721736431122, "learning_rate": 0.00015809280978867405, "loss": 1.2079, "num_input_tokens_seen": 3198156800, "step": 48800, "train_runtime": 23456.3091, "train_tokens_per_second": 136345.27 }, { "epoch": 0.489, "grad_norm": 0.4698822796344757, "learning_rate": 0.0001576174642319187, "loss": 1.2221, "num_input_tokens_seen": 3204710400, "step": 48900, "train_runtime": 23502.92, "train_tokens_per_second": 136353.713 }, { "epoch": 0.49, "grad_norm": 0.5454009771347046, "learning_rate": 0.0001571420419673831, "loss": 1.201, "num_input_tokens_seen": 3211264000, "step": 49000, "train_runtime": 23550.5868, "train_tokens_per_second": 136356.008 }, { "epoch": 0.491, "grad_norm": 0.9021556973457336, "learning_rate": 0.0001566665477825642, "loss": 1.2047, "num_input_tokens_seen": 3217817600, "step": 49100, "train_runtime": 23597.4655, "train_tokens_per_second": 136362.848 }, { "epoch": 0.492, "grad_norm": 0.4959240257740021, "learning_rate": 0.0001561909864656831, "loss": 1.2042, "num_input_tokens_seen": 3224371200, "step": 49200, "train_runtime": 23650.6048, "train_tokens_per_second": 136333.562 }, { "epoch": 0.493, "grad_norm": 0.554251492023468, "learning_rate": 0.00015571536280563705, "loss": 1.2163, "num_input_tokens_seen": 3230924800, "step": 49300, "train_runtime": 23697.3685, "train_tokens_per_second": 136341.079 }, { "epoch": 0.494, "grad_norm": 0.5000952482223511, "learning_rate": 0.000155239681591951, "loss": 1.2086, "num_input_tokens_seen": 3237478400, "step": 49400, "train_runtime": 23745.4988, "train_tokens_per_second": 136340.72 }, { "epoch": 0.495, "grad_norm": 0.7438832521438599, "learning_rate": 0.00015476394761472953, "loss": 1.1999, "num_input_tokens_seen": 3244032000, "step": 49500, "train_runtime": 23793.3349, "train_tokens_per_second": 136342.048 }, { "epoch": 0.496, "grad_norm": 0.4872761368751526, "learning_rate": 0.00015428816566460843, "loss": 1.194, "num_input_tokens_seen": 3250585600, "step": 49600, "train_runtime": 23839.649, "train_tokens_per_second": 136352.074 }, { "epoch": 0.497, "grad_norm": 0.48635321855545044, "learning_rate": 0.00015381234053270669, "loss": 1.1957, "num_input_tokens_seen": 3257139200, "step": 49700, "train_runtime": 23886.4418, "train_tokens_per_second": 136359.33 }, { "epoch": 0.498, "grad_norm": 0.899361252784729, "learning_rate": 0.0001533364770105781, "loss": 1.201, "num_input_tokens_seen": 3263692800, "step": 49800, "train_runtime": 23933.6337, "train_tokens_per_second": 136364.283 }, { "epoch": 0.499, "grad_norm": 0.5460925698280334, "learning_rate": 0.0001528605798901631, "loss": 1.2086, "num_input_tokens_seen": 3270246400, "step": 49900, "train_runtime": 23985.6033, "train_tokens_per_second": 136342.053 }, { "epoch": 0.5, "grad_norm": 0.4763907194137573, "learning_rate": 0.00015238465396374027, "loss": 1.1987, "num_input_tokens_seen": 3276800000, "step": 50000, "train_runtime": 24033.0829, "train_tokens_per_second": 136345.388 }, { "epoch": 0.501, "grad_norm": 0.4716530442237854, "learning_rate": 0.00015190870402387858, "loss": 1.2083, "num_input_tokens_seen": 3283353600, "step": 50100, "train_runtime": 24080.0017, "train_tokens_per_second": 136351.884 }, { "epoch": 0.502, "grad_norm": 0.65655517578125, "learning_rate": 0.00015143273486338857, "loss": 1.2026, "num_input_tokens_seen": 3289907200, "step": 50200, "train_runtime": 24132.759, "train_tokens_per_second": 136325.366 }, { "epoch": 0.503, "grad_norm": 0.494205117225647, "learning_rate": 0.00015095675127527438, "loss": 1.208, "num_input_tokens_seen": 3296460800, "step": 50300, "train_runtime": 24179.9126, "train_tokens_per_second": 136330.551 }, { "epoch": 0.504, "grad_norm": 0.485307902097702, "learning_rate": 0.00015048075805268547, "loss": 1.1965, "num_input_tokens_seen": 3303014400, "step": 50400, "train_runtime": 24227.2685, "train_tokens_per_second": 136334.577 }, { "epoch": 0.505, "grad_norm": 0.4843132793903351, "learning_rate": 0.00015000475998886825, "loss": 1.2028, "num_input_tokens_seen": 3309568000, "step": 50500, "train_runtime": 24274.7092, "train_tokens_per_second": 136338.111 }, { "epoch": 0.506, "grad_norm": 0.4654887020587921, "learning_rate": 0.00014952876187711804, "loss": 1.2151, "num_input_tokens_seen": 3316121600, "step": 50600, "train_runtime": 24321.273, "train_tokens_per_second": 136346.547 }, { "epoch": 0.507, "grad_norm": 0.4625457525253296, "learning_rate": 0.00014905276851073053, "loss": 1.209, "num_input_tokens_seen": 3322675200, "step": 50700, "train_runtime": 24374.7609, "train_tokens_per_second": 136316.217 }, { "epoch": 0.508, "grad_norm": 0.527594268321991, "learning_rate": 0.00014857678468295352, "loss": 1.2043, "num_input_tokens_seen": 3329228800, "step": 50800, "train_runtime": 24422.59, "train_tokens_per_second": 136317.598 }, { "epoch": 0.509, "grad_norm": 0.4604775011539459, "learning_rate": 0.00014810081518693902, "loss": 1.1895, "num_input_tokens_seen": 3335782400, "step": 50900, "train_runtime": 24468.7673, "train_tokens_per_second": 136328.176 }, { "epoch": 0.51, "grad_norm": 0.4973219335079193, "learning_rate": 0.0001476248648156945, "loss": 1.1977, "num_input_tokens_seen": 3342336000, "step": 51000, "train_runtime": 24516.9703, "train_tokens_per_second": 136327.448 }, { "epoch": 0.511, "grad_norm": 0.42552006244659424, "learning_rate": 0.00014714893836203485, "loss": 1.2109, "num_input_tokens_seen": 3348889600, "step": 51100, "train_runtime": 24564.5614, "train_tokens_per_second": 136330.12 }, { "epoch": 0.512, "grad_norm": 0.5027197003364563, "learning_rate": 0.0001466730406185343, "loss": 1.1949, "num_input_tokens_seen": 3355443200, "step": 51200, "train_runtime": 24611.9784, "train_tokens_per_second": 136333.745 }, { "epoch": 0.513, "grad_norm": 0.6097121238708496, "learning_rate": 0.0001461971763774778, "loss": 1.2, "num_input_tokens_seen": 3361996800, "step": 51300, "train_runtime": 24665.0046, "train_tokens_per_second": 136306.352 }, { "epoch": 0.514, "grad_norm": 0.9953346848487854, "learning_rate": 0.0001457213504308129, "loss": 1.1919, "num_input_tokens_seen": 3368550400, "step": 51400, "train_runtime": 24711.3817, "train_tokens_per_second": 136315.745 }, { "epoch": 0.515, "grad_norm": 0.5582478642463684, "learning_rate": 0.00014524556757010177, "loss": 1.1924, "num_input_tokens_seen": 3375104000, "step": 51500, "train_runtime": 24758.0554, "train_tokens_per_second": 136323.469 }, { "epoch": 0.516, "grad_norm": 0.5084798336029053, "learning_rate": 0.00014476983258647234, "loss": 1.2068, "num_input_tokens_seen": 3381657600, "step": 51600, "train_runtime": 24807.6959, "train_tokens_per_second": 136314.86 }, { "epoch": 0.517, "grad_norm": 0.6907379627227783, "learning_rate": 0.0001442941502705707, "loss": 1.1945, "num_input_tokens_seen": 3388211200, "step": 51700, "train_runtime": 24855.3849, "train_tokens_per_second": 136316.988 }, { "epoch": 0.518, "grad_norm": 0.6037150025367737, "learning_rate": 0.0001438185254125125, "loss": 1.2053, "num_input_tokens_seen": 3394764800, "step": 51800, "train_runtime": 24901.8712, "train_tokens_per_second": 136325.691 }, { "epoch": 0.519, "grad_norm": 0.6816796064376831, "learning_rate": 0.00014334296280183473, "loss": 1.2019, "num_input_tokens_seen": 3401318400, "step": 51900, "train_runtime": 24955.4949, "train_tokens_per_second": 136295.37 }, { "epoch": 0.52, "grad_norm": 0.5201036930084229, "learning_rate": 0.00014286746722744768, "loss": 1.206, "num_input_tokens_seen": 3407872000, "step": 52000, "train_runtime": 25002.9753, "train_tokens_per_second": 136298.659 }, { "epoch": 0.521, "grad_norm": 0.5104642510414124, "learning_rate": 0.00014239204347758647, "loss": 1.2029, "num_input_tokens_seen": 3414425600, "step": 52100, "train_runtime": 25051.9745, "train_tokens_per_second": 136293.672 }, { "epoch": 0.522, "grad_norm": 0.4965505003929138, "learning_rate": 0.00014191669633976294, "loss": 1.1961, "num_input_tokens_seen": 3420979200, "step": 52200, "train_runtime": 25099.3949, "train_tokens_per_second": 136297.278 }, { "epoch": 0.523, "grad_norm": 0.5390327572822571, "learning_rate": 0.00014144143060071756, "loss": 1.194, "num_input_tokens_seen": 3427532800, "step": 52300, "train_runtime": 25146.6291, "train_tokens_per_second": 136301.879 }, { "epoch": 0.524, "grad_norm": 2.647089719772339, "learning_rate": 0.000140966251046371, "loss": 1.2006, "num_input_tokens_seen": 3434086400, "step": 52400, "train_runtime": 25194.2742, "train_tokens_per_second": 136304.24 }, { "epoch": 0.525, "grad_norm": 0.46030643582344055, "learning_rate": 0.0001404911624617761, "loss": 1.2071, "num_input_tokens_seen": 3440640000, "step": 52500, "train_runtime": 25247.7567, "train_tokens_per_second": 136275.077 }, { "epoch": 0.526, "grad_norm": 0.487699031829834, "learning_rate": 0.00014001616963106966, "loss": 1.2046, "num_input_tokens_seen": 3447193600, "step": 52600, "train_runtime": 25295.5054, "train_tokens_per_second": 136276.921 }, { "epoch": 0.527, "grad_norm": 0.4782906472682953, "learning_rate": 0.00013954127733742416, "loss": 1.1891, "num_input_tokens_seen": 3453747200, "step": 52700, "train_runtime": 25344.1317, "train_tokens_per_second": 136274.039 }, { "epoch": 0.528, "grad_norm": 0.595632016658783, "learning_rate": 0.0001390664903629998, "loss": 1.1867, "num_input_tokens_seen": 3460300800, "step": 52800, "train_runtime": 25391.6777, "train_tokens_per_second": 136276.966 }, { "epoch": 0.529, "grad_norm": 0.5201537609100342, "learning_rate": 0.0001385918134888961, "loss": 1.1955, "num_input_tokens_seen": 3466854400, "step": 52900, "train_runtime": 25439.3874, "train_tokens_per_second": 136279.005 }, { "epoch": 0.53, "grad_norm": 0.4726644456386566, "learning_rate": 0.00013811725149510387, "loss": 1.206, "num_input_tokens_seen": 3473408000, "step": 53000, "train_runtime": 25492.0415, "train_tokens_per_second": 136254.603 }, { "epoch": 0.531, "grad_norm": 0.5846008062362671, "learning_rate": 0.0001376428091604572, "loss": 1.2117, "num_input_tokens_seen": 3479961600, "step": 53100, "train_runtime": 25540.3083, "train_tokens_per_second": 136253.704 }, { "epoch": 0.532, "grad_norm": 0.4758647382259369, "learning_rate": 0.00013716849126258512, "loss": 1.2042, "num_input_tokens_seen": 3486515200, "step": 53200, "train_runtime": 25589.0853, "train_tokens_per_second": 136250.091 }, { "epoch": 0.533, "grad_norm": 0.4607105255126953, "learning_rate": 0.00013669430257786354, "loss": 1.1992, "num_input_tokens_seen": 3493068800, "step": 53300, "train_runtime": 25636.4376, "train_tokens_per_second": 136254.063 }, { "epoch": 0.534, "grad_norm": 0.6885077357292175, "learning_rate": 0.00013622024788136728, "loss": 1.2006, "num_input_tokens_seen": 3499622400, "step": 53400, "train_runtime": 25684.4816, "train_tokens_per_second": 136254.352 }, { "epoch": 0.535, "grad_norm": 0.6578366160392761, "learning_rate": 0.00013574633194682185, "loss": 1.1948, "num_input_tokens_seen": 3506176000, "step": 53500, "train_runtime": 25730.7322, "train_tokens_per_second": 136264.136 }, { "epoch": 0.536, "grad_norm": 0.4718693196773529, "learning_rate": 0.0001352725595465555, "loss": 1.2, "num_input_tokens_seen": 3512729600, "step": 53600, "train_runtime": 25783.9922, "train_tokens_per_second": 136236.839 }, { "epoch": 0.537, "grad_norm": 0.5561531186103821, "learning_rate": 0.000134798935451451, "loss": 1.2052, "num_input_tokens_seen": 3519283200, "step": 53700, "train_runtime": 25832.8858, "train_tokens_per_second": 136232.677 }, { "epoch": 0.538, "grad_norm": 0.5250628590583801, "learning_rate": 0.00013432546443089768, "loss": 1.2, "num_input_tokens_seen": 3525836800, "step": 53800, "train_runtime": 25880.0084, "train_tokens_per_second": 136237.854 }, { "epoch": 0.539, "grad_norm": 0.5457636117935181, "learning_rate": 0.0001338521512527436, "loss": 1.1944, "num_input_tokens_seen": 3532390400, "step": 53900, "train_runtime": 25927.8228, "train_tokens_per_second": 136239.376 }, { "epoch": 0.54, "grad_norm": 0.4437522292137146, "learning_rate": 0.00013337900068324712, "loss": 1.1912, "num_input_tokens_seen": 3538944000, "step": 54000, "train_runtime": 25975.6777, "train_tokens_per_second": 136240.68 }, { "epoch": 0.541, "grad_norm": 0.5343025326728821, "learning_rate": 0.00013290601748702918, "loss": 1.188, "num_input_tokens_seen": 3545497600, "step": 54100, "train_runtime": 26027.6243, "train_tokens_per_second": 136220.562 }, { "epoch": 0.542, "grad_norm": 0.4907335042953491, "learning_rate": 0.00013243320642702543, "loss": 1.1909, "num_input_tokens_seen": 3552051200, "step": 54200, "train_runtime": 26075.5648, "train_tokens_per_second": 136221.448 }, { "epoch": 0.543, "grad_norm": 0.7268043160438538, "learning_rate": 0.0001319605722644379, "loss": 1.1911, "num_input_tokens_seen": 3558604800, "step": 54300, "train_runtime": 26122.2114, "train_tokens_per_second": 136229.079 }, { "epoch": 0.544, "grad_norm": 1.3769776821136475, "learning_rate": 0.0001314881197586874, "loss": 1.224, "num_input_tokens_seen": 3565158400, "step": 54400, "train_runtime": 26170.2324, "train_tokens_per_second": 136229.528 }, { "epoch": 0.545, "grad_norm": 0.7141419649124146, "learning_rate": 0.0001310158536673654, "loss": 1.2025, "num_input_tokens_seen": 3571712000, "step": 54500, "train_runtime": 26217.6992, "train_tokens_per_second": 136232.854 }, { "epoch": 0.546, "grad_norm": 0.5124280452728271, "learning_rate": 0.0001305437787461862, "loss": 1.1972, "num_input_tokens_seen": 3578265600, "step": 54600, "train_runtime": 26264.9719, "train_tokens_per_second": 136237.176 }, { "epoch": 0.547, "grad_norm": 0.5609524250030518, "learning_rate": 0.00013007189974893903, "loss": 1.1924, "num_input_tokens_seen": 3584819200, "step": 54700, "train_runtime": 26319.2824, "train_tokens_per_second": 136205.051 }, { "epoch": 0.548, "grad_norm": 0.5220986604690552, "learning_rate": 0.00012960022142744016, "loss": 1.188, "num_input_tokens_seen": 3591372800, "step": 54800, "train_runtime": 26367.119, "train_tokens_per_second": 136206.493 }, { "epoch": 0.549, "grad_norm": 0.5159165263175964, "learning_rate": 0.00012912874853148506, "loss": 1.1891, "num_input_tokens_seen": 3597926400, "step": 54900, "train_runtime": 26415.2651, "train_tokens_per_second": 136206.333 }, { "epoch": 0.55, "grad_norm": 0.5019519925117493, "learning_rate": 0.00012865748580880053, "loss": 1.1827, "num_input_tokens_seen": 3604480000, "step": 55000, "train_runtime": 26462.5595, "train_tokens_per_second": 136210.558 }, { "epoch": 0.551, "grad_norm": 0.5309172868728638, "learning_rate": 0.0001281864380049969, "loss": 1.1876, "num_input_tokens_seen": 3611033600, "step": 55100, "train_runtime": 26514.9513, "train_tokens_per_second": 136188.581 }, { "epoch": 0.552, "grad_norm": 0.5431755781173706, "learning_rate": 0.00012771560986352042, "loss": 1.2038, "num_input_tokens_seen": 3617587200, "step": 55200, "train_runtime": 26562.7975, "train_tokens_per_second": 136189.993 }, { "epoch": 0.553, "grad_norm": 0.5063371658325195, "learning_rate": 0.0001272450061256052, "loss": 1.1837, "num_input_tokens_seen": 3624140800, "step": 55300, "train_runtime": 26609.2594, "train_tokens_per_second": 136198.484 }, { "epoch": 0.554, "grad_norm": 0.502314567565918, "learning_rate": 0.00012677463153022565, "loss": 1.1988, "num_input_tokens_seen": 3630694400, "step": 55400, "train_runtime": 26655.8656, "train_tokens_per_second": 136206.209 }, { "epoch": 0.555, "grad_norm": 0.5824739336967468, "learning_rate": 0.0001263044908140488, "loss": 1.1917, "num_input_tokens_seen": 3637248000, "step": 55500, "train_runtime": 26707.6694, "train_tokens_per_second": 136187.398 }, { "epoch": 0.556, "grad_norm": 0.5498598217964172, "learning_rate": 0.00012583458871138632, "loss": 1.1908, "num_input_tokens_seen": 3643801600, "step": 55600, "train_runtime": 26755.8413, "train_tokens_per_second": 136187.144 }, { "epoch": 0.557, "grad_norm": 0.5867239832878113, "learning_rate": 0.00012536492995414723, "loss": 1.193, "num_input_tokens_seen": 3650355200, "step": 55700, "train_runtime": 26804.5182, "train_tokens_per_second": 136184.324 }, { "epoch": 0.558, "grad_norm": 0.5584626197814941, "learning_rate": 0.00012489551927179007, "loss": 1.1833, "num_input_tokens_seen": 3656908800, "step": 55800, "train_runtime": 26850.8981, "train_tokens_per_second": 136193.165 }, { "epoch": 0.559, "grad_norm": 0.48578086495399475, "learning_rate": 0.00012442636139127508, "loss": 1.1919, "num_input_tokens_seen": 3663462400, "step": 55900, "train_runtime": 26898.2376, "train_tokens_per_second": 136197.116 }, { "epoch": 0.56, "grad_norm": 0.5344805121421814, "learning_rate": 0.00012395746103701695, "loss": 1.1978, "num_input_tokens_seen": 3670016000, "step": 56000, "train_runtime": 26951.1383, "train_tokens_per_second": 136172.95 }, { "epoch": 0.561, "grad_norm": 0.5378079414367676, "learning_rate": 0.00012348882293083708, "loss": 1.192, "num_input_tokens_seen": 3676569600, "step": 56100, "train_runtime": 26999.7429, "train_tokens_per_second": 136170.541 }, { "epoch": 0.562, "grad_norm": 0.6195780038833618, "learning_rate": 0.00012302045179191594, "loss": 1.1919, "num_input_tokens_seen": 3683123200, "step": 56200, "train_runtime": 27047.827, "train_tokens_per_second": 136170.761 }, { "epoch": 0.563, "grad_norm": 0.5348559617996216, "learning_rate": 0.00012255235233674572, "loss": 1.1875, "num_input_tokens_seen": 3689676800, "step": 56300, "train_runtime": 27094.1422, "train_tokens_per_second": 136179.871 }, { "epoch": 0.564, "grad_norm": 0.48098888993263245, "learning_rate": 0.00012208452927908278, "loss": 1.1818, "num_input_tokens_seen": 3696230400, "step": 56400, "train_runtime": 27141.6856, "train_tokens_per_second": 136182.787 }, { "epoch": 0.565, "grad_norm": 0.585021436214447, "learning_rate": 0.00012161698732990003, "loss": 1.1887, "num_input_tokens_seen": 3702784000, "step": 56500, "train_runtime": 27194.4825, "train_tokens_per_second": 136159.385 }, { "epoch": 0.566, "grad_norm": 0.5269266963005066, "learning_rate": 0.00012114973119733987, "loss": 1.187, "num_input_tokens_seen": 3709337600, "step": 56600, "train_runtime": 27242.6521, "train_tokens_per_second": 136159.196 }, { "epoch": 0.567, "grad_norm": 0.5563040971755981, "learning_rate": 0.00012068276558666616, "loss": 1.1996, "num_input_tokens_seen": 3715891200, "step": 56700, "train_runtime": 27290.3101, "train_tokens_per_second": 136161.56 }, { "epoch": 0.568, "grad_norm": 0.6131460666656494, "learning_rate": 0.00012021609520021752, "loss": 1.195, "num_input_tokens_seen": 3722444800, "step": 56800, "train_runtime": 27337.7804, "train_tokens_per_second": 136164.851 }, { "epoch": 0.569, "grad_norm": 0.5921023488044739, "learning_rate": 0.00011974972473735957, "loss": 1.2018, "num_input_tokens_seen": 3728998400, "step": 56900, "train_runtime": 27384.9126, "train_tokens_per_second": 136169.812 }, { "epoch": 0.57, "grad_norm": 0.4582422375679016, "learning_rate": 0.00011928365889443764, "loss": 1.1914, "num_input_tokens_seen": 3735552000, "step": 57000, "train_runtime": 27436.2125, "train_tokens_per_second": 136154.07 }, { "epoch": 0.571, "grad_norm": 0.6521887183189392, "learning_rate": 0.00011881790236472966, "loss": 1.2041, "num_input_tokens_seen": 3742105600, "step": 57100, "train_runtime": 27484.9505, "train_tokens_per_second": 136151.076 }, { "epoch": 0.572, "grad_norm": 0.5971055030822754, "learning_rate": 0.00011835245983839869, "loss": 1.1992, "num_input_tokens_seen": 3748659200, "step": 57200, "train_runtime": 27531.7756, "train_tokens_per_second": 136157.553 }, { "epoch": 0.573, "grad_norm": 0.5187013745307922, "learning_rate": 0.00011788733600244575, "loss": 1.193, "num_input_tokens_seen": 3755212800, "step": 57300, "train_runtime": 27579.3239, "train_tokens_per_second": 136160.437 }, { "epoch": 0.574, "grad_norm": 0.5805628299713135, "learning_rate": 0.00011742253554066278, "loss": 1.1925, "num_input_tokens_seen": 3761766400, "step": 57400, "train_runtime": 27633.4529, "train_tokens_per_second": 136130.885 }, { "epoch": 0.575, "grad_norm": 0.5242844223976135, "learning_rate": 0.00011695806313358523, "loss": 1.1991, "num_input_tokens_seen": 3768320000, "step": 57500, "train_runtime": 27681.3237, "train_tokens_per_second": 136132.218 }, { "epoch": 0.576, "grad_norm": 0.7652018666267395, "learning_rate": 0.00011649392345844506, "loss": 1.192, "num_input_tokens_seen": 3774873600, "step": 57600, "train_runtime": 27728.8266, "train_tokens_per_second": 136135.353 }, { "epoch": 0.577, "grad_norm": 0.5232011675834656, "learning_rate": 0.00011603012118912372, "loss": 1.2019, "num_input_tokens_seen": 3781427200, "step": 57700, "train_runtime": 27778.1555, "train_tokens_per_second": 136129.528 }, { "epoch": 0.578, "grad_norm": 0.5537053942680359, "learning_rate": 0.00011556666099610485, "loss": 1.1948, "num_input_tokens_seen": 3787980800, "step": 57800, "train_runtime": 27824.9287, "train_tokens_per_second": 136136.227 }, { "epoch": 0.579, "grad_norm": 0.6031852960586548, "learning_rate": 0.00011510354754642745, "loss": 1.1888, "num_input_tokens_seen": 3794534400, "step": 57900, "train_runtime": 27872.2044, "train_tokens_per_second": 136140.448 }, { "epoch": 0.58, "grad_norm": 0.5748854875564575, "learning_rate": 0.00011464078550363887, "loss": 1.1921, "num_input_tokens_seen": 3801088000, "step": 58000, "train_runtime": 27925.2055, "train_tokens_per_second": 136116.742 }, { "epoch": 0.581, "grad_norm": 0.5586141347885132, "learning_rate": 0.0001141783795277477, "loss": 1.2024, "num_input_tokens_seen": 3807641600, "step": 58100, "train_runtime": 27972.7534, "train_tokens_per_second": 136119.657 }, { "epoch": 0.582, "grad_norm": 0.4893476366996765, "learning_rate": 0.00011371633427517696, "loss": 1.2034, "num_input_tokens_seen": 3814195200, "step": 58200, "train_runtime": 28020.2529, "train_tokens_per_second": 136122.797 }, { "epoch": 0.583, "grad_norm": 0.5007518529891968, "learning_rate": 0.00011325465439871731, "loss": 1.1885, "num_input_tokens_seen": 3820748800, "step": 58300, "train_runtime": 28067.154, "train_tokens_per_second": 136128.829 }, { "epoch": 0.584, "grad_norm": 0.5260310769081116, "learning_rate": 0.00011279334454747989, "loss": 1.1931, "num_input_tokens_seen": 3827302400, "step": 58400, "train_runtime": 28120.6157, "train_tokens_per_second": 136103.08 }, { "epoch": 0.585, "grad_norm": 0.5364392399787903, "learning_rate": 0.00011233240936684981, "loss": 1.1928, "num_input_tokens_seen": 3833856000, "step": 58500, "train_runtime": 28168.5149, "train_tokens_per_second": 136104.3 }, { "epoch": 0.586, "grad_norm": 0.49333399534225464, "learning_rate": 0.00011187185349843916, "loss": 1.1935, "num_input_tokens_seen": 3840409600, "step": 58600, "train_runtime": 28215.0596, "train_tokens_per_second": 136112.05 }, { "epoch": 0.587, "grad_norm": 0.5711957216262817, "learning_rate": 0.00011141168158004053, "loss": 1.1812, "num_input_tokens_seen": 3846963200, "step": 58700, "train_runtime": 28264.2863, "train_tokens_per_second": 136106.858 }, { "epoch": 0.588, "grad_norm": 1.0157184600830078, "learning_rate": 0.00011095189824557998, "loss": 1.1929, "num_input_tokens_seen": 3853516800, "step": 58800, "train_runtime": 28311.6057, "train_tokens_per_second": 136110.853 }, { "epoch": 0.589, "grad_norm": 0.552700936794281, "learning_rate": 0.00011049250812507054, "loss": 1.1909, "num_input_tokens_seen": 3860070400, "step": 58900, "train_runtime": 28359.0956, "train_tokens_per_second": 136114.016 }, { "epoch": 0.59, "grad_norm": 0.46860748529434204, "learning_rate": 0.00011003351584456571, "loss": 1.1972, "num_input_tokens_seen": 3866624000, "step": 59000, "train_runtime": 28412.8978, "train_tokens_per_second": 136086.929 }, { "epoch": 0.591, "grad_norm": 0.5399055480957031, "learning_rate": 0.0001095749260261126, "loss": 1.1895, "num_input_tokens_seen": 3873177600, "step": 59100, "train_runtime": 28462.0603, "train_tokens_per_second": 136082.123 }, { "epoch": 0.592, "grad_norm": 0.49921005964279175, "learning_rate": 0.00010911674328770559, "loss": 1.1968, "num_input_tokens_seen": 3879731200, "step": 59200, "train_runtime": 28510.9551, "train_tokens_per_second": 136078.612 }, { "epoch": 0.593, "grad_norm": 0.5357686877250671, "learning_rate": 0.00010865897224323979, "loss": 1.1889, "num_input_tokens_seen": 3886284800, "step": 59300, "train_runtime": 28558.3344, "train_tokens_per_second": 136082.334 }, { "epoch": 0.594, "grad_norm": 0.5710283517837524, "learning_rate": 0.00010820161750246453, "loss": 1.1864, "num_input_tokens_seen": 3892838400, "step": 59400, "train_runtime": 28606.454, "train_tokens_per_second": 136082.522 }, { "epoch": 0.595, "grad_norm": 0.6333475112915039, "learning_rate": 0.00010774468367093696, "loss": 1.2009, "num_input_tokens_seen": 3899392000, "step": 59500, "train_runtime": 28653.986, "train_tokens_per_second": 136085.5 }, { "epoch": 0.596, "grad_norm": 0.5585243701934814, "learning_rate": 0.00010728817534997573, "loss": 1.1877, "num_input_tokens_seen": 3905945600, "step": 59600, "train_runtime": 28701.832, "train_tokens_per_second": 136086.979 }, { "epoch": 0.597, "grad_norm": 0.5805736184120178, "learning_rate": 0.00010683209713661453, "loss": 1.211, "num_input_tokens_seen": 3912499200, "step": 59700, "train_runtime": 28751.7229, "train_tokens_per_second": 136078.774 }, { "epoch": 0.598, "grad_norm": 0.5607670545578003, "learning_rate": 0.00010637645362355589, "loss": 1.196, "num_input_tokens_seen": 3919052800, "step": 59800, "train_runtime": 28798.1873, "train_tokens_per_second": 136086.788 }, { "epoch": 0.599, "grad_norm": 0.4962175488471985, "learning_rate": 0.00010592124939912497, "loss": 1.1889, "num_input_tokens_seen": 3925606400, "step": 59900, "train_runtime": 28852.3337, "train_tokens_per_second": 136058.54 }, { "epoch": 0.6, "grad_norm": 0.6488810777664185, "learning_rate": 0.00010546648904722326, "loss": 1.1968, "num_input_tokens_seen": 3932160000, "step": 60000, "train_runtime": 28898.713, "train_tokens_per_second": 136066.959 }, { "epoch": 0.601, "grad_norm": 0.9370976686477661, "learning_rate": 0.0001050121771472824, "loss": 1.183, "num_input_tokens_seen": 3938713600, "step": 60100, "train_runtime": 28946.5523, "train_tokens_per_second": 136068.488 }, { "epoch": 0.602, "grad_norm": 0.5040610432624817, "learning_rate": 0.0001045583182742182, "loss": 1.2023, "num_input_tokens_seen": 3945267200, "step": 60200, "train_runtime": 28994.2594, "train_tokens_per_second": 136070.632 }, { "epoch": 0.603, "grad_norm": 0.5120612382888794, "learning_rate": 0.00010410491699838448, "loss": 1.1865, "num_input_tokens_seen": 3951820800, "step": 60300, "train_runtime": 29042.095, "train_tokens_per_second": 136072.167 }, { "epoch": 0.604, "grad_norm": 0.8983064889907837, "learning_rate": 0.00010365197788552707, "loss": 1.1734, "num_input_tokens_seen": 3958374400, "step": 60400, "train_runtime": 29090.1772, "train_tokens_per_second": 136072.543 }, { "epoch": 0.605, "grad_norm": 0.5155735015869141, "learning_rate": 0.00010319950549673778, "loss": 1.1923, "num_input_tokens_seen": 3964928000, "step": 60500, "train_runtime": 29143.642, "train_tokens_per_second": 136047.787 }, { "epoch": 0.606, "grad_norm": 1.5562913417816162, "learning_rate": 0.00010274750438840855, "loss": 1.1877, "num_input_tokens_seen": 3971481600, "step": 60600, "train_runtime": 29191.8256, "train_tokens_per_second": 136047.73 }, { "epoch": 0.607, "grad_norm": 0.5603190064430237, "learning_rate": 0.00010229597911218554, "loss": 1.1862, "num_input_tokens_seen": 3978035200, "step": 60700, "train_runtime": 29240.4534, "train_tokens_per_second": 136045.606 }, { "epoch": 0.608, "grad_norm": 0.550956130027771, "learning_rate": 0.00010184493421492324, "loss": 1.1869, "num_input_tokens_seen": 3984588800, "step": 60800, "train_runtime": 29287.1822, "train_tokens_per_second": 136052.31 }, { "epoch": 0.609, "grad_norm": 0.5152813196182251, "learning_rate": 0.0001013943742386388, "loss": 1.1902, "num_input_tokens_seen": 3991142400, "step": 60900, "train_runtime": 29335.0152, "train_tokens_per_second": 136053.872 }, { "epoch": 0.61, "grad_norm": 0.5258508324623108, "learning_rate": 0.00010094430372046616, "loss": 1.1843, "num_input_tokens_seen": 3997696000, "step": 61000, "train_runtime": 29387.778, "train_tokens_per_second": 136032.605 }, { "epoch": 0.611, "grad_norm": 0.5804030895233154, "learning_rate": 0.0001004947271926104, "loss": 1.1872, "num_input_tokens_seen": 4004249600, "step": 61100, "train_runtime": 29435.5024, "train_tokens_per_second": 136034.695 }, { "epoch": 0.612, "grad_norm": 0.5679774284362793, "learning_rate": 0.00010004564918230222, "loss": 1.1933, "num_input_tokens_seen": 4010803200, "step": 61200, "train_runtime": 29483.504, "train_tokens_per_second": 136035.5 }, { "epoch": 0.613, "grad_norm": 0.611191987991333, "learning_rate": 9.959707421175217e-05, "loss": 1.1926, "num_input_tokens_seen": 4017356800, "step": 61300, "train_runtime": 29529.7223, "train_tokens_per_second": 136044.517 }, { "epoch": 0.614, "grad_norm": 0.5725626945495605, "learning_rate": 9.914900679810522e-05, "loss": 1.1812, "num_input_tokens_seen": 4023910400, "step": 61400, "train_runtime": 29577.4052, "train_tokens_per_second": 136046.769 }, { "epoch": 0.615, "grad_norm": 0.6058773398399353, "learning_rate": 9.870145145339529e-05, "loss": 1.1904, "num_input_tokens_seen": 4030464000, "step": 61500, "train_runtime": 29630.1636, "train_tokens_per_second": 136025.708 }, { "epoch": 0.616, "grad_norm": 0.5151665806770325, "learning_rate": 9.825441268449969e-05, "loss": 1.1783, "num_input_tokens_seen": 4037017600, "step": 61600, "train_runtime": 29677.4813, "train_tokens_per_second": 136029.657 }, { "epoch": 0.617, "grad_norm": 0.5461622476577759, "learning_rate": 9.780789499309391e-05, "loss": 1.1825, "num_input_tokens_seen": 4043571200, "step": 61700, "train_runtime": 29725.432, "train_tokens_per_second": 136030.696 }, { "epoch": 0.618, "grad_norm": 0.8243169784545898, "learning_rate": 9.736190287560608e-05, "loss": 1.1933, "num_input_tokens_seen": 4050124800, "step": 61800, "train_runtime": 29772.1739, "train_tokens_per_second": 136037.255 }, { "epoch": 0.619, "grad_norm": 0.4877258539199829, "learning_rate": 9.691644082317186e-05, "loss": 1.1881, "num_input_tokens_seen": 4056678400, "step": 61900, "train_runtime": 29825.721, "train_tokens_per_second": 136012.752 }, { "epoch": 0.62, "grad_norm": 0.5376379489898682, "learning_rate": 9.647151332158926e-05, "loss": 1.1812, "num_input_tokens_seen": 4063232000, "step": 62000, "train_runtime": 29872.1612, "train_tokens_per_second": 136020.691 }, { "epoch": 0.621, "grad_norm": 0.5128985643386841, "learning_rate": 9.60271248512732e-05, "loss": 1.1719, "num_input_tokens_seen": 4069785600, "step": 62100, "train_runtime": 29919.8698, "train_tokens_per_second": 136022.838 }, { "epoch": 0.622, "grad_norm": 0.6911051273345947, "learning_rate": 9.558327988721068e-05, "loss": 1.199, "num_input_tokens_seen": 4076339200, "step": 62200, "train_runtime": 29967.7263, "train_tokens_per_second": 136024.307 }, { "epoch": 0.623, "grad_norm": 0.5334423184394836, "learning_rate": 9.513998289891559e-05, "loss": 1.1922, "num_input_tokens_seen": 4082892800, "step": 62300, "train_runtime": 30014.7483, "train_tokens_per_second": 136029.553 }, { "epoch": 0.624, "grad_norm": 0.47934290766716003, "learning_rate": 9.469723835038361e-05, "loss": 1.1864, "num_input_tokens_seen": 4089446400, "step": 62400, "train_runtime": 30062.3944, "train_tokens_per_second": 136031.959 }, { "epoch": 0.625, "grad_norm": 0.6690011620521545, "learning_rate": 9.42550507000475e-05, "loss": 1.1887, "num_input_tokens_seen": 4096000000, "step": 62500, "train_runtime": 30115.1503, "train_tokens_per_second": 136011.275 }, { "epoch": 0.626, "grad_norm": 0.5379562973976135, "learning_rate": 9.381342440073194e-05, "loss": 1.1873, "num_input_tokens_seen": 4102553600, "step": 62600, "train_runtime": 30162.8214, "train_tokens_per_second": 136013.589 }, { "epoch": 0.627, "grad_norm": 0.5619449615478516, "learning_rate": 9.337236389960886e-05, "loss": 1.184, "num_input_tokens_seen": 4109107200, "step": 62700, "train_runtime": 30211.3171, "train_tokens_per_second": 136012.183 }, { "epoch": 0.628, "grad_norm": 0.9017994999885559, "learning_rate": 9.293187363815265e-05, "loss": 1.1869, "num_input_tokens_seen": 4115660800, "step": 62800, "train_runtime": 30263.5761, "train_tokens_per_second": 135993.869 }, { "epoch": 0.629, "grad_norm": 0.6502019762992859, "learning_rate": 9.249195805209533e-05, "loss": 1.1944, "num_input_tokens_seen": 4122214400, "step": 62900, "train_runtime": 30310.6247, "train_tokens_per_second": 135998.992 }, { "epoch": 0.63, "grad_norm": 0.5749123096466064, "learning_rate": 9.205262157138192e-05, "loss": 1.1896, "num_input_tokens_seen": 4128768000, "step": 63000, "train_runtime": 30359.0787, "train_tokens_per_second": 135997.803 }, { "epoch": 0.631, "grad_norm": 0.4843611419200897, "learning_rate": 9.161386862012601e-05, "loss": 1.1932, "num_input_tokens_seen": 4135321600, "step": 63100, "train_runtime": 30406.8492, "train_tokens_per_second": 135999.675 }, { "epoch": 0.632, "grad_norm": 0.634504497051239, "learning_rate": 9.11757036165649e-05, "loss": 1.181, "num_input_tokens_seen": 4141875200, "step": 63200, "train_runtime": 30453.794, "train_tokens_per_second": 136005.228 }, { "epoch": 0.633, "grad_norm": 0.605948269367218, "learning_rate": 9.073813097301521e-05, "loss": 1.1742, "num_input_tokens_seen": 4148428800, "step": 63300, "train_runtime": 30506.719, "train_tokens_per_second": 135984.102 }, { "epoch": 0.634, "grad_norm": 0.5731847882270813, "learning_rate": 9.030115509582883e-05, "loss": 1.1809, "num_input_tokens_seen": 4154982400, "step": 63400, "train_runtime": 30554.7018, "train_tokens_per_second": 135985.042 }, { "epoch": 0.635, "grad_norm": 0.9707246422767639, "learning_rate": 8.986478038534775e-05, "loss": 1.1981, "num_input_tokens_seen": 4161536000, "step": 63500, "train_runtime": 30602.1945, "train_tokens_per_second": 135988.156 }, { "epoch": 0.636, "grad_norm": 0.7120965719223022, "learning_rate": 8.942901123586059e-05, "loss": 1.1816, "num_input_tokens_seen": 4168089600, "step": 63600, "train_runtime": 30649.6499, "train_tokens_per_second": 135991.426 }, { "epoch": 0.637, "grad_norm": 0.5136720538139343, "learning_rate": 8.899385203555781e-05, "loss": 1.177, "num_input_tokens_seen": 4174643200, "step": 63700, "train_runtime": 30696.6221, "train_tokens_per_second": 135996.827 }, { "epoch": 0.638, "grad_norm": 0.5284336805343628, "learning_rate": 8.855930716648774e-05, "loss": 1.184, "num_input_tokens_seen": 4181196800, "step": 63800, "train_runtime": 30745.5123, "train_tokens_per_second": 135993.727 }, { "epoch": 0.639, "grad_norm": 0.5269259810447693, "learning_rate": 8.812538100451239e-05, "loss": 1.2174, "num_input_tokens_seen": 4187750400, "step": 63900, "train_runtime": 30792.1632, "train_tokens_per_second": 136000.526 }, { "epoch": 0.64, "grad_norm": 0.5354572534561157, "learning_rate": 8.769207791926338e-05, "loss": 1.1771, "num_input_tokens_seen": 4194304000, "step": 64000, "train_runtime": 30846.5823, "train_tokens_per_second": 135973.054 }, { "epoch": 0.641, "grad_norm": 0.7058772444725037, "learning_rate": 8.725940227409797e-05, "loss": 1.179, "num_input_tokens_seen": 4200857600, "step": 64100, "train_runtime": 30893.4429, "train_tokens_per_second": 135978.94 }, { "epoch": 0.642, "grad_norm": 0.5777366161346436, "learning_rate": 8.682735842605509e-05, "loss": 1.182, "num_input_tokens_seen": 4207411200, "step": 64200, "train_runtime": 30940.3826, "train_tokens_per_second": 135984.459 }, { "epoch": 0.643, "grad_norm": 0.5608710646629333, "learning_rate": 8.639595072581158e-05, "loss": 1.1904, "num_input_tokens_seen": 4213964800, "step": 64300, "train_runtime": 30988.4894, "train_tokens_per_second": 135984.841 }, { "epoch": 0.644, "grad_norm": 0.6048064231872559, "learning_rate": 8.596518351763806e-05, "loss": 1.1851, "num_input_tokens_seen": 4220518400, "step": 64400, "train_runtime": 31041.3711, "train_tokens_per_second": 135964.304 }, { "epoch": 0.645, "grad_norm": 0.47835734486579895, "learning_rate": 8.553506113935561e-05, "loss": 1.1803, "num_input_tokens_seen": 4227072000, "step": 64500, "train_runtime": 31089.9624, "train_tokens_per_second": 135962.596 }, { "epoch": 0.646, "grad_norm": 1.1150704622268677, "learning_rate": 8.510558792229183e-05, "loss": 1.1878, "num_input_tokens_seen": 4233625600, "step": 64600, "train_runtime": 31137.4325, "train_tokens_per_second": 135965.79 }, { "epoch": 0.647, "grad_norm": 0.6650880575180054, "learning_rate": 8.467676819123716e-05, "loss": 1.1951, "num_input_tokens_seen": 4240179200, "step": 64700, "train_runtime": 31185.0957, "train_tokens_per_second": 135968.132 }, { "epoch": 0.648, "grad_norm": 0.7750310897827148, "learning_rate": 8.424860626440158e-05, "loss": 1.1829, "num_input_tokens_seen": 4246732800, "step": 64800, "train_runtime": 31237.5852, "train_tokens_per_second": 135949.459 }, { "epoch": 0.649, "grad_norm": 0.595783531665802, "learning_rate": 8.382110645337102e-05, "loss": 1.1856, "num_input_tokens_seen": 4253286400, "step": 64900, "train_runtime": 31285.0064, "train_tokens_per_second": 135952.87 }, { "epoch": 0.65, "grad_norm": 0.6093938946723938, "learning_rate": 8.339427306306365e-05, "loss": 1.1842, "num_input_tokens_seen": 4259840000, "step": 65000, "train_runtime": 31332.1176, "train_tokens_per_second": 135957.615 }, { "epoch": 0.651, "grad_norm": 0.6823499798774719, "learning_rate": 8.296811039168716e-05, "loss": 1.1818, "num_input_tokens_seen": 4266393600, "step": 65100, "train_runtime": 31381.0925, "train_tokens_per_second": 135954.273 }, { "epoch": 0.652, "grad_norm": 0.5052744746208191, "learning_rate": 8.254262273069477e-05, "loss": 1.2034, "num_input_tokens_seen": 4272947200, "step": 65200, "train_runtime": 31428.8012, "train_tokens_per_second": 135956.417 }, { "epoch": 0.653, "grad_norm": 0.5003641247749329, "learning_rate": 8.211781436474263e-05, "loss": 1.177, "num_input_tokens_seen": 4279500800, "step": 65300, "train_runtime": 31476.0702, "train_tokens_per_second": 135960.454 }, { "epoch": 0.654, "grad_norm": 0.5675527453422546, "learning_rate": 8.169368957164613e-05, "loss": 1.1707, "num_input_tokens_seen": 4286054400, "step": 65400, "train_runtime": 31524.8831, "train_tokens_per_second": 135957.82 }, { "epoch": 0.655, "grad_norm": 0.5109818577766418, "learning_rate": 8.127025262233731e-05, "loss": 1.187, "num_input_tokens_seen": 4292608000, "step": 65500, "train_runtime": 31578.0721, "train_tokens_per_second": 135936.354 }, { "epoch": 0.656, "grad_norm": 0.6228885054588318, "learning_rate": 8.084750778082159e-05, "loss": 1.1944, "num_input_tokens_seen": 4299161600, "step": 65600, "train_runtime": 31626.6624, "train_tokens_per_second": 135934.723 }, { "epoch": 0.657, "grad_norm": 0.6139951348304749, "learning_rate": 8.042545930413473e-05, "loss": 1.1788, "num_input_tokens_seen": 4305715200, "step": 65700, "train_runtime": 31673.2442, "train_tokens_per_second": 135941.717 }, { "epoch": 0.658, "grad_norm": 0.6792371273040771, "learning_rate": 8.000411144230025e-05, "loss": 1.2019, "num_input_tokens_seen": 4312268800, "step": 65800, "train_runtime": 31721.455, "train_tokens_per_second": 135941.709 }, { "epoch": 0.659, "grad_norm": 0.546470582485199, "learning_rate": 7.95834684382865e-05, "loss": 1.1905, "num_input_tokens_seen": 4318822400, "step": 65900, "train_runtime": 31770.1998, "train_tokens_per_second": 135939.416 }, { "epoch": 0.66, "grad_norm": 0.5273057818412781, "learning_rate": 7.916353452796378e-05, "loss": 1.1769, "num_input_tokens_seen": 4325376000, "step": 66000, "train_runtime": 31818.123, "train_tokens_per_second": 135940.64 }, { "epoch": 0.661, "grad_norm": 0.5213398933410645, "learning_rate": 7.874431394006188e-05, "loss": 1.1834, "num_input_tokens_seen": 4331929600, "step": 66100, "train_runtime": 31870.8187, "train_tokens_per_second": 135921.504 }, { "epoch": 0.662, "grad_norm": 0.5762707591056824, "learning_rate": 7.832581089612762e-05, "loss": 1.1875, "num_input_tokens_seen": 4338483200, "step": 66200, "train_runtime": 31918.6258, "train_tokens_per_second": 135923.245 }, { "epoch": 0.663, "grad_norm": 0.6153529286384583, "learning_rate": 7.790802961048183e-05, "loss": 1.1895, "num_input_tokens_seen": 4345036800, "step": 66300, "train_runtime": 31967.5441, "train_tokens_per_second": 135920.257 }, { "epoch": 0.664, "grad_norm": 0.6668293476104736, "learning_rate": 7.749097429017749e-05, "loss": 1.1835, "num_input_tokens_seen": 4351590400, "step": 66400, "train_runtime": 32014.502, "train_tokens_per_second": 135925.6 }, { "epoch": 0.665, "grad_norm": 0.49117180705070496, "learning_rate": 7.70746491349571e-05, "loss": 1.1762, "num_input_tokens_seen": 4358144000, "step": 66500, "train_runtime": 32062.234, "train_tokens_per_second": 135927.646 }, { "epoch": 0.666, "grad_norm": 0.5580335259437561, "learning_rate": 7.665905833721025e-05, "loss": 1.1751, "num_input_tokens_seen": 4364697600, "step": 66600, "train_runtime": 32116.4057, "train_tokens_per_second": 135902.431 }, { "epoch": 0.667, "grad_norm": 0.4941908121109009, "learning_rate": 7.624420608193171e-05, "loss": 1.1991, "num_input_tokens_seen": 4371251200, "step": 66700, "train_runtime": 32164.7962, "train_tokens_per_second": 135901.722 }, { "epoch": 0.668, "grad_norm": 0.5203377604484558, "learning_rate": 7.583009654667912e-05, "loss": 1.1892, "num_input_tokens_seen": 4377804800, "step": 66800, "train_runtime": 32211.7614, "train_tokens_per_second": 135907.029 }, { "epoch": 0.669, "grad_norm": 0.5924380421638489, "learning_rate": 7.541673390153087e-05, "loss": 1.1749, "num_input_tokens_seen": 4384358400, "step": 66900, "train_runtime": 32259.5523, "train_tokens_per_second": 135908.842 }, { "epoch": 0.67, "grad_norm": 0.5180861353874207, "learning_rate": 7.500412230904416e-05, "loss": 1.1833, "num_input_tokens_seen": 4390912000, "step": 67000, "train_runtime": 32305.7062, "train_tokens_per_second": 135917.536 }, { "epoch": 0.671, "grad_norm": 0.5575404167175293, "learning_rate": 7.459226592421318e-05, "loss": 1.1908, "num_input_tokens_seen": 4397465600, "step": 67100, "train_runtime": 32353.5616, "train_tokens_per_second": 135919.058 }, { "epoch": 0.672, "grad_norm": 0.519868016242981, "learning_rate": 7.418116889442721e-05, "loss": 1.191, "num_input_tokens_seen": 4404019200, "step": 67200, "train_runtime": 32407.2129, "train_tokens_per_second": 135896.265 }, { "epoch": 0.673, "grad_norm": 0.5036019086837769, "learning_rate": 7.377083535942868e-05, "loss": 1.1771, "num_input_tokens_seen": 4410572800, "step": 67300, "train_runtime": 32454.4825, "train_tokens_per_second": 135900.266 }, { "epoch": 0.674, "grad_norm": 0.5349675416946411, "learning_rate": 7.336126945127178e-05, "loss": 1.1834, "num_input_tokens_seen": 4417126400, "step": 67400, "train_runtime": 32501.8427, "train_tokens_per_second": 135903.876 }, { "epoch": 0.675, "grad_norm": 0.675538957118988, "learning_rate": 7.29524752942807e-05, "loss": 1.1852, "num_input_tokens_seen": 4423680000, "step": 67500, "train_runtime": 32550.3797, "train_tokens_per_second": 135902.562 }, { "epoch": 0.676, "grad_norm": 0.5116747617721558, "learning_rate": 7.254445700500798e-05, "loss": 1.1816, "num_input_tokens_seen": 4430233600, "step": 67600, "train_runtime": 32598.0387, "train_tokens_per_second": 135904.913 }, { "epoch": 0.677, "grad_norm": 0.5892815589904785, "learning_rate": 7.213721869219329e-05, "loss": 1.1827, "num_input_tokens_seen": 4436787200, "step": 67700, "train_runtime": 32650.3715, "train_tokens_per_second": 135887.802 }, { "epoch": 0.678, "grad_norm": 0.6862092614173889, "learning_rate": 7.173076445672198e-05, "loss": 1.1801, "num_input_tokens_seen": 4443340800, "step": 67800, "train_runtime": 32698.6817, "train_tokens_per_second": 135887.46 }, { "epoch": 0.679, "grad_norm": 0.8308249115943909, "learning_rate": 7.132509839158359e-05, "loss": 1.1887, "num_input_tokens_seen": 4449894400, "step": 67900, "train_runtime": 32745.9782, "train_tokens_per_second": 135891.326 }, { "epoch": 0.68, "grad_norm": 0.5063105225563049, "learning_rate": 7.092022458183096e-05, "loss": 1.1949, "num_input_tokens_seen": 4456448000, "step": 68000, "train_runtime": 32794.3077, "train_tokens_per_second": 135890.9 }, { "epoch": 0.681, "grad_norm": 0.6090216040611267, "learning_rate": 7.051614710453888e-05, "loss": 1.1827, "num_input_tokens_seen": 4463001600, "step": 68100, "train_runtime": 32841.7871, "train_tokens_per_second": 135893.993 }, { "epoch": 0.682, "grad_norm": 0.5802315473556519, "learning_rate": 7.011287002876296e-05, "loss": 1.1808, "num_input_tokens_seen": 4469555200, "step": 68200, "train_runtime": 32889.3297, "train_tokens_per_second": 135896.816 }, { "epoch": 0.683, "grad_norm": 0.5431249141693115, "learning_rate": 6.971039741549894e-05, "loss": 1.1872, "num_input_tokens_seen": 4476108800, "step": 68300, "train_runtime": 32943.0615, "train_tokens_per_second": 135874.099 }, { "epoch": 0.684, "grad_norm": 0.8621413111686707, "learning_rate": 6.930873331764162e-05, "loss": 1.1776, "num_input_tokens_seen": 4482662400, "step": 68400, "train_runtime": 32991.0019, "train_tokens_per_second": 135875.304 }, { "epoch": 0.685, "grad_norm": 0.6102387309074402, "learning_rate": 6.890788177994391e-05, "loss": 1.18, "num_input_tokens_seen": 4489216000, "step": 68500, "train_runtime": 33039.2288, "train_tokens_per_second": 135875.326 }, { "epoch": 0.686, "grad_norm": 0.5266649723052979, "learning_rate": 6.850784683897641e-05, "loss": 1.1743, "num_input_tokens_seen": 4495769600, "step": 68600, "train_runtime": 33086.8363, "train_tokens_per_second": 135877.893 }, { "epoch": 0.687, "grad_norm": 0.5879511833190918, "learning_rate": 6.810863252308653e-05, "loss": 1.1803, "num_input_tokens_seen": 4502323200, "step": 68700, "train_runtime": 33133.6328, "train_tokens_per_second": 135883.778 }, { "epoch": 0.688, "grad_norm": 0.5183672308921814, "learning_rate": 6.771024285235792e-05, "loss": 1.1834, "num_input_tokens_seen": 4508876800, "step": 68800, "train_runtime": 33182.6281, "train_tokens_per_second": 135880.642 }, { "epoch": 0.689, "grad_norm": 0.5091114640235901, "learning_rate": 6.73126818385702e-05, "loss": 1.1913, "num_input_tokens_seen": 4515430400, "step": 68900, "train_runtime": 33236.4019, "train_tokens_per_second": 135857.979 }, { "epoch": 0.69, "grad_norm": 0.7696628570556641, "learning_rate": 6.691595348515837e-05, "loss": 1.1786, "num_input_tokens_seen": 4521984000, "step": 69000, "train_runtime": 33285.7582, "train_tokens_per_second": 135853.417 }, { "epoch": 0.691, "grad_norm": 0.5338857769966125, "learning_rate": 6.65200617871726e-05, "loss": 1.1832, "num_input_tokens_seen": 4528537600, "step": 69100, "train_runtime": 33332.8826, "train_tokens_per_second": 135857.965 }, { "epoch": 0.692, "grad_norm": 0.7705228328704834, "learning_rate": 6.612501073123775e-05, "loss": 1.1762, "num_input_tokens_seen": 4535091200, "step": 69200, "train_runtime": 33380.8611, "train_tokens_per_second": 135859.024 }, { "epoch": 0.693, "grad_norm": 0.5423911213874817, "learning_rate": 6.573080429551368e-05, "loss": 1.19, "num_input_tokens_seen": 4541644800, "step": 69300, "train_runtime": 33429.7481, "train_tokens_per_second": 135856.387 }, { "epoch": 0.694, "grad_norm": 0.5332856774330139, "learning_rate": 6.533744644965482e-05, "loss": 1.1753, "num_input_tokens_seen": 4548198400, "step": 69400, "train_runtime": 33476.6955, "train_tokens_per_second": 135861.629 }, { "epoch": 0.695, "grad_norm": 0.5862846970558167, "learning_rate": 6.494494115477023e-05, "loss": 1.1799, "num_input_tokens_seen": 4554752000, "step": 69500, "train_runtime": 33523.7618, "train_tokens_per_second": 135866.375 }, { "epoch": 0.696, "grad_norm": 0.658592164516449, "learning_rate": 6.455329236338394e-05, "loss": 1.1846, "num_input_tokens_seen": 4561305600, "step": 69600, "train_runtime": 33571.8888, "train_tokens_per_second": 135866.815 }, { "epoch": 0.697, "grad_norm": 0.5558256506919861, "learning_rate": 6.416250401939496e-05, "loss": 1.1873, "num_input_tokens_seen": 4567859200, "step": 69700, "train_runtime": 33620.7189, "train_tokens_per_second": 135864.412 }, { "epoch": 0.698, "grad_norm": 0.5283026099205017, "learning_rate": 6.377258005803746e-05, "loss": 1.1743, "num_input_tokens_seen": 4574412800, "step": 69800, "train_runtime": 33674.4741, "train_tokens_per_second": 135842.145 }, { "epoch": 0.699, "grad_norm": 0.802412211894989, "learning_rate": 6.338352440584149e-05, "loss": 1.1782, "num_input_tokens_seen": 4580966400, "step": 69900, "train_runtime": 33722.7187, "train_tokens_per_second": 135842.144 }, { "epoch": 0.7, "grad_norm": 0.5585867762565613, "learning_rate": 6.299534098059318e-05, "loss": 1.1809, "num_input_tokens_seen": 4587520000, "step": 70000, "train_runtime": 33770.2671, "train_tokens_per_second": 135844.943 }, { "epoch": 0.701, "grad_norm": 0.6285941004753113, "learning_rate": 6.260803369129522e-05, "loss": 1.1807, "num_input_tokens_seen": 4594073600, "step": 70100, "train_runtime": 33818.7011, "train_tokens_per_second": 135844.176 }, { "epoch": 0.702, "grad_norm": 0.9580085277557373, "learning_rate": 6.222160643812774e-05, "loss": 1.1802, "num_input_tokens_seen": 4600627200, "step": 70200, "train_runtime": 33866.618, "train_tokens_per_second": 135845.487 }, { "epoch": 0.703, "grad_norm": 0.6520081162452698, "learning_rate": 6.183606311240901e-05, "loss": 1.1879, "num_input_tokens_seen": 4607180800, "step": 70300, "train_runtime": 33915.2388, "train_tokens_per_second": 135843.973 }, { "epoch": 0.704, "grad_norm": 0.520710289478302, "learning_rate": 6.145140759655585e-05, "loss": 1.179, "num_input_tokens_seen": 4613734400, "step": 70400, "train_runtime": 33968.6026, "train_tokens_per_second": 135823.497 }, { "epoch": 0.705, "grad_norm": 0.5945906639099121, "learning_rate": 6.10676437640451e-05, "loss": 1.192, "num_input_tokens_seen": 4620288000, "step": 70500, "train_runtime": 34016.7254, "train_tokens_per_second": 135824.008 }, { "epoch": 0.706, "grad_norm": 0.5285692811012268, "learning_rate": 6.068477547937436e-05, "loss": 1.1855, "num_input_tokens_seen": 4626841600, "step": 70600, "train_runtime": 34064.6033, "train_tokens_per_second": 135825.495 }, { "epoch": 0.707, "grad_norm": 0.6492000222206116, "learning_rate": 6.030280659802294e-05, "loss": 1.192, "num_input_tokens_seen": 4633395200, "step": 70700, "train_runtime": 34111.1694, "train_tokens_per_second": 135832.2 }, { "epoch": 0.708, "grad_norm": 0.5521112084388733, "learning_rate": 5.9921740966413204e-05, "loss": 1.1781, "num_input_tokens_seen": 4639948800, "step": 70800, "train_runtime": 34162.8893, "train_tokens_per_second": 135818.395 }, { "epoch": 0.709, "grad_norm": 0.9012600183486938, "learning_rate": 5.954158242187197e-05, "loss": 1.1748, "num_input_tokens_seen": 4646502400, "step": 70900, "train_runtime": 34211.5739, "train_tokens_per_second": 135816.68 }, { "epoch": 0.71, "grad_norm": 0.4976861774921417, "learning_rate": 5.91623347925914e-05, "loss": 1.1902, "num_input_tokens_seen": 4653056000, "step": 71000, "train_runtime": 34258.4131, "train_tokens_per_second": 135822.287 }, { "epoch": 0.711, "grad_norm": 0.5690837502479553, "learning_rate": 5.8784001897590996e-05, "loss": 1.1767, "num_input_tokens_seen": 4659609600, "step": 71100, "train_runtime": 34307.7023, "train_tokens_per_second": 135818.177 }, { "epoch": 0.712, "grad_norm": 0.5648302435874939, "learning_rate": 5.840658754667877e-05, "loss": 1.182, "num_input_tokens_seen": 4666163200, "step": 71200, "train_runtime": 34355.8058, "train_tokens_per_second": 135818.768 }, { "epoch": 0.713, "grad_norm": 0.5309351086616516, "learning_rate": 5.8030095540413144e-05, "loss": 1.1755, "num_input_tokens_seen": 4672716800, "step": 71300, "train_runtime": 34402.7961, "train_tokens_per_second": 135823.751 }, { "epoch": 0.714, "grad_norm": 1.0066486597061157, "learning_rate": 5.7654529670064326e-05, "loss": 1.2073, "num_input_tokens_seen": 4679270400, "step": 71400, "train_runtime": 34458.8447, "train_tokens_per_second": 135793.015 }, { "epoch": 0.715, "grad_norm": 0.625823974609375, "learning_rate": 5.7279893717576485e-05, "loss": 1.2012, "num_input_tokens_seen": 4685824000, "step": 71500, "train_runtime": 34506.5957, "train_tokens_per_second": 135795.024 }, { "epoch": 0.716, "grad_norm": 0.512055516242981, "learning_rate": 5.690619145552958e-05, "loss": 1.1702, "num_input_tokens_seen": 4692377600, "step": 71600, "train_runtime": 34554.5393, "train_tokens_per_second": 135796.271 }, { "epoch": 0.717, "grad_norm": 0.749454915523529, "learning_rate": 5.6533426647101135e-05, "loss": 1.1812, "num_input_tokens_seen": 4698931200, "step": 71700, "train_runtime": 34601.4153, "train_tokens_per_second": 135801.705 }, { "epoch": 0.718, "grad_norm": 0.5417782068252563, "learning_rate": 5.6161603046028674e-05, "loss": 1.1681, "num_input_tokens_seen": 4705484800, "step": 71800, "train_runtime": 34650.0822, "train_tokens_per_second": 135800.105 }, { "epoch": 0.719, "grad_norm": 0.7127480506896973, "learning_rate": 5.579072439657179e-05, "loss": 1.1946, "num_input_tokens_seen": 4712038400, "step": 71900, "train_runtime": 34698.539, "train_tokens_per_second": 135799.331 }, { "epoch": 0.72, "grad_norm": 0.5434790253639221, "learning_rate": 5.542079443347431e-05, "loss": 1.1761, "num_input_tokens_seen": 4718592000, "step": 72000, "train_runtime": 34745.7766, "train_tokens_per_second": 135803.325 }, { "epoch": 0.721, "grad_norm": 0.5872786045074463, "learning_rate": 5.505181688192682e-05, "loss": 1.1758, "num_input_tokens_seen": 4725145600, "step": 72100, "train_runtime": 34797.942, "train_tokens_per_second": 135788.076 }, { "epoch": 0.722, "grad_norm": 0.5440493822097778, "learning_rate": 5.468379545752925e-05, "loss": 1.2086, "num_input_tokens_seen": 4731699200, "step": 72200, "train_runtime": 34846.6082, "train_tokens_per_second": 135786.507 }, { "epoch": 0.723, "grad_norm": 0.5699992775917053, "learning_rate": 5.4316733866253166e-05, "loss": 1.1705, "num_input_tokens_seen": 4738252800, "step": 72300, "train_runtime": 34894.2941, "train_tokens_per_second": 135788.756 }, { "epoch": 0.724, "grad_norm": 0.7067492604255676, "learning_rate": 5.3950635804404754e-05, "loss": 1.1788, "num_input_tokens_seen": 4744806400, "step": 72400, "train_runtime": 34943.1279, "train_tokens_per_second": 135786.539 }, { "epoch": 0.725, "grad_norm": 0.4926595389842987, "learning_rate": 5.358550495858751e-05, "loss": 1.1712, "num_input_tokens_seen": 4751360000, "step": 72500, "train_runtime": 34988.8033, "train_tokens_per_second": 135796.585 }, { "epoch": 0.726, "grad_norm": 0.6217764616012573, "learning_rate": 5.322134500566487e-05, "loss": 1.199, "num_input_tokens_seen": 4757913600, "step": 72600, "train_runtime": 35043.098, "train_tokens_per_second": 135773.201 }, { "epoch": 0.727, "grad_norm": 0.5704054236412048, "learning_rate": 5.285815961272359e-05, "loss": 1.1782, "num_input_tokens_seen": 4764467200, "step": 72700, "train_runtime": 35090.0359, "train_tokens_per_second": 135778.351 }, { "epoch": 0.728, "grad_norm": 0.6081520915031433, "learning_rate": 5.249595243703658e-05, "loss": 1.1679, "num_input_tokens_seen": 4771020800, "step": 72800, "train_runtime": 35136.6254, "train_tokens_per_second": 135784.833 }, { "epoch": 0.729, "grad_norm": 0.6235555410385132, "learning_rate": 5.213472712602598e-05, "loss": 1.1707, "num_input_tokens_seen": 4777574400, "step": 72900, "train_runtime": 35185.4188, "train_tokens_per_second": 135782.792 }, { "epoch": 0.73, "grad_norm": 0.5777461528778076, "learning_rate": 5.17744873172267e-05, "loss": 1.1816, "num_input_tokens_seen": 4784128000, "step": 73000, "train_runtime": 35238.2318, "train_tokens_per_second": 135765.268 }, { "epoch": 0.731, "grad_norm": 0.569218635559082, "learning_rate": 5.1415236638249694e-05, "loss": 1.1757, "num_input_tokens_seen": 4790681600, "step": 73100, "train_runtime": 35286.0257, "train_tokens_per_second": 135767.106 }, { "epoch": 0.732, "grad_norm": 1.2679173946380615, "learning_rate": 5.105697870674519e-05, "loss": 1.1686, "num_input_tokens_seen": 4797235200, "step": 73200, "train_runtime": 35333.5517, "train_tokens_per_second": 135769.969 }, { "epoch": 0.733, "grad_norm": 0.5663115382194519, "learning_rate": 5.069971713036664e-05, "loss": 1.1699, "num_input_tokens_seen": 4803788800, "step": 73300, "train_runtime": 35380.3642, "train_tokens_per_second": 135775.561 }, { "epoch": 0.734, "grad_norm": 0.5404617190361023, "learning_rate": 5.034345550673415e-05, "loss": 1.1916, "num_input_tokens_seen": 4810342400, "step": 73400, "train_runtime": 35434.8234, "train_tokens_per_second": 135751.838 }, { "epoch": 0.735, "grad_norm": 0.7994534373283386, "learning_rate": 4.998819742339835e-05, "loss": 1.1842, "num_input_tokens_seen": 4816896000, "step": 73500, "train_runtime": 35482.3263, "train_tokens_per_second": 135754.797 }, { "epoch": 0.736, "grad_norm": 0.6482565402984619, "learning_rate": 4.963394645780411e-05, "loss": 1.1789, "num_input_tokens_seen": 4823449600, "step": 73600, "train_runtime": 35530.782, "train_tokens_per_second": 135754.107 }, { "epoch": 0.737, "grad_norm": 0.5401994585990906, "learning_rate": 4.928070617725482e-05, "loss": 1.1832, "num_input_tokens_seen": 4830003200, "step": 73700, "train_runtime": 35578.1016, "train_tokens_per_second": 135757.755 }, { "epoch": 0.738, "grad_norm": 0.5170857906341553, "learning_rate": 4.892848013887613e-05, "loss": 1.1804, "num_input_tokens_seen": 4836556800, "step": 73800, "train_runtime": 35625.1017, "train_tokens_per_second": 135762.61 }, { "epoch": 0.739, "grad_norm": 0.5744811296463013, "learning_rate": 4.857727188958031e-05, "loss": 1.181, "num_input_tokens_seen": 4843110400, "step": 73900, "train_runtime": 35672.7413, "train_tokens_per_second": 135765.019 }, { "epoch": 0.74, "grad_norm": 0.6613340377807617, "learning_rate": 4.822708496603052e-05, "loss": 1.1879, "num_input_tokens_seen": 4849664000, "step": 74000, "train_runtime": 35721.0554, "train_tokens_per_second": 135764.858 }, { "epoch": 0.741, "grad_norm": 0.5571849346160889, "learning_rate": 4.7877922894605304e-05, "loss": 1.1781, "num_input_tokens_seen": 4856217600, "step": 74100, "train_runtime": 35771.1997, "train_tokens_per_second": 135757.75 }, { "epoch": 0.742, "grad_norm": 0.6960323452949524, "learning_rate": 4.752978919136273e-05, "loss": 1.1702, "num_input_tokens_seen": 4862771200, "step": 74200, "train_runtime": 35823.4168, "train_tokens_per_second": 135742.808 }, { "epoch": 0.743, "grad_norm": 0.5823075175285339, "learning_rate": 4.7182687362005337e-05, "loss": 1.1762, "num_input_tokens_seen": 4869324800, "step": 74300, "train_runtime": 35872.0393, "train_tokens_per_second": 135741.511 }, { "epoch": 0.744, "grad_norm": 0.5310567021369934, "learning_rate": 4.6836620901844794e-05, "loss": 1.1737, "num_input_tokens_seen": 4875878400, "step": 74400, "train_runtime": 35918.2124, "train_tokens_per_second": 135749.473 }, { "epoch": 0.745, "grad_norm": 0.560118556022644, "learning_rate": 4.64915932957664e-05, "loss": 1.1746, "num_input_tokens_seen": 4882432000, "step": 74500, "train_runtime": 35972.3831, "train_tokens_per_second": 135727.232 }, { "epoch": 0.746, "grad_norm": 0.5729120969772339, "learning_rate": 4.614760801819433e-05, "loss": 1.1729, "num_input_tokens_seen": 4888985600, "step": 74600, "train_runtime": 36018.4093, "train_tokens_per_second": 135735.744 }, { "epoch": 0.747, "grad_norm": 0.5329717397689819, "learning_rate": 4.58046685330566e-05, "loss": 1.1969, "num_input_tokens_seen": 4895539200, "step": 74700, "train_runtime": 36066.8487, "train_tokens_per_second": 135735.152 }, { "epoch": 0.748, "grad_norm": 0.5714908838272095, "learning_rate": 4.546277829374993e-05, "loss": 1.172, "num_input_tokens_seen": 4902092800, "step": 74800, "train_runtime": 36115.3648, "train_tokens_per_second": 135734.273 }, { "epoch": 0.749, "grad_norm": 0.5672817826271057, "learning_rate": 4.5121940743105246e-05, "loss": 1.1813, "num_input_tokens_seen": 4908646400, "step": 74900, "train_runtime": 36164.0493, "train_tokens_per_second": 135732.765 }, { "epoch": 0.75, "grad_norm": 0.5890370607376099, "learning_rate": 4.478215931335295e-05, "loss": 1.1667, "num_input_tokens_seen": 4915200000, "step": 75000, "train_runtime": 36215.8524, "train_tokens_per_second": 135719.572 }, { "epoch": 0.751, "grad_norm": 0.6215245127677917, "learning_rate": 4.4443437426088205e-05, "loss": 1.179, "num_input_tokens_seen": 4921753600, "step": 75100, "train_runtime": 36264.1849, "train_tokens_per_second": 135719.405 }, { "epoch": 0.752, "grad_norm": 1.4719446897506714, "learning_rate": 4.410577849223666e-05, "loss": 1.1847, "num_input_tokens_seen": 4928307200, "step": 75200, "train_runtime": 36312.9781, "train_tokens_per_second": 135717.516 }, { "epoch": 0.753, "grad_norm": 1.3475043773651123, "learning_rate": 4.376918591202006e-05, "loss": 1.1745, "num_input_tokens_seen": 4934860800, "step": 75300, "train_runtime": 36359.7761, "train_tokens_per_second": 135723.08 }, { "epoch": 0.754, "grad_norm": 0.9558594822883606, "learning_rate": 4.3433663074922046e-05, "loss": 1.181, "num_input_tokens_seen": 4941414400, "step": 75400, "train_runtime": 36406.8385, "train_tokens_per_second": 135727.644 }, { "epoch": 0.755, "grad_norm": 0.5916360020637512, "learning_rate": 4.309921335965367e-05, "loss": 1.1706, "num_input_tokens_seen": 4947968000, "step": 75500, "train_runtime": 36460.2599, "train_tokens_per_second": 135708.522 }, { "epoch": 0.756, "grad_norm": 0.5985275506973267, "learning_rate": 4.276584013411992e-05, "loss": 1.1758, "num_input_tokens_seen": 4954521600, "step": 75600, "train_runtime": 36507.6786, "train_tokens_per_second": 135711.768 }, { "epoch": 0.757, "grad_norm": 0.5550095438957214, "learning_rate": 4.243354675538555e-05, "loss": 1.1705, "num_input_tokens_seen": 4961075200, "step": 75700, "train_runtime": 36554.9962, "train_tokens_per_second": 135715.38 }, { "epoch": 0.758, "grad_norm": 0.5496001243591309, "learning_rate": 4.210233656964111e-05, "loss": 1.1746, "num_input_tokens_seen": 4967628800, "step": 75800, "train_runtime": 36602.3493, "train_tokens_per_second": 135718.851 }, { "epoch": 0.759, "grad_norm": 0.570070743560791, "learning_rate": 4.1772212912169516e-05, "loss": 1.1771, "num_input_tokens_seen": 4974182400, "step": 75900, "train_runtime": 36656.3482, "train_tokens_per_second": 135697.707 }, { "epoch": 0.76, "grad_norm": 0.7570028305053711, "learning_rate": 4.14431791073124e-05, "loss": 1.1756, "num_input_tokens_seen": 4980736000, "step": 76000, "train_runtime": 36704.1036, "train_tokens_per_second": 135699.704 }, { "epoch": 0.761, "grad_norm": 0.6243161559104919, "learning_rate": 4.111523846843639e-05, "loss": 1.1667, "num_input_tokens_seen": 4987289600, "step": 76100, "train_runtime": 36753.037, "train_tokens_per_second": 135697.347 }, { "epoch": 0.762, "grad_norm": 0.5531216263771057, "learning_rate": 4.078839429790019e-05, "loss": 1.1755, "num_input_tokens_seen": 4993843200, "step": 76200, "train_runtime": 36800.3039, "train_tokens_per_second": 135701.14 }, { "epoch": 0.763, "grad_norm": 0.5894837379455566, "learning_rate": 4.046264988702097e-05, "loss": 1.1778, "num_input_tokens_seen": 5000396800, "step": 76300, "train_runtime": 36847.8696, "train_tokens_per_second": 135703.824 }, { "epoch": 0.764, "grad_norm": 0.6210083365440369, "learning_rate": 4.013800851604123e-05, "loss": 1.1729, "num_input_tokens_seen": 5006950400, "step": 76400, "train_runtime": 36901.2456, "train_tokens_per_second": 135685.133 }, { "epoch": 0.765, "grad_norm": 0.5929700136184692, "learning_rate": 3.981447345409606e-05, "loss": 1.171, "num_input_tokens_seen": 5013504000, "step": 76500, "train_runtime": 36949.2788, "train_tokens_per_second": 135686.113 }, { "epoch": 0.766, "grad_norm": 0.5809143781661987, "learning_rate": 3.949204795917995e-05, "loss": 1.1775, "num_input_tokens_seen": 5020057600, "step": 76600, "train_runtime": 36996.6957, "train_tokens_per_second": 135689.35 }, { "epoch": 0.767, "grad_norm": 0.5398791432380676, "learning_rate": 3.917073527811399e-05, "loss": 1.1765, "num_input_tokens_seen": 5026611200, "step": 76700, "train_runtime": 37044.9859, "train_tokens_per_second": 135689.381 }, { "epoch": 0.768, "grad_norm": 0.8559983372688293, "learning_rate": 3.885053864651334e-05, "loss": 1.1661, "num_input_tokens_seen": 5033164800, "step": 76800, "train_runtime": 37092.5707, "train_tokens_per_second": 135691.992 }, { "epoch": 0.769, "grad_norm": 1.0961577892303467, "learning_rate": 3.8531461288754564e-05, "loss": 1.1734, "num_input_tokens_seen": 5039718400, "step": 76900, "train_runtime": 37145.642, "train_tokens_per_second": 135674.554 }, { "epoch": 0.77, "grad_norm": 0.5564078688621521, "learning_rate": 3.821350641794305e-05, "loss": 1.1783, "num_input_tokens_seen": 5046272000, "step": 77000, "train_runtime": 37194.2194, "train_tokens_per_second": 135673.556 }, { "epoch": 0.771, "grad_norm": 0.6036384701728821, "learning_rate": 3.789667723588087e-05, "loss": 1.1651, "num_input_tokens_seen": 5052825600, "step": 77100, "train_runtime": 37242.6728, "train_tokens_per_second": 135673.012 }, { "epoch": 0.772, "grad_norm": 1.4465519189834595, "learning_rate": 3.758097693303431e-05, "loss": 1.1783, "num_input_tokens_seen": 5059379200, "step": 77200, "train_runtime": 37290.7014, "train_tokens_per_second": 135674.015 }, { "epoch": 0.773, "grad_norm": 0.5566693544387817, "learning_rate": 3.7266408688502005e-05, "loss": 1.1751, "num_input_tokens_seen": 5065932800, "step": 77300, "train_runtime": 37338.6452, "train_tokens_per_second": 135675.324 }, { "epoch": 0.774, "grad_norm": 0.653806209564209, "learning_rate": 3.695297566998256e-05, "loss": 1.1709, "num_input_tokens_seen": 5072486400, "step": 77400, "train_runtime": 37386.3122, "train_tokens_per_second": 135677.634 }, { "epoch": 0.775, "grad_norm": 0.8704593777656555, "learning_rate": 3.664068103374307e-05, "loss": 1.1794, "num_input_tokens_seen": 5079040000, "step": 77500, "train_runtime": 37436.1356, "train_tokens_per_second": 135672.123 }, { "epoch": 0.776, "grad_norm": 0.6627979874610901, "learning_rate": 3.63295279245871e-05, "loss": 1.175, "num_input_tokens_seen": 5085593600, "step": 77600, "train_runtime": 37484.0969, "train_tokens_per_second": 135673.366 }, { "epoch": 0.777, "grad_norm": 0.6232652068138123, "learning_rate": 3.601951947582291e-05, "loss": 1.1665, "num_input_tokens_seen": 5092147200, "step": 77700, "train_runtime": 37536.8508, "train_tokens_per_second": 135657.283 }, { "epoch": 0.778, "grad_norm": 0.5873488187789917, "learning_rate": 3.571065880923216e-05, "loss": 1.1734, "num_input_tokens_seen": 5098700800, "step": 77800, "train_runtime": 37584.0839, "train_tokens_per_second": 135661.17 }, { "epoch": 0.779, "grad_norm": 0.56858891248703, "learning_rate": 3.540294903503841e-05, "loss": 1.1696, "num_input_tokens_seen": 5105254400, "step": 77900, "train_runtime": 37631.6286, "train_tokens_per_second": 135663.924 }, { "epoch": 0.78, "grad_norm": 0.5939886569976807, "learning_rate": 3.5096393251875566e-05, "loss": 1.1784, "num_input_tokens_seen": 5111808000, "step": 78000, "train_runtime": 37679.4424, "train_tokens_per_second": 135665.702 }, { "epoch": 0.781, "grad_norm": 0.5839298367500305, "learning_rate": 3.479099454675701e-05, "loss": 1.1672, "num_input_tokens_seen": 5118361600, "step": 78100, "train_runtime": 37733.7363, "train_tokens_per_second": 135644.177 }, { "epoch": 0.782, "grad_norm": 0.6057742238044739, "learning_rate": 3.448675599504434e-05, "loss": 1.1767, "num_input_tokens_seen": 5124915200, "step": 78200, "train_runtime": 37781.8162, "train_tokens_per_second": 135645.019 }, { "epoch": 0.783, "grad_norm": 0.9875990748405457, "learning_rate": 3.418368066041633e-05, "loss": 1.1619, "num_input_tokens_seen": 5131468800, "step": 78300, "train_runtime": 37829.8727, "train_tokens_per_second": 135645.944 }, { "epoch": 0.784, "grad_norm": 0.5806832313537598, "learning_rate": 3.388177159483826e-05, "loss": 1.1747, "num_input_tokens_seen": 5138022400, "step": 78400, "train_runtime": 37877.7351, "train_tokens_per_second": 135647.561 }, { "epoch": 0.785, "grad_norm": 0.7016937136650085, "learning_rate": 3.3581031838531116e-05, "loss": 1.1664, "num_input_tokens_seen": 5144576000, "step": 78500, "train_runtime": 37924.0105, "train_tokens_per_second": 135654.851 }, { "epoch": 0.786, "grad_norm": 0.7171750664710999, "learning_rate": 3.328146441994084e-05, "loss": 1.1905, "num_input_tokens_seen": 5151129600, "step": 78600, "train_runtime": 37971.9481, "train_tokens_per_second": 135656.184 }, { "epoch": 0.787, "grad_norm": 0.5550017356872559, "learning_rate": 3.2983072355708026e-05, "loss": 1.1741, "num_input_tokens_seen": 5157683200, "step": 78700, "train_runtime": 38021.3003, "train_tokens_per_second": 135652.467 }, { "epoch": 0.788, "grad_norm": 0.5833317637443542, "learning_rate": 3.2685858650637486e-05, "loss": 1.176, "num_input_tokens_seen": 5164236800, "step": 78800, "train_runtime": 38074.1209, "train_tokens_per_second": 135636.403 }, { "epoch": 0.789, "grad_norm": 0.9918714165687561, "learning_rate": 3.238982629766793e-05, "loss": 1.1653, "num_input_tokens_seen": 5170790400, "step": 78900, "train_runtime": 38121.5575, "train_tokens_per_second": 135639.537 }, { "epoch": 0.79, "grad_norm": 1.2304959297180176, "learning_rate": 3.209497827784177e-05, "loss": 1.177, "num_input_tokens_seen": 5177344000, "step": 79000, "train_runtime": 38168.8984, "train_tokens_per_second": 135643.003 }, { "epoch": 0.791, "grad_norm": 0.5920888185501099, "learning_rate": 3.1801317560275394e-05, "loss": 1.1717, "num_input_tokens_seen": 5183897600, "step": 79100, "train_runtime": 38223.2691, "train_tokens_per_second": 135621.513 }, { "epoch": 0.792, "grad_norm": 0.5991621017456055, "learning_rate": 3.150884710212895e-05, "loss": 1.1933, "num_input_tokens_seen": 5190451200, "step": 79200, "train_runtime": 38270.5225, "train_tokens_per_second": 135625.303 }, { "epoch": 0.793, "grad_norm": 0.6007819175720215, "learning_rate": 3.121756984857665e-05, "loss": 1.1721, "num_input_tokens_seen": 5197004800, "step": 79300, "train_runtime": 38316.5535, "train_tokens_per_second": 135633.41 }, { "epoch": 0.794, "grad_norm": 0.6040635704994202, "learning_rate": 3.092748873277725e-05, "loss": 1.1784, "num_input_tokens_seen": 5203558400, "step": 79400, "train_runtime": 38364.1371, "train_tokens_per_second": 135636.008 }, { "epoch": 0.795, "grad_norm": 1.8925070762634277, "learning_rate": 3.06386066758444e-05, "loss": 1.179, "num_input_tokens_seen": 5210112000, "step": 79500, "train_runtime": 38412.6561, "train_tokens_per_second": 135635.297 }, { "epoch": 0.796, "grad_norm": 0.6026915311813354, "learning_rate": 3.0350926586817127e-05, "loss": 1.1706, "num_input_tokens_seen": 5216665600, "step": 79600, "train_runtime": 38465.3514, "train_tokens_per_second": 135619.861 }, { "epoch": 0.797, "grad_norm": 0.7981861233711243, "learning_rate": 3.0064451362630765e-05, "loss": 1.1796, "num_input_tokens_seen": 5223219200, "step": 79700, "train_runtime": 38512.271, "train_tokens_per_second": 135624.804 }, { "epoch": 0.798, "grad_norm": 1.3739973306655884, "learning_rate": 2.9779183888087683e-05, "loss": 1.1827, "num_input_tokens_seen": 5229772800, "step": 79800, "train_runtime": 38560.5377, "train_tokens_per_second": 135624.997 }, { "epoch": 0.799, "grad_norm": 0.7507041692733765, "learning_rate": 2.9495127035828103e-05, "loss": 1.164, "num_input_tokens_seen": 5236326400, "step": 79900, "train_runtime": 38608.5419, "train_tokens_per_second": 135626.111 }, { "epoch": 0.8, "grad_norm": 0.5848426818847656, "learning_rate": 2.921228366630144e-05, "loss": 1.1746, "num_input_tokens_seen": 5242880000, "step": 80000, "train_runtime": 38660.3487, "train_tokens_per_second": 135613.883 }, { "epoch": 0.801, "grad_norm": 0.5851396322250366, "learning_rate": 2.8930656627737276e-05, "loss": 1.1999, "num_input_tokens_seen": 5249433600, "step": 80100, "train_runtime": 38707.849, "train_tokens_per_second": 135616.774 }, { "epoch": 0.802, "grad_norm": 0.5581755638122559, "learning_rate": 2.8650248756116727e-05, "loss": 1.1657, "num_input_tokens_seen": 5255987200, "step": 80200, "train_runtime": 38755.0614, "train_tokens_per_second": 135620.665 }, { "epoch": 0.803, "grad_norm": 0.8737390637397766, "learning_rate": 2.8371062875143968e-05, "loss": 1.168, "num_input_tokens_seen": 5262540800, "step": 80300, "train_runtime": 38809.0814, "train_tokens_per_second": 135600.757 }, { "epoch": 0.804, "grad_norm": 0.6018446683883667, "learning_rate": 2.809310179621776e-05, "loss": 1.1603, "num_input_tokens_seen": 5269094400, "step": 80400, "train_runtime": 38856.5205, "train_tokens_per_second": 135603.866 }, { "epoch": 0.805, "grad_norm": 0.5673835873603821, "learning_rate": 2.781636831840303e-05, "loss": 1.1748, "num_input_tokens_seen": 5275648000, "step": 80500, "train_runtime": 38904.9212, "train_tokens_per_second": 135603.616 }, { "epoch": 0.806, "grad_norm": 0.5929433703422546, "learning_rate": 2.754086522840282e-05, "loss": 1.1663, "num_input_tokens_seen": 5282201600, "step": 80600, "train_runtime": 38952.3955, "train_tokens_per_second": 135606.592 }, { "epoch": 0.807, "grad_norm": 0.555366039276123, "learning_rate": 2.7266595300530204e-05, "loss": 1.1665, "num_input_tokens_seen": 5288755200, "step": 80700, "train_runtime": 39001.4372, "train_tokens_per_second": 135604.11 }, { "epoch": 0.808, "grad_norm": 0.5364073514938354, "learning_rate": 2.6993561296680342e-05, "loss": 1.1687, "num_input_tokens_seen": 5295308800, "step": 80800, "train_runtime": 39048.23, "train_tokens_per_second": 135609.445 }, { "epoch": 0.809, "grad_norm": 0.9588598608970642, "learning_rate": 2.672176596630258e-05, "loss": 1.1831, "num_input_tokens_seen": 5301862400, "step": 80900, "train_runtime": 39096.7929, "train_tokens_per_second": 135608.627 }, { "epoch": 0.81, "grad_norm": 0.6481744050979614, "learning_rate": 2.6451212046372883e-05, "loss": 1.1686, "num_input_tokens_seen": 5308416000, "step": 81000, "train_runtime": 39152.1435, "train_tokens_per_second": 135584.301 }, { "epoch": 0.811, "grad_norm": 0.5828465819358826, "learning_rate": 2.6181902261366256e-05, "loss": 1.1662, "num_input_tokens_seen": 5314969600, "step": 81100, "train_runtime": 39199.715, "train_tokens_per_second": 135586.945 }, { "epoch": 0.812, "grad_norm": 0.5715954899787903, "learning_rate": 2.5913839323229195e-05, "loss": 1.1623, "num_input_tokens_seen": 5321523200, "step": 81200, "train_runtime": 39246.528, "train_tokens_per_second": 135592.203 }, { "epoch": 0.813, "grad_norm": 0.8631576299667358, "learning_rate": 2.564702593135253e-05, "loss": 1.1896, "num_input_tokens_seen": 5328076800, "step": 81300, "train_runtime": 39294.7731, "train_tokens_per_second": 135592.507 }, { "epoch": 0.814, "grad_norm": 0.5882650017738342, "learning_rate": 2.538146477254419e-05, "loss": 1.1728, "num_input_tokens_seen": 5334630400, "step": 81400, "train_runtime": 39341.8017, "train_tokens_per_second": 135597.003 }, { "epoch": 0.815, "grad_norm": 0.5567020773887634, "learning_rate": 2.5117158521002033e-05, "loss": 1.1669, "num_input_tokens_seen": 5341184000, "step": 81500, "train_runtime": 39389.3033, "train_tokens_per_second": 135599.86 }, { "epoch": 0.816, "grad_norm": 0.7412062883377075, "learning_rate": 2.4854109838287116e-05, "loss": 1.1629, "num_input_tokens_seen": 5347737600, "step": 81600, "train_runtime": 39443.4282, "train_tokens_per_second": 135579.939 }, { "epoch": 0.817, "grad_norm": 0.6353700757026672, "learning_rate": 2.459232137329679e-05, "loss": 1.1676, "num_input_tokens_seen": 5354291200, "step": 81700, "train_runtime": 39490.3956, "train_tokens_per_second": 135584.643 }, { "epoch": 0.818, "grad_norm": 0.6541226506233215, "learning_rate": 2.4331795762237894e-05, "loss": 1.1669, "num_input_tokens_seen": 5360844800, "step": 81800, "train_runtime": 39539.3049, "train_tokens_per_second": 135582.677 }, { "epoch": 0.819, "grad_norm": 0.684333086013794, "learning_rate": 2.4072535628600514e-05, "loss": 1.1623, "num_input_tokens_seen": 5367398400, "step": 81900, "train_runtime": 39587.3713, "train_tokens_per_second": 135583.602 }, { "epoch": 0.82, "grad_norm": 0.5568915605545044, "learning_rate": 2.3814543583131306e-05, "loss": 1.1662, "num_input_tokens_seen": 5373952000, "step": 82000, "train_runtime": 39636.1132, "train_tokens_per_second": 135582.214 }, { "epoch": 0.821, "grad_norm": 0.6357592940330505, "learning_rate": 2.3557822223807287e-05, "loss": 1.1617, "num_input_tokens_seen": 5380505600, "step": 82100, "train_runtime": 39683.9299, "train_tokens_per_second": 135583.991 }, { "epoch": 0.822, "grad_norm": 0.6660736203193665, "learning_rate": 2.3302374135809727e-05, "loss": 1.1788, "num_input_tokens_seen": 5387059200, "step": 82200, "train_runtime": 39731.7683, "train_tokens_per_second": 135585.69 }, { "epoch": 0.823, "grad_norm": 0.6093869805335999, "learning_rate": 2.304820189149798e-05, "loss": 1.1823, "num_input_tokens_seen": 5393612800, "step": 82300, "train_runtime": 39780.5498, "train_tokens_per_second": 135584.169 }, { "epoch": 0.824, "grad_norm": 1.0343610048294067, "learning_rate": 2.2795308050383787e-05, "loss": 1.1942, "num_input_tokens_seen": 5400166400, "step": 82400, "train_runtime": 39833.9775, "train_tokens_per_second": 135566.839 }, { "epoch": 0.825, "grad_norm": 0.5363211035728455, "learning_rate": 2.2543695159105248e-05, "loss": 1.1659, "num_input_tokens_seen": 5406720000, "step": 82500, "train_runtime": 39881.8503, "train_tokens_per_second": 135568.434 }, { "epoch": 0.826, "grad_norm": 0.9732265472412109, "learning_rate": 2.2293365751401443e-05, "loss": 1.1757, "num_input_tokens_seen": 5413273600, "step": 82600, "train_runtime": 39929.975, "train_tokens_per_second": 135569.171 }, { "epoch": 0.827, "grad_norm": 0.5309200286865234, "learning_rate": 2.2044322348086735e-05, "loss": 1.1651, "num_input_tokens_seen": 5419827200, "step": 82700, "train_runtime": 39978.229, "train_tokens_per_second": 135569.467 }, { "epoch": 0.828, "grad_norm": 0.543769121170044, "learning_rate": 2.1796567457025372e-05, "loss": 1.1685, "num_input_tokens_seen": 5426380800, "step": 82800, "train_runtime": 40026.0125, "train_tokens_per_second": 135571.356 }, { "epoch": 0.829, "grad_norm": 0.5210631489753723, "learning_rate": 2.15501035731064e-05, "loss": 1.1778, "num_input_tokens_seen": 5432934400, "step": 82900, "train_runtime": 40075.0654, "train_tokens_per_second": 135568.947 }, { "epoch": 0.83, "grad_norm": 1.3538480997085571, "learning_rate": 2.1304933178218426e-05, "loss": 1.1655, "num_input_tokens_seen": 5439488000, "step": 83000, "train_runtime": 40123.2015, "train_tokens_per_second": 135569.64 }, { "epoch": 0.831, "grad_norm": 1.2901802062988281, "learning_rate": 2.1061058741224518e-05, "loss": 1.1668, "num_input_tokens_seen": 5446041600, "step": 83100, "train_runtime": 40170.8312, "train_tokens_per_second": 135572.042 }, { "epoch": 0.832, "grad_norm": 0.6960340142250061, "learning_rate": 2.0818482717937596e-05, "loss": 1.163, "num_input_tokens_seen": 5452595200, "step": 83200, "train_runtime": 40225.6882, "train_tokens_per_second": 135550.079 }, { "epoch": 0.833, "grad_norm": 0.537268340587616, "learning_rate": 2.0577207551095552e-05, "loss": 1.1689, "num_input_tokens_seen": 5459148800, "step": 83300, "train_runtime": 40273.4785, "train_tokens_per_second": 135551.956 }, { "epoch": 0.834, "grad_norm": 0.564239501953125, "learning_rate": 2.0337235670336584e-05, "loss": 1.1662, "num_input_tokens_seen": 5465702400, "step": 83400, "train_runtime": 40320.9705, "train_tokens_per_second": 135554.832 }, { "epoch": 0.835, "grad_norm": 0.520041823387146, "learning_rate": 2.0098569492174887e-05, "loss": 1.1642, "num_input_tokens_seen": 5472256000, "step": 83500, "train_runtime": 40369.1468, "train_tokens_per_second": 135555.404 }, { "epoch": 0.836, "grad_norm": 0.616112232208252, "learning_rate": 1.9861211419976258e-05, "loss": 1.1671, "num_input_tokens_seen": 5478809600, "step": 83600, "train_runtime": 40416.0661, "train_tokens_per_second": 135560.19 }, { "epoch": 0.837, "grad_norm": 1.3083754777908325, "learning_rate": 1.962516384393377e-05, "loss": 1.1778, "num_input_tokens_seen": 5485363200, "step": 83700, "train_runtime": 40465.3405, "train_tokens_per_second": 135557.075 }, { "epoch": 0.838, "grad_norm": 0.5721991062164307, "learning_rate": 1.939042914104396e-05, "loss": 1.179, "num_input_tokens_seen": 5491916800, "step": 83800, "train_runtime": 40513.1256, "train_tokens_per_second": 135558.951 }, { "epoch": 0.839, "grad_norm": 0.8014708161354065, "learning_rate": 1.9157009675082702e-05, "loss": 1.1698, "num_input_tokens_seen": 5498470400, "step": 83900, "train_runtime": 40567.2253, "train_tokens_per_second": 135539.721 }, { "epoch": 0.84, "grad_norm": 0.7554424405097961, "learning_rate": 1.8924907796581363e-05, "loss": 1.1689, "num_input_tokens_seen": 5505024000, "step": 84000, "train_runtime": 40615.2949, "train_tokens_per_second": 135540.663 }, { "epoch": 0.841, "grad_norm": 0.6026338934898376, "learning_rate": 1.869412584280329e-05, "loss": 1.1727, "num_input_tokens_seen": 5511577600, "step": 84100, "train_runtime": 40664.1179, "train_tokens_per_second": 135539.091 }, { "epoch": 0.842, "grad_norm": 0.6569694876670837, "learning_rate": 1.8464666137720208e-05, "loss": 1.1717, "num_input_tokens_seen": 5518131200, "step": 84200, "train_runtime": 40713.1869, "train_tokens_per_second": 135536.705 }, { "epoch": 0.843, "grad_norm": 0.5886375904083252, "learning_rate": 1.823653099198884e-05, "loss": 1.1764, "num_input_tokens_seen": 5524684800, "step": 84300, "train_runtime": 40759.1071, "train_tokens_per_second": 135544.795 }, { "epoch": 0.844, "grad_norm": 0.6782867908477783, "learning_rate": 1.800972270292749e-05, "loss": 1.1637, "num_input_tokens_seen": 5531238400, "step": 84400, "train_runtime": 40811.77, "train_tokens_per_second": 135530.471 }, { "epoch": 0.845, "grad_norm": 0.6513829231262207, "learning_rate": 1.778424355449317e-05, "loss": 1.165, "num_input_tokens_seen": 5537792000, "step": 84500, "train_runtime": 40858.6857, "train_tokens_per_second": 135535.245 }, { "epoch": 0.846, "grad_norm": 0.6192531585693359, "learning_rate": 1.756009581725841e-05, "loss": 1.1589, "num_input_tokens_seen": 5544345600, "step": 84600, "train_runtime": 40906.0609, "train_tokens_per_second": 135538.487 }, { "epoch": 0.847, "grad_norm": 0.5640349388122559, "learning_rate": 1.7337281748388387e-05, "loss": 1.1653, "num_input_tokens_seen": 5550899200, "step": 84700, "train_runtime": 40952.689, "train_tokens_per_second": 135544.194 }, { "epoch": 0.848, "grad_norm": 0.5606239438056946, "learning_rate": 1.7115803591618312e-05, "loss": 1.1734, "num_input_tokens_seen": 5557452800, "step": 84800, "train_runtime": 41006.8402, "train_tokens_per_second": 135525.019 }, { "epoch": 0.849, "grad_norm": 0.5700273513793945, "learning_rate": 1.6895663577230816e-05, "loss": 1.1755, "num_input_tokens_seen": 5564006400, "step": 84900, "train_runtime": 41054.6511, "train_tokens_per_second": 135526.822 }, { "epoch": 0.85, "grad_norm": 0.7111489176750183, "learning_rate": 1.667686392203333e-05, "loss": 1.1673, "num_input_tokens_seen": 5570560000, "step": 85000, "train_runtime": 41102.7763, "train_tokens_per_second": 135527.585 }, { "epoch": 0.851, "grad_norm": 0.5908454060554504, "learning_rate": 1.6459406829335996e-05, "loss": 1.1767, "num_input_tokens_seen": 5577113600, "step": 85100, "train_runtime": 41150.3215, "train_tokens_per_second": 135530.256 }, { "epoch": 0.852, "grad_norm": 0.6215232610702515, "learning_rate": 1.624329448892932e-05, "loss": 1.171, "num_input_tokens_seen": 5583667200, "step": 85200, "train_runtime": 41205.5284, "train_tokens_per_second": 135507.72 }, { "epoch": 0.853, "grad_norm": 0.6203814744949341, "learning_rate": 1.6028529077062163e-05, "loss": 1.1591, "num_input_tokens_seen": 5590220800, "step": 85300, "train_runtime": 41253.0291, "train_tokens_per_second": 135510.553 }, { "epoch": 0.854, "grad_norm": 0.5267207026481628, "learning_rate": 1.5815112756419805e-05, "loss": 1.185, "num_input_tokens_seen": 5596774400, "step": 85400, "train_runtime": 41301.2794, "train_tokens_per_second": 135510.921 }, { "epoch": 0.855, "grad_norm": 0.5815737843513489, "learning_rate": 1.5603047676102313e-05, "loss": 1.173, "num_input_tokens_seen": 5603328000, "step": 85500, "train_runtime": 41349.0127, "train_tokens_per_second": 135512.982 }, { "epoch": 0.856, "grad_norm": 0.6342357397079468, "learning_rate": 1.5392335971602638e-05, "loss": 1.1568, "num_input_tokens_seen": 5609881600, "step": 85600, "train_runtime": 41397.1556, "train_tokens_per_second": 135513.697 }, { "epoch": 0.857, "grad_norm": 0.6623713970184326, "learning_rate": 1.5182979764785258e-05, "loss": 1.1649, "num_input_tokens_seen": 5616435200, "step": 85700, "train_runtime": 41450.3243, "train_tokens_per_second": 135497.98 }, { "epoch": 0.858, "grad_norm": 0.6217081546783447, "learning_rate": 1.4974981163864896e-05, "loss": 1.1772, "num_input_tokens_seen": 5622988800, "step": 85800, "train_runtime": 41497.5379, "train_tokens_per_second": 135501.745 }, { "epoch": 0.859, "grad_norm": 0.6180946826934814, "learning_rate": 1.4768342263385192e-05, "loss": 1.1601, "num_input_tokens_seen": 5629542400, "step": 85900, "train_runtime": 41546.6611, "train_tokens_per_second": 135499.274 }, { "epoch": 0.86, "grad_norm": 0.5609486103057861, "learning_rate": 1.4563065144197517e-05, "loss": 1.1866, "num_input_tokens_seen": 5636096000, "step": 86000, "train_runtime": 41594.2678, "train_tokens_per_second": 135501.748 }, { "epoch": 0.861, "grad_norm": 0.5352550148963928, "learning_rate": 1.4359151873440216e-05, "loss": 1.1732, "num_input_tokens_seen": 5642649600, "step": 86100, "train_runtime": 41640.8053, "train_tokens_per_second": 135507.696 }, { "epoch": 0.862, "grad_norm": 0.5788577198982239, "learning_rate": 1.415660450451767e-05, "loss": 1.1785, "num_input_tokens_seen": 5649203200, "step": 86200, "train_runtime": 41695.0254, "train_tokens_per_second": 135488.662 }, { "epoch": 0.863, "grad_norm": 0.5672028064727783, "learning_rate": 1.3955425077079595e-05, "loss": 1.1692, "num_input_tokens_seen": 5655756800, "step": 86300, "train_runtime": 41742.7936, "train_tokens_per_second": 135490.615 }, { "epoch": 0.864, "grad_norm": 0.577563464641571, "learning_rate": 1.375561561700061e-05, "loss": 1.1662, "num_input_tokens_seen": 5662310400, "step": 86400, "train_runtime": 41789.652, "train_tokens_per_second": 135495.515 }, { "epoch": 0.865, "grad_norm": 0.544994592666626, "learning_rate": 1.3557178136359798e-05, "loss": 1.1665, "num_input_tokens_seen": 5668864000, "step": 86500, "train_runtime": 41842.8709, "train_tokens_per_second": 135479.805 }, { "epoch": 0.866, "grad_norm": 0.5978608727455139, "learning_rate": 1.3360114633420333e-05, "loss": 1.1644, "num_input_tokens_seen": 5675417600, "step": 86600, "train_runtime": 41891.5143, "train_tokens_per_second": 135478.932 }, { "epoch": 0.867, "grad_norm": 0.6005887985229492, "learning_rate": 1.3164427092609503e-05, "loss": 1.1742, "num_input_tokens_seen": 5681971200, "step": 86700, "train_runtime": 41939.4895, "train_tokens_per_second": 135480.218 }, { "epoch": 0.868, "grad_norm": 0.5312247276306152, "learning_rate": 1.2970117484498732e-05, "loss": 1.1575, "num_input_tokens_seen": 5688524800, "step": 86800, "train_runtime": 41987.1811, "train_tokens_per_second": 135482.418 }, { "epoch": 0.869, "grad_norm": 0.9317598342895508, "learning_rate": 1.2777187765783558e-05, "loss": 1.1668, "num_input_tokens_seen": 5695078400, "step": 86900, "train_runtime": 42034.5611, "train_tokens_per_second": 135485.616 }, { "epoch": 0.87, "grad_norm": 0.5501394271850586, "learning_rate": 1.2585639879264103e-05, "loss": 1.1741, "num_input_tokens_seen": 5701632000, "step": 87000, "train_runtime": 42082.1201, "train_tokens_per_second": 135488.231 }, { "epoch": 0.871, "grad_norm": 0.6144236326217651, "learning_rate": 1.2395475753825518e-05, "loss": 1.1665, "num_input_tokens_seen": 5708185600, "step": 87100, "train_runtime": 42136.7086, "train_tokens_per_second": 135468.236 }, { "epoch": 0.872, "grad_norm": 0.6324082612991333, "learning_rate": 1.2206697304418367e-05, "loss": 1.1523, "num_input_tokens_seen": 5714739200, "step": 87200, "train_runtime": 42184.2095, "train_tokens_per_second": 135471.051 }, { "epoch": 0.873, "grad_norm": 0.6486518979072571, "learning_rate": 1.2019306432039594e-05, "loss": 1.1872, "num_input_tokens_seen": 5721292800, "step": 87300, "train_runtime": 42230.9222, "train_tokens_per_second": 135476.388 }, { "epoch": 0.874, "grad_norm": 0.5755148530006409, "learning_rate": 1.1833305023713153e-05, "loss": 1.1963, "num_input_tokens_seen": 5727846400, "step": 87400, "train_runtime": 42278.9901, "train_tokens_per_second": 135477.37 }, { "epoch": 0.875, "grad_norm": 0.6408706307411194, "learning_rate": 1.1648694952471205e-05, "loss": 1.163, "num_input_tokens_seen": 5734400000, "step": 87500, "train_runtime": 42326.8376, "train_tokens_per_second": 135479.056 }, { "epoch": 0.876, "grad_norm": 0.6233325600624084, "learning_rate": 1.1465478077335088e-05, "loss": 1.1591, "num_input_tokens_seen": 5740953600, "step": 87600, "train_runtime": 42379.4952, "train_tokens_per_second": 135465.36 }, { "epoch": 0.877, "grad_norm": 0.8282228708267212, "learning_rate": 1.1283656243296695e-05, "loss": 1.1799, "num_input_tokens_seen": 5747507200, "step": 87700, "train_runtime": 42427.8149, "train_tokens_per_second": 135465.548 }, { "epoch": 0.878, "grad_norm": 0.7755045294761658, "learning_rate": 1.1103231281299923e-05, "loss": 1.1565, "num_input_tokens_seen": 5754060800, "step": 87800, "train_runtime": 42474.6192, "train_tokens_per_second": 135470.568 }, { "epoch": 0.879, "grad_norm": 0.6230588555335999, "learning_rate": 1.0924205008222086e-05, "loss": 1.1673, "num_input_tokens_seen": 5760614400, "step": 87900, "train_runtime": 42522.6205, "train_tokens_per_second": 135471.764 }, { "epoch": 0.88, "grad_norm": 0.5966441035270691, "learning_rate": 1.0746579226855768e-05, "loss": 1.1628, "num_input_tokens_seen": 5767168000, "step": 88000, "train_runtime": 42576.1454, "train_tokens_per_second": 135455.381 }, { "epoch": 0.881, "grad_norm": 0.6604552865028381, "learning_rate": 1.0570355725890678e-05, "loss": 1.1769, "num_input_tokens_seen": 5773721600, "step": 88100, "train_runtime": 42624.8502, "train_tokens_per_second": 135454.355 }, { "epoch": 0.882, "grad_norm": 0.5727500319480896, "learning_rate": 1.0395536279895428e-05, "loss": 1.1571, "num_input_tokens_seen": 5780275200, "step": 88200, "train_runtime": 42673.6883, "train_tokens_per_second": 135452.909 }, { "epoch": 0.883, "grad_norm": 0.5748215317726135, "learning_rate": 1.0222122649299952e-05, "loss": 1.1666, "num_input_tokens_seen": 5786828800, "step": 88300, "train_runtime": 42720.0242, "train_tokens_per_second": 135459.399 }, { "epoch": 0.884, "grad_norm": 0.6671021580696106, "learning_rate": 1.0050116580377593e-05, "loss": 1.1887, "num_input_tokens_seen": 5793382400, "step": 88400, "train_runtime": 42766.9841, "train_tokens_per_second": 135463.899 }, { "epoch": 0.885, "grad_norm": 0.7352688908576965, "learning_rate": 9.879519805227515e-06, "loss": 1.173, "num_input_tokens_seen": 5799936000, "step": 88500, "train_runtime": 42820.4689, "train_tokens_per_second": 135447.746 }, { "epoch": 0.886, "grad_norm": 0.5779001712799072, "learning_rate": 9.710334041757351e-06, "loss": 1.1612, "num_input_tokens_seen": 5806489600, "step": 88600, "train_runtime": 42866.8877, "train_tokens_per_second": 135453.958 }, { "epoch": 0.887, "grad_norm": 0.7246189713478088, "learning_rate": 9.542560993665932e-06, "loss": 1.1926, "num_input_tokens_seen": 5813043200, "step": 88700, "train_runtime": 42915.9912, "train_tokens_per_second": 135451.682 }, { "epoch": 0.888, "grad_norm": 0.5459685921669006, "learning_rate": 9.376202350425888e-06, "loss": 1.1698, "num_input_tokens_seen": 5819596800, "step": 88800, "train_runtime": 42964.4051, "train_tokens_per_second": 135451.586 }, { "epoch": 0.889, "grad_norm": 0.5574699640274048, "learning_rate": 9.211259787266972e-06, "loss": 1.1627, "num_input_tokens_seen": 5826150400, "step": 88900, "train_runtime": 43011.9797, "train_tokens_per_second": 135454.133 }, { "epoch": 0.89, "grad_norm": 0.5637386441230774, "learning_rate": 9.047734965158966e-06, "loss": 1.1659, "num_input_tokens_seen": 5832704000, "step": 89000, "train_runtime": 43065.5789, "train_tokens_per_second": 135437.724 }, { "epoch": 0.891, "grad_norm": 0.5420241951942444, "learning_rate": 8.885629530794997e-06, "loss": 1.1693, "num_input_tokens_seen": 5839257600, "step": 89100, "train_runtime": 43113.8932, "train_tokens_per_second": 135437.957 }, { "epoch": 0.892, "grad_norm": 0.5701260566711426, "learning_rate": 8.724945116574983e-06, "loss": 1.1592, "num_input_tokens_seen": 5845811200, "step": 89200, "train_runtime": 43161.415, "train_tokens_per_second": 135440.675 }, { "epoch": 0.893, "grad_norm": 0.5882892608642578, "learning_rate": 8.565683340589185e-06, "loss": 1.1601, "num_input_tokens_seen": 5852364800, "step": 89300, "train_runtime": 43209.5307, "train_tokens_per_second": 135441.527 }, { "epoch": 0.894, "grad_norm": 0.5708109736442566, "learning_rate": 8.40784580660196e-06, "loss": 1.1684, "num_input_tokens_seen": 5858918400, "step": 89400, "train_runtime": 43257.3597, "train_tokens_per_second": 135443.273 }, { "epoch": 0.895, "grad_norm": 0.5796698927879333, "learning_rate": 8.251434104035465e-06, "loss": 1.1753, "num_input_tokens_seen": 5865472000, "step": 89500, "train_runtime": 43305.3116, "train_tokens_per_second": 135444.632 }, { "epoch": 0.896, "grad_norm": 0.9602819681167603, "learning_rate": 8.09644980795383e-06, "loss": 1.1672, "num_input_tokens_seen": 5872025600, "step": 89600, "train_runtime": 43360.788, "train_tokens_per_second": 135422.484 }, { "epoch": 0.897, "grad_norm": 0.6962534189224243, "learning_rate": 7.942894479047252e-06, "loss": 1.1622, "num_input_tokens_seen": 5878579200, "step": 89700, "train_runtime": 43407.8503, "train_tokens_per_second": 135426.637 }, { "epoch": 0.898, "grad_norm": 0.6292552351951599, "learning_rate": 7.790769663616098e-06, "loss": 1.1632, "num_input_tokens_seen": 5885132800, "step": 89800, "train_runtime": 43455.9389, "train_tokens_per_second": 135427.584 }, { "epoch": 0.899, "grad_norm": 0.5883670449256897, "learning_rate": 7.64007689355563e-06, "loss": 1.1632, "num_input_tokens_seen": 5891686400, "step": 89900, "train_runtime": 43504.2315, "train_tokens_per_second": 135427.893 }, { "epoch": 0.9, "grad_norm": 0.8059070706367493, "learning_rate": 7.490817686340361e-06, "loss": 1.1728, "num_input_tokens_seen": 5898240000, "step": 90000, "train_runtime": 43552.1457, "train_tokens_per_second": 135429.378 }, { "epoch": 0.901, "grad_norm": 0.5949374437332153, "learning_rate": 7.342993545008818e-06, "loss": 1.1732, "num_input_tokens_seen": 5904793600, "step": 90100, "train_runtime": 43599.6931, "train_tokens_per_second": 135431.999 }, { "epoch": 0.902, "grad_norm": 0.6094557642936707, "learning_rate": 7.196605958148505e-06, "loss": 1.1713, "num_input_tokens_seen": 5911347200, "step": 90200, "train_runtime": 43653.2541, "train_tokens_per_second": 135415.957 }, { "epoch": 0.903, "grad_norm": 0.6275845170021057, "learning_rate": 7.051656399880778e-06, "loss": 1.1743, "num_input_tokens_seen": 5917900800, "step": 90300, "train_runtime": 43702.1275, "train_tokens_per_second": 135414.478 }, { "epoch": 0.904, "grad_norm": 0.7113337516784668, "learning_rate": 6.9081463298460815e-06, "loss": 1.162, "num_input_tokens_seen": 5924454400, "step": 90400, "train_runtime": 43749.6704, "train_tokens_per_second": 135417.121 }, { "epoch": 0.905, "grad_norm": 0.6237180233001709, "learning_rate": 6.766077193189201e-06, "loss": 1.159, "num_input_tokens_seen": 5931008000, "step": 90500, "train_runtime": 43797.6522, "train_tokens_per_second": 135418.4 }, { "epoch": 0.906, "grad_norm": 0.9803968667984009, "learning_rate": 6.625450420544831e-06, "loss": 1.1788, "num_input_tokens_seen": 5937561600, "step": 90600, "train_runtime": 43846.1111, "train_tokens_per_second": 135418.203 }, { "epoch": 0.907, "grad_norm": 0.5648267269134521, "learning_rate": 6.486267428022967e-06, "loss": 1.1581, "num_input_tokens_seen": 5944115200, "step": 90700, "train_runtime": 43893.4216, "train_tokens_per_second": 135421.55 }, { "epoch": 0.908, "grad_norm": 0.610898494720459, "learning_rate": 6.34852961719477e-06, "loss": 1.1557, "num_input_tokens_seen": 5950668800, "step": 90800, "train_runtime": 43947.4481, "train_tokens_per_second": 135404.194 }, { "epoch": 0.909, "grad_norm": 0.732876718044281, "learning_rate": 6.212238375078521e-06, "loss": 1.1683, "num_input_tokens_seen": 5957222400, "step": 90900, "train_runtime": 43996.4271, "train_tokens_per_second": 135402.413 }, { "epoch": 0.91, "grad_norm": 0.5793011784553528, "learning_rate": 6.077395074125491e-06, "loss": 1.1747, "num_input_tokens_seen": 5963776000, "step": 91000, "train_runtime": 44044.5112, "train_tokens_per_second": 135403.387 }, { "epoch": 0.911, "grad_norm": 0.6567527651786804, "learning_rate": 5.944001072206212e-06, "loss": 1.1594, "num_input_tokens_seen": 5970329600, "step": 91100, "train_runtime": 44091.43, "train_tokens_per_second": 135407.938 }, { "epoch": 0.912, "grad_norm": 0.6197203397750854, "learning_rate": 5.812057712596807e-06, "loss": 1.1504, "num_input_tokens_seen": 5976883200, "step": 91200, "train_runtime": 44140.2623, "train_tokens_per_second": 135406.608 }, { "epoch": 0.913, "grad_norm": 0.6190736889839172, "learning_rate": 5.681566323965486e-06, "loss": 1.1645, "num_input_tokens_seen": 5983436800, "step": 91300, "train_runtime": 44194.3429, "train_tokens_per_second": 135389.202 }, { "epoch": 0.914, "grad_norm": 0.5632036924362183, "learning_rate": 5.552528220359004e-06, "loss": 1.1691, "num_input_tokens_seen": 5989990400, "step": 91400, "train_runtime": 44242.165, "train_tokens_per_second": 135390.987 }, { "epoch": 0.915, "grad_norm": 0.6650084257125854, "learning_rate": 5.424944701189704e-06, "loss": 1.1587, "num_input_tokens_seen": 5996544000, "step": 91500, "train_runtime": 44290.3253, "train_tokens_per_second": 135391.735 }, { "epoch": 0.916, "grad_norm": 0.6665343642234802, "learning_rate": 5.298817051222182e-06, "loss": 1.16, "num_input_tokens_seen": 6003097600, "step": 91600, "train_runtime": 44344.1461, "train_tokens_per_second": 135375.199 }, { "epoch": 0.917, "grad_norm": 0.9934324026107788, "learning_rate": 5.174146540560442e-06, "loss": 1.186, "num_input_tokens_seen": 6009651200, "step": 91700, "train_runtime": 44386.6411, "train_tokens_per_second": 135393.241 }, { "epoch": 0.918, "grad_norm": 0.587840735912323, "learning_rate": 5.050934424635195e-06, "loss": 1.1685, "num_input_tokens_seen": 6016204800, "step": 91800, "train_runtime": 44440.2445, "train_tokens_per_second": 135377.401 }, { "epoch": 0.919, "grad_norm": 0.6308780312538147, "learning_rate": 4.9291819441910465e-06, "loss": 1.1593, "num_input_tokens_seen": 6022758400, "step": 91900, "train_runtime": 44487.4748, "train_tokens_per_second": 135380.99 }, { "epoch": 0.92, "grad_norm": 0.6875436305999756, "learning_rate": 4.808890325274129e-06, "loss": 1.1686, "num_input_tokens_seen": 6029312000, "step": 92000, "train_runtime": 44535.4396, "train_tokens_per_second": 135382.339 }, { "epoch": 0.921, "grad_norm": 0.6450539231300354, "learning_rate": 4.690060779219723e-06, "loss": 1.1669, "num_input_tokens_seen": 6035865600, "step": 92100, "train_runtime": 44583.0204, "train_tokens_per_second": 135384.852 }, { "epoch": 0.922, "grad_norm": 1.0118526220321655, "learning_rate": 4.572694502640023e-06, "loss": 1.1601, "num_input_tokens_seen": 6042419200, "step": 92200, "train_runtime": 44632.4327, "train_tokens_per_second": 135381.803 }, { "epoch": 0.923, "grad_norm": 0.5630050897598267, "learning_rate": 4.456792677412141e-06, "loss": 1.164, "num_input_tokens_seen": 6048972800, "step": 92300, "train_runtime": 44685.5287, "train_tokens_per_second": 135367.6 }, { "epoch": 0.924, "grad_norm": 0.5819036364555359, "learning_rate": 4.342356470666153e-06, "loss": 1.177, "num_input_tokens_seen": 6055526400, "step": 92400, "train_runtime": 44733.1102, "train_tokens_per_second": 135370.118 }, { "epoch": 0.925, "grad_norm": 0.5852016806602478, "learning_rate": 4.22938703477344e-06, "loss": 1.1846, "num_input_tokens_seen": 6062080000, "step": 92500, "train_runtime": 44781.2518, "train_tokens_per_second": 135370.937 }, { "epoch": 0.926, "grad_norm": 0.7466326355934143, "learning_rate": 4.117885507334884e-06, "loss": 1.1564, "num_input_tokens_seen": 6068633600, "step": 92600, "train_runtime": 44829.0669, "train_tokens_per_second": 135372.739 }, { "epoch": 0.927, "grad_norm": 0.7777779698371887, "learning_rate": 4.007853011169687e-06, "loss": 1.1654, "num_input_tokens_seen": 6075187200, "step": 92700, "train_runtime": 44882.4041, "train_tokens_per_second": 135357.883 }, { "epoch": 0.928, "grad_norm": 0.9159000515937805, "learning_rate": 3.899290654303855e-06, "loss": 1.1854, "num_input_tokens_seen": 6081740800, "step": 92800, "train_runtime": 44929.6625, "train_tokens_per_second": 135361.373 }, { "epoch": 0.929, "grad_norm": 0.5948230028152466, "learning_rate": 3.7921995299591168e-06, "loss": 1.1602, "num_input_tokens_seen": 6088294400, "step": 92900, "train_runtime": 44977.4717, "train_tokens_per_second": 135363.198 }, { "epoch": 0.93, "grad_norm": 0.5999124646186829, "learning_rate": 3.686580716541887e-06, "loss": 1.1484, "num_input_tokens_seen": 6094848000, "step": 93000, "train_runtime": 45026.2424, "train_tokens_per_second": 135362.128 }, { "epoch": 0.931, "grad_norm": 0.6015925407409668, "learning_rate": 3.582435277632456e-06, "loss": 1.1638, "num_input_tokens_seen": 6101401600, "step": 93100, "train_runtime": 45073.6825, "train_tokens_per_second": 135365.057 }, { "epoch": 0.932, "grad_norm": 0.5493288040161133, "learning_rate": 3.479764261974266e-06, "loss": 1.1644, "num_input_tokens_seen": 6107955200, "step": 93200, "train_runtime": 45131.734, "train_tokens_per_second": 135336.152 }, { "epoch": 0.933, "grad_norm": 0.5847836136817932, "learning_rate": 3.3785687034632523e-06, "loss": 1.1528, "num_input_tokens_seen": 6114508800, "step": 93300, "train_runtime": 45180.4411, "train_tokens_per_second": 135335.305 }, { "epoch": 0.934, "grad_norm": 0.6086737513542175, "learning_rate": 3.2788496211376024e-06, "loss": 1.1525, "num_input_tokens_seen": 6121062400, "step": 93400, "train_runtime": 45228.3556, "train_tokens_per_second": 135336.833 }, { "epoch": 0.935, "grad_norm": 0.6097891330718994, "learning_rate": 3.180608019167363e-06, "loss": 1.1681, "num_input_tokens_seen": 6127616000, "step": 93500, "train_runtime": 45275.6501, "train_tokens_per_second": 135340.21 }, { "epoch": 0.936, "grad_norm": 0.5980057716369629, "learning_rate": 3.0838448868443665e-06, "loss": 1.1603, "num_input_tokens_seen": 6134169600, "step": 93600, "train_runtime": 45322.6488, "train_tokens_per_second": 135344.464 }, { "epoch": 0.937, "grad_norm": 0.7306444048881531, "learning_rate": 2.988561198572287e-06, "loss": 1.1702, "num_input_tokens_seen": 6140723200, "step": 93700, "train_runtime": 45376.9708, "train_tokens_per_second": 135326.865 }, { "epoch": 0.938, "grad_norm": 0.9187434911727905, "learning_rate": 2.8947579138567987e-06, "loss": 1.1654, "num_input_tokens_seen": 6147276800, "step": 93800, "train_runtime": 45427.1088, "train_tokens_per_second": 135321.771 }, { "epoch": 0.939, "grad_norm": 0.6403319835662842, "learning_rate": 2.8024359772959525e-06, "loss": 1.1581, "num_input_tokens_seen": 6153830400, "step": 93900, "train_runtime": 45475.34, "train_tokens_per_second": 135322.362 }, { "epoch": 0.94, "grad_norm": 0.7088416218757629, "learning_rate": 2.711596318570597e-06, "loss": 1.1683, "num_input_tokens_seen": 6160384000, "step": 94000, "train_runtime": 45523.8789, "train_tokens_per_second": 135322.037 }, { "epoch": 0.941, "grad_norm": 0.6289553642272949, "learning_rate": 2.6222398524351206e-06, "loss": 1.1538, "num_input_tokens_seen": 6166937600, "step": 94100, "train_runtime": 45571.6907, "train_tokens_per_second": 135323.871 }, { "epoch": 0.942, "grad_norm": 0.8788822889328003, "learning_rate": 2.5343674787081435e-06, "loss": 1.1666, "num_input_tokens_seen": 6173491200, "step": 94200, "train_runtime": 45621.3271, "train_tokens_per_second": 135320.29 }, { "epoch": 0.943, "grad_norm": 0.575515866279602, "learning_rate": 2.4479800822634565e-06, "loss": 1.1685, "num_input_tokens_seen": 6180044800, "step": 94300, "train_runtime": 45670.6842, "train_tokens_per_second": 135317.543 }, { "epoch": 0.944, "grad_norm": 0.5740439891815186, "learning_rate": 2.3630785330212286e-06, "loss": 1.1588, "num_input_tokens_seen": 6186598400, "step": 94400, "train_runtime": 45717.875, "train_tokens_per_second": 135321.215 }, { "epoch": 0.945, "grad_norm": 0.6576538681983948, "learning_rate": 2.2796636859390815e-06, "loss": 1.1492, "num_input_tokens_seen": 6193152000, "step": 94500, "train_runtime": 45766.0209, "train_tokens_per_second": 135322.055 }, { "epoch": 0.946, "grad_norm": 0.5781713128089905, "learning_rate": 2.197736381003612e-06, "loss": 1.1725, "num_input_tokens_seen": 6199705600, "step": 94600, "train_runtime": 45819.6687, "train_tokens_per_second": 135306.644 }, { "epoch": 0.947, "grad_norm": 0.6812490820884705, "learning_rate": 2.1172974432218826e-06, "loss": 1.1509, "num_input_tokens_seen": 6206259200, "step": 94700, "train_runtime": 45866.8187, "train_tokens_per_second": 135310.435 }, { "epoch": 0.948, "grad_norm": 0.8884466886520386, "learning_rate": 2.0383476826130786e-06, "loss": 1.157, "num_input_tokens_seen": 6212812800, "step": 94800, "train_runtime": 45915.7744, "train_tokens_per_second": 135308.897 }, { "epoch": 0.949, "grad_norm": 0.6096293926239014, "learning_rate": 1.96088789420043e-06, "loss": 1.1609, "num_input_tokens_seen": 6219366400, "step": 94900, "train_runtime": 45963.3824, "train_tokens_per_second": 135311.33 }, { "epoch": 0.95, "grad_norm": 0.5762118697166443, "learning_rate": 1.8849188580031539e-06, "loss": 1.1621, "num_input_tokens_seen": 6225920000, "step": 95000, "train_runtime": 46012.4538, "train_tokens_per_second": 135309.454 }, { "epoch": 0.951, "grad_norm": 0.5296618938446045, "learning_rate": 1.8104413390286066e-06, "loss": 1.157, "num_input_tokens_seen": 6232473600, "step": 95100, "train_runtime": 46059.2761, "train_tokens_per_second": 135314.189 }, { "epoch": 0.952, "grad_norm": 0.6025533676147461, "learning_rate": 1.7374560872645438e-06, "loss": 1.1507, "num_input_tokens_seen": 6239027200, "step": 95200, "train_runtime": 46113.68, "train_tokens_per_second": 135296.667 }, { "epoch": 0.953, "grad_norm": 0.616148829460144, "learning_rate": 1.6659638376716578e-06, "loss": 1.1711, "num_input_tokens_seen": 6245580800, "step": 95300, "train_runtime": 46162.0494, "train_tokens_per_second": 135296.87 }, { "epoch": 0.954, "grad_norm": 0.6661262512207031, "learning_rate": 1.5959653101761172e-06, "loss": 1.1604, "num_input_tokens_seen": 6252134400, "step": 95400, "train_runtime": 46208.848, "train_tokens_per_second": 135301.672 }, { "epoch": 0.955, "grad_norm": 0.8173303604125977, "learning_rate": 1.5274612096623063e-06, "loss": 1.1498, "num_input_tokens_seen": 6258688000, "step": 95500, "train_runtime": 46256.5159, "train_tokens_per_second": 135303.922 }, { "epoch": 0.956, "grad_norm": 0.6189817786216736, "learning_rate": 1.4604522259657635e-06, "loss": 1.1602, "num_input_tokens_seen": 6265241600, "step": 95600, "train_runtime": 46309.4141, "train_tokens_per_second": 135290.885 }, { "epoch": 0.957, "grad_norm": 0.7523248195648193, "learning_rate": 1.3949390338662047e-06, "loss": 1.1655, "num_input_tokens_seen": 6271795200, "step": 95700, "train_runtime": 46357.4405, "train_tokens_per_second": 135292.094 }, { "epoch": 0.958, "grad_norm": 0.5935103297233582, "learning_rate": 1.330922293080744e-06, "loss": 1.1702, "num_input_tokens_seen": 6278348800, "step": 95800, "train_runtime": 46406.0604, "train_tokens_per_second": 135291.571 }, { "epoch": 0.959, "grad_norm": 0.8042653203010559, "learning_rate": 1.2684026482572662e-06, "loss": 1.1623, "num_input_tokens_seen": 6284902400, "step": 95900, "train_runtime": 46454.8491, "train_tokens_per_second": 135290.557 }, { "epoch": 0.96, "grad_norm": 0.5935735106468201, "learning_rate": 1.2073807289678993e-06, "loss": 1.1441, "num_input_tokens_seen": 6291456000, "step": 96000, "train_runtime": 46502.688, "train_tokens_per_second": 135292.308 }, { "epoch": 0.961, "grad_norm": 0.5718377828598022, "learning_rate": 1.147857149702669e-06, "loss": 1.1618, "num_input_tokens_seen": 6298009600, "step": 96100, "train_runtime": 46555.2337, "train_tokens_per_second": 135280.378 }, { "epoch": 0.962, "grad_norm": 0.6801995635032654, "learning_rate": 1.0898325098633697e-06, "loss": 1.1479, "num_input_tokens_seen": 6304563200, "step": 96200, "train_runtime": 46603.2751, "train_tokens_per_second": 135281.548 }, { "epoch": 0.963, "grad_norm": 0.5564619898796082, "learning_rate": 1.0333073937575043e-06, "loss": 1.1582, "num_input_tokens_seen": 6311116800, "step": 96300, "train_runtime": 46652.5681, "train_tokens_per_second": 135279.087 }, { "epoch": 0.964, "grad_norm": 0.6501321792602539, "learning_rate": 9.782823705923204e-07, "loss": 1.1617, "num_input_tokens_seen": 6317670400, "step": 96400, "train_runtime": 46700.1727, "train_tokens_per_second": 135281.521 }, { "epoch": 0.965, "grad_norm": 0.6728459596633911, "learning_rate": 9.247579944692162e-07, "loss": 1.1592, "num_input_tokens_seen": 6324224000, "step": 96500, "train_runtime": 46748.7553, "train_tokens_per_second": 135281.12 }, { "epoch": 0.966, "grad_norm": 0.5893784761428833, "learning_rate": 8.72734804378078e-07, "loss": 1.1691, "num_input_tokens_seen": 6330777600, "step": 96600, "train_runtime": 46801.015, "train_tokens_per_second": 135270.092 }, { "epoch": 0.967, "grad_norm": 0.8625339269638062, "learning_rate": 8.222133241918172e-07, "loss": 1.1518, "num_input_tokens_seen": 6337331200, "step": 96700, "train_runtime": 46847.2237, "train_tokens_per_second": 135276.559 }, { "epoch": 0.968, "grad_norm": 0.6501858830451965, "learning_rate": 7.731940626612088e-07, "loss": 1.1693, "num_input_tokens_seen": 6343884800, "step": 96800, "train_runtime": 46895.3712, "train_tokens_per_second": 135277.419 }, { "epoch": 0.969, "grad_norm": 0.6575475335121155, "learning_rate": 7.256775134096615e-07, "loss": 1.1552, "num_input_tokens_seen": 6350438400, "step": 96900, "train_runtime": 46942.8491, "train_tokens_per_second": 135280.208 }, { "epoch": 0.97, "grad_norm": 0.5287050604820251, "learning_rate": 6.796641549283055e-07, "loss": 1.1946, "num_input_tokens_seen": 6356992000, "step": 97000, "train_runtime": 46991.8919, "train_tokens_per_second": 135278.486 }, { "epoch": 0.971, "grad_norm": 0.568566083908081, "learning_rate": 6.351544505711292e-07, "loss": 1.1559, "num_input_tokens_seen": 6363545600, "step": 97100, "train_runtime": 47040.0316, "train_tokens_per_second": 135279.365 }, { "epoch": 0.972, "grad_norm": 0.9329395890235901, "learning_rate": 5.921488485503833e-07, "loss": 1.1603, "num_input_tokens_seen": 6370099200, "step": 97200, "train_runtime": 47092.2725, "train_tokens_per_second": 135268.46 }, { "epoch": 0.973, "grad_norm": 0.6256415843963623, "learning_rate": 5.506477819319843e-07, "loss": 1.1571, "num_input_tokens_seen": 6376652800, "step": 97300, "train_runtime": 47139.4068, "train_tokens_per_second": 135272.233 }, { "epoch": 0.974, "grad_norm": 0.7202081680297852, "learning_rate": 5.106516686312345e-07, "loss": 1.1638, "num_input_tokens_seen": 6383206400, "step": 97400, "train_runtime": 47191.9059, "train_tokens_per_second": 135260.619 }, { "epoch": 0.975, "grad_norm": 1.2700363397598267, "learning_rate": 4.721609114085256e-07, "loss": 1.1649, "num_input_tokens_seen": 6389760000, "step": 97500, "train_runtime": 47240.0777, "train_tokens_per_second": 135261.42 }, { "epoch": 0.976, "grad_norm": 0.5555500388145447, "learning_rate": 4.3517589786539186e-07, "loss": 1.1505, "num_input_tokens_seen": 6396313600, "step": 97600, "train_runtime": 47287.972, "train_tokens_per_second": 135263.013 }, { "epoch": 0.977, "grad_norm": 0.6499391198158264, "learning_rate": 3.996970004404798e-07, "loss": 1.153, "num_input_tokens_seen": 6402867200, "step": 97700, "train_runtime": 47335.8726, "train_tokens_per_second": 135264.586 }, { "epoch": 0.978, "grad_norm": 0.6353591084480286, "learning_rate": 3.657245764058847e-07, "loss": 1.1621, "num_input_tokens_seen": 6409420800, "step": 97800, "train_runtime": 47382.5196, "train_tokens_per_second": 135269.733 }, { "epoch": 0.979, "grad_norm": 0.62052321434021, "learning_rate": 3.3325896786355334e-07, "loss": 1.1539, "num_input_tokens_seen": 6415974400, "step": 97900, "train_runtime": 47435.6023, "train_tokens_per_second": 135256.518 }, { "epoch": 0.98, "grad_norm": 0.5979087352752686, "learning_rate": 3.023005017418201e-07, "loss": 1.1615, "num_input_tokens_seen": 6422528000, "step": 98000, "train_runtime": 47484.0018, "train_tokens_per_second": 135256.671 }, { "epoch": 0.981, "grad_norm": 1.0899096727371216, "learning_rate": 2.7284948979205967e-07, "loss": 1.166, "num_input_tokens_seen": 6429081600, "step": 98100, "train_runtime": 47531.611, "train_tokens_per_second": 135259.072 }, { "epoch": 0.982, "grad_norm": 0.6240010857582092, "learning_rate": 2.449062285856729e-07, "loss": 1.1565, "num_input_tokens_seen": 6435635200, "step": 98200, "train_runtime": 47578.8884, "train_tokens_per_second": 135262.412 }, { "epoch": 0.983, "grad_norm": 0.7941544651985168, "learning_rate": 2.184709995109557e-07, "loss": 1.1572, "num_input_tokens_seen": 6442188800, "step": 98300, "train_runtime": 47627.3828, "train_tokens_per_second": 135262.289 }, { "epoch": 0.984, "grad_norm": 0.5704551339149475, "learning_rate": 1.9354406877038487e-07, "loss": 1.1629, "num_input_tokens_seen": 6448742400, "step": 98400, "train_runtime": 47679.6586, "train_tokens_per_second": 135251.438 }, { "epoch": 0.985, "grad_norm": 0.5758212208747864, "learning_rate": 1.7012568737788668e-07, "loss": 1.1892, "num_input_tokens_seen": 6455296000, "step": 98500, "train_runtime": 47728.7818, "train_tokens_per_second": 135249.545 }, { "epoch": 0.986, "grad_norm": 0.5768951773643494, "learning_rate": 1.4821609115630574e-07, "loss": 1.1617, "num_input_tokens_seen": 6461849600, "step": 98600, "train_runtime": 47775.3275, "train_tokens_per_second": 135254.952 }, { "epoch": 0.987, "grad_norm": 0.5714033842086792, "learning_rate": 1.278155007350068e-07, "loss": 1.1712, "num_input_tokens_seen": 6468403200, "step": 98700, "train_runtime": 47823.1467, "train_tokens_per_second": 135256.746 }, { "epoch": 0.988, "grad_norm": 1.029975414276123, "learning_rate": 1.089241215477099e-07, "loss": 1.1621, "num_input_tokens_seen": 6474956800, "step": 98800, "train_runtime": 47875.5087, "train_tokens_per_second": 135245.702 }, { "epoch": 0.989, "grad_norm": 0.5554516315460205, "learning_rate": 9.154214383042535e-08, "loss": 1.1489, "num_input_tokens_seen": 6481510400, "step": 98900, "train_runtime": 47923.8409, "train_tokens_per_second": 135246.055 }, { "epoch": 0.99, "grad_norm": 0.6340943574905396, "learning_rate": 7.566974261945524e-08, "loss": 1.1721, "num_input_tokens_seen": 6488064000, "step": 99000, "train_runtime": 47972.1937, "train_tokens_per_second": 135246.348 }, { "epoch": 0.991, "grad_norm": 0.582399845123291, "learning_rate": 6.13070777496949e-08, "loss": 1.1497, "num_input_tokens_seen": 6494617600, "step": 99100, "train_runtime": 48020.3976, "train_tokens_per_second": 135247.06 }, { "epoch": 0.992, "grad_norm": 0.6133337020874023, "learning_rate": 4.845429385303412e-08, "loss": 1.1601, "num_input_tokens_seen": 6501171200, "step": 99200, "train_runtime": 48068.6895, "train_tokens_per_second": 135247.523 }, { "epoch": 0.993, "grad_norm": 0.5691381096839905, "learning_rate": 3.711152035685838e-08, "loss": 1.1571, "num_input_tokens_seen": 6507724800, "step": 99300, "train_runtime": 48115.7967, "train_tokens_per_second": 135251.315 }, { "epoch": 0.994, "grad_norm": 0.6613404750823975, "learning_rate": 2.727887148278318e-08, "loss": 1.1569, "num_input_tokens_seen": 6514278400, "step": 99400, "train_runtime": 48169.6246, "train_tokens_per_second": 135236.229 }, { "epoch": 0.995, "grad_norm": 0.5285235047340393, "learning_rate": 1.8956446245455005e-08, "loss": 1.1722, "num_input_tokens_seen": 6520832000, "step": 99500, "train_runtime": 48217.4936, "train_tokens_per_second": 135237.888 }, { "epoch": 0.996, "grad_norm": 0.8071156144142151, "learning_rate": 1.2144328451618724e-08, "loss": 1.1571, "num_input_tokens_seen": 6527385600, "step": 99600, "train_runtime": 48264.7605, "train_tokens_per_second": 135241.231 }, { "epoch": 0.997, "grad_norm": 0.5775815844535828, "learning_rate": 6.84258669920168e-09, "loss": 1.1634, "num_input_tokens_seen": 6533939200, "step": 99700, "train_runtime": 48314.0709, "train_tokens_per_second": 135238.846 }, { "epoch": 0.998, "grad_norm": 0.5299545526504517, "learning_rate": 3.0512743767141524e-09, "loss": 1.1563, "num_input_tokens_seen": 6540492800, "step": 99800, "train_runtime": 48364.7142, "train_tokens_per_second": 135232.74 }, { "epoch": 0.999, "grad_norm": 0.636650800704956, "learning_rate": 7.70429662616534e-10, "loss": 1.1653, "num_input_tokens_seen": 6547046400, "step": 99900, "train_runtime": 48412.6126, "train_tokens_per_second": 135234.313 }, { "epoch": 1.0, "grad_norm": 0.5705932974815369, "learning_rate": 7.552498626495208e-14, "loss": 1.1814, "num_input_tokens_seen": 6553600000, "step": 100000, "train_runtime": 48460.0302, "train_tokens_per_second": 135237.225 }, { "epoch": 1.0, "num_input_tokens_seen": 6553600000, "step": 100000, "total_flos": 1.23866185728e+17, "train_loss": 1.241861473388672, "train_runtime": 48460.2218, "train_samples_per_second": 528.268, "train_steps_per_second": 2.064 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 6553600000, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.23866185728e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }