{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.61, "eval_steps": 500, "global_step": 61000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 17.39519691467285, "learning_rate": 2.97e-05, "loss": 9.7941, "num_input_tokens_seen": 6553600, "step": 100, "train_runtime": 74.0623, "train_tokens_per_second": 88487.632 }, { "epoch": 0.002, "grad_norm": 10.212440490722656, "learning_rate": 5.97e-05, "loss": 1.0389, "num_input_tokens_seen": 13107200, "step": 200, "train_runtime": 135.0365, "train_tokens_per_second": 97064.126 }, { "epoch": 0.003, "grad_norm": 6.982235908508301, "learning_rate": 8.969999999999998e-05, "loss": 0.7951, "num_input_tokens_seen": 19660800, "step": 300, "train_runtime": 196.4342, "train_tokens_per_second": 100088.472 }, { "epoch": 0.004, "grad_norm": 2.089735507965088, "learning_rate": 0.0001197, "loss": 0.6341, "num_input_tokens_seen": 26214400, "step": 400, "train_runtime": 257.5653, "train_tokens_per_second": 101777.682 }, { "epoch": 0.005, "grad_norm": 2.6269969940185547, "learning_rate": 0.00014969999999999998, "loss": 0.5353, "num_input_tokens_seen": 32768000, "step": 500, "train_runtime": 323.599, "train_tokens_per_second": 101261.143 }, { "epoch": 0.006, "grad_norm": 0.9126470685005188, "learning_rate": 0.00017969999999999998, "loss": 0.4822, "num_input_tokens_seen": 39321600, "step": 600, "train_runtime": 385.3073, "train_tokens_per_second": 102052.566 }, { "epoch": 0.007, "grad_norm": 0.7452394366264343, "learning_rate": 0.00020969999999999997, "loss": 0.4534, "num_input_tokens_seen": 45875200, "step": 700, "train_runtime": 447.534, "train_tokens_per_second": 102506.63 }, { "epoch": 0.008, "grad_norm": 0.6909123659133911, "learning_rate": 0.0002397, "loss": 0.4323, "num_input_tokens_seen": 52428800, "step": 800, "train_runtime": 510.1043, "train_tokens_per_second": 102780.558 }, { "epoch": 0.009, "grad_norm": 0.5689504146575928, "learning_rate": 0.0002697, "loss": 0.4262, "num_input_tokens_seen": 58982400, "step": 900, "train_runtime": 571.3595, "train_tokens_per_second": 103231.669 }, { "epoch": 0.01, "grad_norm": 0.42208704352378845, "learning_rate": 0.00029969999999999997, "loss": 0.4158, "num_input_tokens_seen": 65536000, "step": 1000, "train_runtime": 638.5123, "train_tokens_per_second": 102638.586 }, { "epoch": 0.011, "grad_norm": 0.4542798399925232, "learning_rate": 0.00029999925978027874, "loss": 0.4127, "num_input_tokens_seen": 72089600, "step": 1100, "train_runtime": 698.6527, "train_tokens_per_second": 103183.742 }, { "epoch": 0.012, "grad_norm": 0.4086480736732483, "learning_rate": 0.0002999970091452017, "loss": 0.4018, "num_input_tokens_seen": 78643200, "step": 1200, "train_runtime": 761.7182, "train_tokens_per_second": 103244.485 }, { "epoch": 0.013, "grad_norm": 0.37623685598373413, "learning_rate": 0.00029999324804190795, "loss": 0.3969, "num_input_tokens_seen": 85196800, "step": 1300, "train_runtime": 827.9033, "train_tokens_per_second": 102906.7 }, { "epoch": 0.014, "grad_norm": 0.3346163332462311, "learning_rate": 0.0002999879765082716, "loss": 0.3906, "num_input_tokens_seen": 91750400, "step": 1400, "train_runtime": 889.5401, "train_tokens_per_second": 103143.635 }, { "epoch": 0.015, "grad_norm": 0.4093320369720459, "learning_rate": 0.000299981194597377, "loss": 0.3852, "num_input_tokens_seen": 98304000, "step": 1500, "train_runtime": 950.9359, "train_tokens_per_second": 103376.055 }, { "epoch": 0.016, "grad_norm": 0.3808560371398926, "learning_rate": 0.0002999729023775179, "loss": 0.3819, "num_input_tokens_seen": 104857600, "step": 1600, "train_runtime": 1017.4047, "train_tokens_per_second": 103063.807 }, { "epoch": 0.017, "grad_norm": 0.3014701306819916, "learning_rate": 0.0002999630999321969, "loss": 0.387, "num_input_tokens_seen": 111411200, "step": 1700, "train_runtime": 1075.027, "train_tokens_per_second": 103635.721 }, { "epoch": 0.018, "grad_norm": 0.25073230266571045, "learning_rate": 0.00029995178736012443, "loss": 0.382, "num_input_tokens_seen": 117964800, "step": 1800, "train_runtime": 1141.6684, "train_tokens_per_second": 103326.671 }, { "epoch": 0.019, "grad_norm": 0.2569698989391327, "learning_rate": 0.0002999389647752181, "loss": 0.3745, "num_input_tokens_seen": 124518400, "step": 1900, "train_runtime": 1202.9974, "train_tokens_per_second": 103506.793 }, { "epoch": 0.02, "grad_norm": 0.2895148694515228, "learning_rate": 0.00029992463230660104, "loss": 0.3747, "num_input_tokens_seen": 131072000, "step": 2000, "train_runtime": 1271.272, "train_tokens_per_second": 103103.035 }, { "epoch": 0.021, "grad_norm": 0.28352853655815125, "learning_rate": 0.00029990879009860117, "loss": 0.3701, "num_input_tokens_seen": 137625600, "step": 2100, "train_runtime": 1335.8501, "train_tokens_per_second": 103024.736 }, { "epoch": 0.022, "grad_norm": 0.2598542273044586, "learning_rate": 0.0002998914383107493, "loss": 0.3715, "num_input_tokens_seen": 144179200, "step": 2200, "train_runtime": 1400.0516, "train_tokens_per_second": 102981.347 }, { "epoch": 0.023, "grad_norm": 0.300857275724411, "learning_rate": 0.0002998725771177778, "loss": 0.3723, "num_input_tokens_seen": 150732800, "step": 2300, "train_runtime": 1465.03, "train_tokens_per_second": 102887.178 }, { "epoch": 0.024, "grad_norm": 0.19827991724014282, "learning_rate": 0.00029985220670961847, "loss": 0.3654, "num_input_tokens_seen": 157286400, "step": 2400, "train_runtime": 1534.4652, "train_tokens_per_second": 102502.423 }, { "epoch": 0.025, "grad_norm": 0.36876365542411804, "learning_rate": 0.0002998303272914014, "loss": 0.368, "num_input_tokens_seen": 163840000, "step": 2500, "train_runtime": 1598.5928, "train_tokens_per_second": 102490.141 }, { "epoch": 0.026, "grad_norm": 0.23755036294460297, "learning_rate": 0.00029980693908345185, "loss": 0.3648, "num_input_tokens_seen": 170393600, "step": 2600, "train_runtime": 1661.9675, "train_tokens_per_second": 102525.227 }, { "epoch": 0.027, "grad_norm": 0.3921568691730499, "learning_rate": 0.00029978204232128895, "loss": 0.3633, "num_input_tokens_seen": 176947200, "step": 2700, "train_runtime": 1731.9606, "train_tokens_per_second": 102165.837 }, { "epoch": 0.028, "grad_norm": 0.1964094191789627, "learning_rate": 0.0002997556372556227, "loss": 0.365, "num_input_tokens_seen": 183500800, "step": 2800, "train_runtime": 1796.4926, "train_tokens_per_second": 102143.922 }, { "epoch": 0.029, "grad_norm": 0.2469199150800705, "learning_rate": 0.0002997277241523519, "loss": 0.364, "num_input_tokens_seen": 190054400, "step": 2900, "train_runtime": 1860.3342, "train_tokens_per_second": 102161.428 }, { "epoch": 0.03, "grad_norm": 0.19437766075134277, "learning_rate": 0.00029969830329256125, "loss": 0.3574, "num_input_tokens_seen": 196608000, "step": 3000, "train_runtime": 1924.7283, "train_tokens_per_second": 102148.444 }, { "epoch": 0.031, "grad_norm": 0.23198598623275757, "learning_rate": 0.00029966737497251836, "loss": 0.3599, "num_input_tokens_seen": 203161600, "step": 3100, "train_runtime": 1993.345, "train_tokens_per_second": 101919.94 }, { "epoch": 0.032, "grad_norm": 0.22857527434825897, "learning_rate": 0.0002996349395036711, "loss": 0.3579, "num_input_tokens_seen": 209715200, "step": 3200, "train_runtime": 2057.8023, "train_tokens_per_second": 101912.218 }, { "epoch": 0.033, "grad_norm": 0.24812710285186768, "learning_rate": 0.00029960099721264435, "loss": 0.3612, "num_input_tokens_seen": 216268800, "step": 3300, "train_runtime": 2121.9536, "train_tokens_per_second": 101919.666 }, { "epoch": 0.034, "grad_norm": 0.21982239186763763, "learning_rate": 0.0002995655484412365, "loss": 0.3554, "num_input_tokens_seen": 222822400, "step": 3400, "train_runtime": 2186.6347, "train_tokens_per_second": 101901.979 }, { "epoch": 0.035, "grad_norm": 0.3460980951786041, "learning_rate": 0.00029952859354641636, "loss": 0.3568, "num_input_tokens_seen": 229376000, "step": 3500, "train_runtime": 2256.5384, "train_tokens_per_second": 101649.502 }, { "epoch": 0.036, "grad_norm": 0.25577911734580994, "learning_rate": 0.00029949013290031924, "loss": 0.354, "num_input_tokens_seen": 235929600, "step": 3600, "train_runtime": 2320.5776, "train_tokens_per_second": 101668.483 }, { "epoch": 0.037, "grad_norm": 0.16108086705207825, "learning_rate": 0.00029945016689024353, "loss": 0.3509, "num_input_tokens_seen": 242483200, "step": 3700, "train_runtime": 2383.8992, "train_tokens_per_second": 101717.051 }, { "epoch": 0.038, "grad_norm": 0.2431662529706955, "learning_rate": 0.0002994086959186464, "loss": 0.3527, "num_input_tokens_seen": 249036800, "step": 3800, "train_runtime": 2448.8427, "train_tokens_per_second": 101695.71 }, { "epoch": 0.039, "grad_norm": 0.18574966490268707, "learning_rate": 0.00029936572040314014, "loss": 0.3546, "num_input_tokens_seen": 255590400, "step": 3900, "train_runtime": 2518.1288, "train_tokens_per_second": 101500.13 }, { "epoch": 0.04, "grad_norm": 0.15902996063232422, "learning_rate": 0.0002993212407764877, "loss": 0.3519, "num_input_tokens_seen": 262144000, "step": 4000, "train_runtime": 2581.8809, "train_tokens_per_second": 101532.18 }, { "epoch": 0.041, "grad_norm": 0.21019065380096436, "learning_rate": 0.00029927525748659834, "loss": 0.3567, "num_input_tokens_seen": 268697600, "step": 4100, "train_runtime": 2646.5068, "train_tokens_per_second": 101529.154 }, { "epoch": 0.042, "grad_norm": 0.18648174405097961, "learning_rate": 0.0002992277709965234, "loss": 0.3512, "num_input_tokens_seen": 275251200, "step": 4200, "train_runtime": 2710.4754, "train_tokens_per_second": 101550.895 }, { "epoch": 0.043, "grad_norm": 0.21123889088630676, "learning_rate": 0.0002991787817844513, "loss": 0.3521, "num_input_tokens_seen": 281804800, "step": 4300, "train_runtime": 2780.6173, "train_tokens_per_second": 101346.13 }, { "epoch": 0.044, "grad_norm": 0.22183509171009064, "learning_rate": 0.0002991282903437028, "loss": 0.3486, "num_input_tokens_seen": 288358400, "step": 4400, "train_runtime": 2843.584, "train_tokens_per_second": 101406.674 }, { "epoch": 0.045, "grad_norm": 0.19213925302028656, "learning_rate": 0.0002990762971827262, "loss": 0.3481, "num_input_tokens_seen": 294912000, "step": 4500, "train_runtime": 2906.5309, "train_tokens_per_second": 101465.29 }, { "epoch": 0.046, "grad_norm": 0.16215530037879944, "learning_rate": 0.00029902280282509197, "loss": 0.3506, "num_input_tokens_seen": 301465600, "step": 4600, "train_runtime": 2977.8135, "train_tokens_per_second": 101237.232 }, { "epoch": 0.047, "grad_norm": 0.17120705544948578, "learning_rate": 0.0002989678078094878, "loss": 0.3433, "num_input_tokens_seen": 308019200, "step": 4700, "train_runtime": 3040.7538, "train_tokens_per_second": 101296.988 }, { "epoch": 0.048, "grad_norm": 0.26389873027801514, "learning_rate": 0.00029891131268971284, "loss": 0.345, "num_input_tokens_seen": 314572800, "step": 4800, "train_runtime": 3104.3446, "train_tokens_per_second": 101333.081 }, { "epoch": 0.049, "grad_norm": 0.1639779806137085, "learning_rate": 0.0002988533180346723, "loss": 0.3431, "num_input_tokens_seen": 321126400, "step": 4900, "train_runtime": 3172.6385, "train_tokens_per_second": 101217.457 }, { "epoch": 0.05, "grad_norm": 0.21486082673072815, "learning_rate": 0.0002987938244283717, "loss": 0.3413, "num_input_tokens_seen": 327680000, "step": 5000, "train_runtime": 3237.5961, "train_tokens_per_second": 101210.896 }, { "epoch": 0.051, "grad_norm": 0.20326170325279236, "learning_rate": 0.00029873283246991105, "loss": 0.3457, "num_input_tokens_seen": 334233600, "step": 5100, "train_runtime": 3302.3096, "train_tokens_per_second": 101212.074 }, { "epoch": 0.052, "grad_norm": 0.171161487698555, "learning_rate": 0.0002986703427734787, "loss": 0.345, "num_input_tokens_seen": 340787200, "step": 5200, "train_runtime": 3367.4928, "train_tokens_per_second": 101199.089 }, { "epoch": 0.053, "grad_norm": 0.19781792163848877, "learning_rate": 0.00029860635596834517, "loss": 0.3455, "num_input_tokens_seen": 347340800, "step": 5300, "train_runtime": 3430.9148, "train_tokens_per_second": 101238.538 }, { "epoch": 0.054, "grad_norm": 0.1795511543750763, "learning_rate": 0.0002985408726988569, "loss": 0.3439, "num_input_tokens_seen": 353894400, "step": 5400, "train_runtime": 3498.4556, "train_tokens_per_second": 101157.322 }, { "epoch": 0.055, "grad_norm": 0.1671728938817978, "learning_rate": 0.0002984738936244296, "loss": 0.3422, "num_input_tokens_seen": 360448000, "step": 5500, "train_runtime": 3561.4394, "train_tokens_per_second": 101208.516 }, { "epoch": 0.056, "grad_norm": 0.17824003100395203, "learning_rate": 0.0002984054194195419, "loss": 0.3489, "num_input_tokens_seen": 367001600, "step": 5600, "train_runtime": 3625.8956, "train_tokens_per_second": 101216.814 }, { "epoch": 0.057, "grad_norm": 0.1654757708311081, "learning_rate": 0.0002983354507737283, "loss": 0.3463, "num_input_tokens_seen": 373555200, "step": 5700, "train_runtime": 3690.173, "train_tokens_per_second": 101229.725 }, { "epoch": 0.058, "grad_norm": 0.2033533751964569, "learning_rate": 0.00029826398839157215, "loss": 0.3462, "num_input_tokens_seen": 380108800, "step": 5800, "train_runtime": 3759.2019, "train_tokens_per_second": 101114.229 }, { "epoch": 0.059, "grad_norm": 0.19753150641918182, "learning_rate": 0.000298191032992699, "loss": 0.3436, "num_input_tokens_seen": 386662400, "step": 5900, "train_runtime": 3822.1964, "train_tokens_per_second": 101162.357 }, { "epoch": 0.06, "grad_norm": 0.13978537917137146, "learning_rate": 0.0002981165853117688, "loss": 0.3393, "num_input_tokens_seen": 393216000, "step": 6000, "train_runtime": 3890.9859, "train_tokens_per_second": 101058.192 }, { "epoch": 0.061, "grad_norm": 0.28539636731147766, "learning_rate": 0.000298040646098469, "loss": 0.3419, "num_input_tokens_seen": 399769600, "step": 6100, "train_runtime": 3955.42, "train_tokens_per_second": 101068.813 }, { "epoch": 0.062, "grad_norm": 0.14195021986961365, "learning_rate": 0.0002979632161175064, "loss": 0.3408, "num_input_tokens_seen": 406323200, "step": 6200, "train_runtime": 4019.3462, "train_tokens_per_second": 101091.865 }, { "epoch": 0.063, "grad_norm": 0.26058393716812134, "learning_rate": 0.0002978842961486003, "loss": 0.3411, "num_input_tokens_seen": 412876800, "step": 6300, "train_runtime": 4082.619, "train_tokens_per_second": 101130.379 }, { "epoch": 0.064, "grad_norm": 0.1645655333995819, "learning_rate": 0.0002978038869864738, "loss": 0.3392, "num_input_tokens_seen": 419430400, "step": 6400, "train_runtime": 4152.2955, "train_tokens_per_second": 101011.694 }, { "epoch": 0.065, "grad_norm": 0.1678280532360077, "learning_rate": 0.0002977219894408463, "loss": 0.338, "num_input_tokens_seen": 425984000, "step": 6500, "train_runtime": 4215.8141, "train_tokens_per_second": 101044.304 }, { "epoch": 0.066, "grad_norm": 0.19337573647499084, "learning_rate": 0.0002976386043364251, "loss": 0.3424, "num_input_tokens_seen": 432537600, "step": 6600, "train_runtime": 4278.8465, "train_tokens_per_second": 101087.432 }, { "epoch": 0.067, "grad_norm": 0.14295175671577454, "learning_rate": 0.00029755373251289733, "loss": 0.3443, "num_input_tokens_seen": 439091200, "step": 6700, "train_runtime": 4348.6665, "train_tokens_per_second": 100971.459 }, { "epoch": 0.068, "grad_norm": 0.22164900600910187, "learning_rate": 0.0002974673748249213, "loss": 0.339, "num_input_tokens_seen": 445644800, "step": 6800, "train_runtime": 4413.12, "train_tokens_per_second": 100981.799 }, { "epoch": 0.069, "grad_norm": 0.1831408590078354, "learning_rate": 0.00029737953214211804, "loss": 0.3398, "num_input_tokens_seen": 452198400, "step": 6900, "train_runtime": 4477.6672, "train_tokens_per_second": 100989.73 }, { "epoch": 0.07, "grad_norm": 0.21329298615455627, "learning_rate": 0.0002972902053490623, "loss": 0.3372, "num_input_tokens_seen": 458752000, "step": 7000, "train_runtime": 4541.4752, "train_tokens_per_second": 101013.873 }, { "epoch": 0.071, "grad_norm": 0.16601704061031342, "learning_rate": 0.00029719939534527393, "loss": 0.3436, "num_input_tokens_seen": 465305600, "step": 7100, "train_runtime": 4607.1943, "train_tokens_per_second": 100995.436 }, { "epoch": 0.072, "grad_norm": 0.2303948849439621, "learning_rate": 0.00029710710304520866, "loss": 0.339, "num_input_tokens_seen": 471859200, "step": 7200, "train_runtime": 4672.0421, "train_tokens_per_second": 100996.349 }, { "epoch": 0.073, "grad_norm": 0.21449029445648193, "learning_rate": 0.00029701332937824885, "loss": 0.336, "num_input_tokens_seen": 478412800, "step": 7300, "train_runtime": 4742.0375, "train_tokens_per_second": 100887.605 }, { "epoch": 0.074, "grad_norm": 0.1367533802986145, "learning_rate": 0.0002969180752886944, "loss": 0.3397, "num_input_tokens_seen": 484966400, "step": 7400, "train_runtime": 4805.1341, "train_tokens_per_second": 100926.716 }, { "epoch": 0.075, "grad_norm": 0.1852603256702423, "learning_rate": 0.0002968213417357529, "loss": 0.34, "num_input_tokens_seen": 491520000, "step": 7500, "train_runtime": 4867.6611, "train_tokens_per_second": 100976.628 }, { "epoch": 0.076, "grad_norm": 0.18590585887432098, "learning_rate": 0.00029672312969353015, "loss": 0.3375, "num_input_tokens_seen": 498073600, "step": 7600, "train_runtime": 4938.9456, "train_tokens_per_second": 100846.14 }, { "epoch": 0.077, "grad_norm": 0.17078232765197754, "learning_rate": 0.00029662344015102027, "loss": 0.3374, "num_input_tokens_seen": 504627200, "step": 7700, "train_runtime": 5003.5948, "train_tokens_per_second": 100852.931 }, { "epoch": 0.078, "grad_norm": 0.14574670791625977, "learning_rate": 0.00029652227411209594, "loss": 0.3369, "num_input_tokens_seen": 511180800, "step": 7800, "train_runtime": 5067.2522, "train_tokens_per_second": 100879.289 }, { "epoch": 0.079, "grad_norm": 0.1603483408689499, "learning_rate": 0.0002964196325954979, "loss": 0.3352, "num_input_tokens_seen": 517734400, "step": 7900, "train_runtime": 5131.2908, "train_tokens_per_second": 100897.497 }, { "epoch": 0.08, "grad_norm": 0.16576310992240906, "learning_rate": 0.0002963155166348253, "loss": 0.3376, "num_input_tokens_seen": 524288000, "step": 8000, "train_runtime": 5200.6662, "train_tokens_per_second": 100811.699 }, { "epoch": 0.081, "grad_norm": 0.31833919882774353, "learning_rate": 0.0002962099272785246, "loss": 0.3382, "num_input_tokens_seen": 530841600, "step": 8100, "train_runtime": 5266.7639, "train_tokens_per_second": 100790.849 }, { "epoch": 0.082, "grad_norm": 0.14755409955978394, "learning_rate": 0.0002961028655898794, "loss": 0.3348, "num_input_tokens_seen": 537395200, "step": 8200, "train_runtime": 5331.3948, "train_tokens_per_second": 100798.238 }, { "epoch": 0.083, "grad_norm": 0.2060171663761139, "learning_rate": 0.0002959943326469998, "loss": 0.3338, "num_input_tokens_seen": 543948800, "step": 8300, "train_runtime": 5395.0396, "train_tokens_per_second": 100823.876 }, { "epoch": 0.084, "grad_norm": 0.16461625695228577, "learning_rate": 0.0002958843295428112, "loss": 0.3326, "num_input_tokens_seen": 550502400, "step": 8400, "train_runtime": 5458.2259, "train_tokens_per_second": 100857.387 }, { "epoch": 0.085, "grad_norm": 0.15455660223960876, "learning_rate": 0.0002957728573850438, "loss": 0.3339, "num_input_tokens_seen": 557056000, "step": 8500, "train_runtime": 5527.7417, "train_tokens_per_second": 100774.607 }, { "epoch": 0.086, "grad_norm": 0.17872081696987152, "learning_rate": 0.0002956599172962209, "loss": 0.3404, "num_input_tokens_seen": 563609600, "step": 8600, "train_runtime": 5593.3318, "train_tokens_per_second": 100764.557 }, { "epoch": 0.087, "grad_norm": 0.19022491574287415, "learning_rate": 0.0002955455104136479, "loss": 0.3329, "num_input_tokens_seen": 570163200, "step": 8700, "train_runtime": 5659.0887, "train_tokens_per_second": 100751.77 }, { "epoch": 0.088, "grad_norm": 0.14710059762001038, "learning_rate": 0.00029542963788940096, "loss": 0.3323, "num_input_tokens_seen": 576716800, "step": 8800, "train_runtime": 5722.168, "train_tokens_per_second": 100786.415 }, { "epoch": 0.089, "grad_norm": 0.1998033970594406, "learning_rate": 0.00029531230089031505, "loss": 0.3378, "num_input_tokens_seen": 583270400, "step": 8900, "train_runtime": 5787.7324, "train_tokens_per_second": 100777.016 }, { "epoch": 0.09, "grad_norm": 0.125193253159523, "learning_rate": 0.0002951935005979724, "loss": 0.3325, "num_input_tokens_seen": 589824000, "step": 9000, "train_runtime": 5855.8455, "train_tokens_per_second": 100723.968 }, { "epoch": 0.091, "grad_norm": 0.19552631676197052, "learning_rate": 0.0002950732382086907, "loss": 0.3316, "num_input_tokens_seen": 596377600, "step": 9100, "train_runtime": 5921.9714, "train_tokens_per_second": 100705.923 }, { "epoch": 0.092, "grad_norm": 0.16468137502670288, "learning_rate": 0.0002949515149335108, "loss": 0.3349, "num_input_tokens_seen": 602931200, "step": 9200, "train_runtime": 5986.1243, "train_tokens_per_second": 100721.464 }, { "epoch": 0.093, "grad_norm": 0.1658785343170166, "learning_rate": 0.0002948283319981848, "loss": 0.3281, "num_input_tokens_seen": 609484800, "step": 9300, "train_runtime": 6050.7028, "train_tokens_per_second": 100729.588 }, { "epoch": 0.094, "grad_norm": 0.16668474674224854, "learning_rate": 0.00029470369064316354, "loss": 0.3301, "num_input_tokens_seen": 616038400, "step": 9400, "train_runtime": 6115.0892, "train_tokens_per_second": 100740.706 }, { "epoch": 0.095, "grad_norm": 0.16522246599197388, "learning_rate": 0.00029457759212358397, "loss": 0.3305, "num_input_tokens_seen": 622592000, "step": 9500, "train_runtime": 6183.2082, "train_tokens_per_second": 100690.77 }, { "epoch": 0.096, "grad_norm": 0.2229623645544052, "learning_rate": 0.00029445003770925686, "loss": 0.3289, "num_input_tokens_seen": 629145600, "step": 9600, "train_runtime": 6247.5147, "train_tokens_per_second": 100703.341 }, { "epoch": 0.097, "grad_norm": 0.16620689630508423, "learning_rate": 0.00029432102868465367, "loss": 0.3299, "num_input_tokens_seen": 635699200, "step": 9700, "train_runtime": 6312.7504, "train_tokens_per_second": 100700.829 }, { "epoch": 0.098, "grad_norm": 0.15970012545585632, "learning_rate": 0.0002941905663488939, "loss": 0.3292, "num_input_tokens_seen": 642252800, "step": 9800, "train_runtime": 6382.1987, "train_tokens_per_second": 100631.903 }, { "epoch": 0.099, "grad_norm": 0.14614014327526093, "learning_rate": 0.0002940586520157318, "loss": 0.3329, "num_input_tokens_seen": 648806400, "step": 9900, "train_runtime": 6445.6924, "train_tokens_per_second": 100657.362 }, { "epoch": 0.1, "grad_norm": 0.16558828949928284, "learning_rate": 0.00029392528701354325, "loss": 0.3286, "num_input_tokens_seen": 655360000, "step": 10000, "train_runtime": 6509.151, "train_tokens_per_second": 100682.87 }, { "epoch": 0.101, "grad_norm": 0.1442118138074875, "learning_rate": 0.00029379047268531243, "loss": 0.3314, "num_input_tokens_seen": 661913600, "step": 10100, "train_runtime": 6575.3071, "train_tokens_per_second": 100666.568 }, { "epoch": 0.102, "grad_norm": 0.16007182002067566, "learning_rate": 0.00029365421038861795, "loss": 0.3326, "num_input_tokens_seen": 668467200, "step": 10200, "train_runtime": 6639.6314, "train_tokens_per_second": 100678.359 }, { "epoch": 0.103, "grad_norm": 0.1417239010334015, "learning_rate": 0.0002935165014956198, "loss": 0.3292, "num_input_tokens_seen": 675020800, "step": 10300, "train_runtime": 6704.2875, "train_tokens_per_second": 100684.942 }, { "epoch": 0.104, "grad_norm": 0.20092202723026276, "learning_rate": 0.0002933773473930448, "loss": 0.3251, "num_input_tokens_seen": 681574400, "step": 10400, "train_runtime": 6769.9733, "train_tokens_per_second": 100676.083 }, { "epoch": 0.105, "grad_norm": 0.12387008965015411, "learning_rate": 0.0002932367494821734, "loss": 0.3302, "num_input_tokens_seen": 688128000, "step": 10500, "train_runtime": 6840.7627, "train_tokens_per_second": 100592.292 }, { "epoch": 0.106, "grad_norm": 0.17865417897701263, "learning_rate": 0.00029309470917882497, "loss": 0.328, "num_input_tokens_seen": 694681600, "step": 10600, "train_runtime": 6905.9119, "train_tokens_per_second": 100592.305 }, { "epoch": 0.107, "grad_norm": 0.14125974476337433, "learning_rate": 0.0002929512279133437, "loss": 0.3296, "num_input_tokens_seen": 701235200, "step": 10700, "train_runtime": 6969.9941, "train_tokens_per_second": 100607.718 }, { "epoch": 0.108, "grad_norm": 0.15725336968898773, "learning_rate": 0.0002928063071305844, "loss": 0.3279, "num_input_tokens_seen": 707788800, "step": 10800, "train_runtime": 7032.9479, "train_tokens_per_second": 100638.994 }, { "epoch": 0.109, "grad_norm": 0.15254800021648407, "learning_rate": 0.0002926599482898978, "loss": 0.3276, "num_input_tokens_seen": 714342400, "step": 10900, "train_runtime": 7097.644, "train_tokens_per_second": 100645.002 }, { "epoch": 0.11, "grad_norm": 0.23630526661872864, "learning_rate": 0.00029251215286511573, "loss": 0.3278, "num_input_tokens_seen": 720896000, "step": 11000, "train_runtime": 7167.7206, "train_tokens_per_second": 100575.348 }, { "epoch": 0.111, "grad_norm": 0.14799726009368896, "learning_rate": 0.00029236292234453647, "loss": 0.3264, "num_input_tokens_seen": 727449600, "step": 11100, "train_runtime": 7232.1207, "train_tokens_per_second": 100585.932 }, { "epoch": 0.112, "grad_norm": 0.17712198197841644, "learning_rate": 0.0002922122582309097, "loss": 0.3304, "num_input_tokens_seen": 734003200, "step": 11200, "train_runtime": 7296.7016, "train_tokens_per_second": 100593.835 }, { "epoch": 0.113, "grad_norm": 0.1620536595582962, "learning_rate": 0.0002920601620414215, "loss": 0.3266, "num_input_tokens_seen": 740556800, "step": 11300, "train_runtime": 7359.3874, "train_tokens_per_second": 100627.506 }, { "epoch": 0.114, "grad_norm": 0.1695978045463562, "learning_rate": 0.0002919066353076786, "loss": 0.3269, "num_input_tokens_seen": 747110400, "step": 11400, "train_runtime": 7425.5624, "train_tokens_per_second": 100613.308 }, { "epoch": 0.115, "grad_norm": 0.23728708922863007, "learning_rate": 0.00029175167957569366, "loss": 0.3269, "num_input_tokens_seen": 753664000, "step": 11500, "train_runtime": 7489.1752, "train_tokens_per_second": 100633.779 }, { "epoch": 0.116, "grad_norm": 0.14579418301582336, "learning_rate": 0.0002915952964058691, "loss": 0.3254, "num_input_tokens_seen": 760217600, "step": 11600, "train_runtime": 7559.1466, "train_tokens_per_second": 100569.237 }, { "epoch": 0.117, "grad_norm": 0.15569131076335907, "learning_rate": 0.00029143748737298173, "loss": 0.3309, "num_input_tokens_seen": 766771200, "step": 11700, "train_runtime": 7625.7219, "train_tokens_per_second": 100550.638 }, { "epoch": 0.118, "grad_norm": 0.15939873456954956, "learning_rate": 0.00029127825406616677, "loss": 0.3251, "num_input_tokens_seen": 773324800, "step": 11800, "train_runtime": 7690.5664, "train_tokens_per_second": 100554.987 }, { "epoch": 0.119, "grad_norm": 0.1355784833431244, "learning_rate": 0.0002911175980889019, "loss": 0.3287, "num_input_tokens_seen": 779878400, "step": 11900, "train_runtime": 7753.5378, "train_tokens_per_second": 100583.556 }, { "epoch": 0.12, "grad_norm": 0.19504176080226898, "learning_rate": 0.00029095552105899095, "loss": 0.325, "num_input_tokens_seen": 786432000, "step": 12000, "train_runtime": 7817.9364, "train_tokens_per_second": 100593.297 }, { "epoch": 0.121, "grad_norm": 0.1594318449497223, "learning_rate": 0.0002907920246085478, "loss": 0.3242, "num_input_tokens_seen": 792985600, "step": 12100, "train_runtime": 7887.1116, "train_tokens_per_second": 100541.953 }, { "epoch": 0.122, "grad_norm": 0.15172167122364044, "learning_rate": 0.00029062711038397996, "loss": 0.3325, "num_input_tokens_seen": 799539200, "step": 12200, "train_runtime": 7952.1371, "train_tokens_per_second": 100543.94 }, { "epoch": 0.123, "grad_norm": 0.13253241777420044, "learning_rate": 0.00029046078004597175, "loss": 0.3239, "num_input_tokens_seen": 806092800, "step": 12300, "train_runtime": 8016.3597, "train_tokens_per_second": 100555.966 }, { "epoch": 0.124, "grad_norm": 0.2943899929523468, "learning_rate": 0.00029029303526946796, "loss": 0.3238, "num_input_tokens_seen": 812646400, "step": 12400, "train_runtime": 8079.6597, "train_tokens_per_second": 100579.286 }, { "epoch": 0.125, "grad_norm": 0.1583172082901001, "learning_rate": 0.0002901238777436565, "loss": 0.3217, "num_input_tokens_seen": 819200000, "step": 12500, "train_runtime": 8148.9297, "train_tokens_per_second": 100528.539 }, { "epoch": 0.126, "grad_norm": 0.1598382592201233, "learning_rate": 0.00028995330917195184, "loss": 0.3245, "num_input_tokens_seen": 825753600, "step": 12600, "train_runtime": 8213.0201, "train_tokens_per_second": 100542.016 }, { "epoch": 0.127, "grad_norm": 0.13507018983364105, "learning_rate": 0.00028978133127197765, "loss": 0.3247, "num_input_tokens_seen": 832307200, "step": 12700, "train_runtime": 8277.3925, "train_tokens_per_second": 100551.859 }, { "epoch": 0.128, "grad_norm": 0.1688830703496933, "learning_rate": 0.0002896079457755493, "loss": 0.3258, "num_input_tokens_seen": 838860800, "step": 12800, "train_runtime": 8342.3491, "train_tokens_per_second": 100554.507 }, { "epoch": 0.129, "grad_norm": 0.2753322422504425, "learning_rate": 0.000289433154428657, "loss": 0.3249, "num_input_tokens_seen": 845414400, "step": 12900, "train_runtime": 8406.9898, "train_tokens_per_second": 100560.892 }, { "epoch": 0.13, "grad_norm": 0.20588786900043488, "learning_rate": 0.0002892569589914476, "loss": 0.3232, "num_input_tokens_seen": 851968000, "step": 13000, "train_runtime": 8475.9626, "train_tokens_per_second": 100515.781 }, { "epoch": 0.131, "grad_norm": 0.1462445855140686, "learning_rate": 0.0002890793612382072, "loss": 0.3239, "num_input_tokens_seen": 858521600, "step": 13100, "train_runtime": 8539.9861, "train_tokens_per_second": 100529.625 }, { "epoch": 0.132, "grad_norm": 0.11379440873861313, "learning_rate": 0.0002889003629573432, "loss": 0.3249, "num_input_tokens_seen": 865075200, "step": 13200, "train_runtime": 8604.867, "train_tokens_per_second": 100533.244 }, { "epoch": 0.133, "grad_norm": 0.12769202888011932, "learning_rate": 0.00028871996595136626, "loss": 0.327, "num_input_tokens_seen": 871628800, "step": 13300, "train_runtime": 8669.3605, "train_tokens_per_second": 100541.303 }, { "epoch": 0.134, "grad_norm": 0.14837151765823364, "learning_rate": 0.0002885381720368723, "loss": 0.321, "num_input_tokens_seen": 878182400, "step": 13400, "train_runtime": 8738.2624, "train_tokens_per_second": 100498.515 }, { "epoch": 0.135, "grad_norm": 0.1538904309272766, "learning_rate": 0.000288354983044524, "loss": 0.3207, "num_input_tokens_seen": 884736000, "step": 13500, "train_runtime": 8802.2586, "train_tokens_per_second": 100512.385 }, { "epoch": 0.136, "grad_norm": 0.12802962958812714, "learning_rate": 0.00028817040081903245, "loss": 0.3241, "num_input_tokens_seen": 891289600, "step": 13600, "train_runtime": 8866.1163, "train_tokens_per_second": 100527.624 }, { "epoch": 0.137, "grad_norm": 0.35466450452804565, "learning_rate": 0.00028798442721913867, "loss": 0.3214, "num_input_tokens_seen": 897843200, "step": 13700, "train_runtime": 8930.5828, "train_tokens_per_second": 100535.79 }, { "epoch": 0.138, "grad_norm": 0.13867586851119995, "learning_rate": 0.00028779706411759465, "loss": 0.3199, "num_input_tokens_seen": 904396800, "step": 13800, "train_runtime": 9001.3287, "train_tokens_per_second": 100473.7 }, { "epoch": 0.139, "grad_norm": 0.2114623785018921, "learning_rate": 0.00028760831340114484, "loss": 0.3234, "num_input_tokens_seen": 910950400, "step": 13900, "train_runtime": 9066.3163, "train_tokens_per_second": 100476.353 }, { "epoch": 0.14, "grad_norm": 0.14202618598937988, "learning_rate": 0.00028741817697050683, "loss": 0.3232, "num_input_tokens_seen": 917504000, "step": 14000, "train_runtime": 9130.2003, "train_tokens_per_second": 100491.114 }, { "epoch": 0.141, "grad_norm": 0.1686236560344696, "learning_rate": 0.00028722665674035233, "loss": 0.3203, "num_input_tokens_seen": 924057600, "step": 14100, "train_runtime": 9195.1426, "train_tokens_per_second": 100494.102 }, { "epoch": 0.142, "grad_norm": 0.14483292400836945, "learning_rate": 0.0002870337546392879, "loss": 0.3321, "num_input_tokens_seen": 930611200, "step": 14200, "train_runtime": 9259.404, "train_tokens_per_second": 100504.438 }, { "epoch": 0.143, "grad_norm": 0.12517394125461578, "learning_rate": 0.00028683947260983576, "loss": 0.3233, "num_input_tokens_seen": 937164800, "step": 14300, "train_runtime": 9324.1454, "train_tokens_per_second": 100509.458 }, { "epoch": 0.144, "grad_norm": 0.24776680767536163, "learning_rate": 0.00028664381260841356, "loss": 0.3192, "num_input_tokens_seen": 943718400, "step": 14400, "train_runtime": 9393.645, "train_tokens_per_second": 100463.494 }, { "epoch": 0.145, "grad_norm": 0.4200928807258606, "learning_rate": 0.0002864467766053154, "loss": 0.321, "num_input_tokens_seen": 950272000, "step": 14500, "train_runtime": 9456.5857, "train_tokens_per_second": 100487.853 }, { "epoch": 0.146, "grad_norm": 0.14573471248149872, "learning_rate": 0.00028624836658469165, "loss": 0.3198, "num_input_tokens_seen": 956825600, "step": 14600, "train_runtime": 9525.9633, "train_tokens_per_second": 100443.973 }, { "epoch": 0.147, "grad_norm": 0.1546989232301712, "learning_rate": 0.00028604858454452906, "loss": 0.3267, "num_input_tokens_seen": 963379200, "step": 14700, "train_runtime": 9585.7512, "train_tokens_per_second": 100501.169 }, { "epoch": 0.148, "grad_norm": 0.172988623380661, "learning_rate": 0.00028584743249663057, "loss": 0.3222, "num_input_tokens_seen": 969932800, "step": 14800, "train_runtime": 9650.7111, "train_tokens_per_second": 100503.765 }, { "epoch": 0.149, "grad_norm": 0.19345735013484955, "learning_rate": 0.000285644912466595, "loss": 0.3194, "num_input_tokens_seen": 976486400, "step": 14900, "train_runtime": 9721.1196, "train_tokens_per_second": 100449.994 }, { "epoch": 0.15, "grad_norm": 0.13317954540252686, "learning_rate": 0.00028544102649379684, "loss": 0.3236, "num_input_tokens_seen": 983040000, "step": 15000, "train_runtime": 9784.7921, "train_tokens_per_second": 100466.11 }, { "epoch": 0.151, "grad_norm": 0.17458604276180267, "learning_rate": 0.00028523577663136556, "loss": 0.3208, "num_input_tokens_seen": 989593600, "step": 15100, "train_runtime": 9853.1273, "train_tokens_per_second": 100434.468 }, { "epoch": 0.152, "grad_norm": 0.1358109712600708, "learning_rate": 0.000285029164946165, "loss": 0.3237, "num_input_tokens_seen": 996147200, "step": 15200, "train_runtime": 9917.7044, "train_tokens_per_second": 100441.307 }, { "epoch": 0.153, "grad_norm": 0.16100633144378662, "learning_rate": 0.0002848211935187725, "loss": 0.3267, "num_input_tokens_seen": 1002700800, "step": 15300, "train_runtime": 9982.8922, "train_tokens_per_second": 100441.914 }, { "epoch": 0.154, "grad_norm": 0.20419622957706451, "learning_rate": 0.0002846118644434581, "loss": 0.3193, "num_input_tokens_seen": 1009254400, "step": 15400, "train_runtime": 10046.3454, "train_tokens_per_second": 100459.855 }, { "epoch": 0.155, "grad_norm": 0.17805695533752441, "learning_rate": 0.00028440117982816326, "loss": 0.3159, "num_input_tokens_seen": 1015808000, "step": 15500, "train_runtime": 10110.0124, "train_tokens_per_second": 100475.446 }, { "epoch": 0.156, "grad_norm": 0.17533563077449799, "learning_rate": 0.0002841891417944796, "loss": 0.3216, "num_input_tokens_seen": 1022361600, "step": 15600, "train_runtime": 10178.7469, "train_tokens_per_second": 100440.812 }, { "epoch": 0.157, "grad_norm": 0.13143610954284668, "learning_rate": 0.0002839757524776279, "loss": 0.3234, "num_input_tokens_seen": 1028915200, "step": 15700, "train_runtime": 10243.1395, "train_tokens_per_second": 100449.203 }, { "epoch": 0.158, "grad_norm": 0.13563373684883118, "learning_rate": 0.0002837610140264361, "loss": 0.3194, "num_input_tokens_seen": 1035468800, "step": 15800, "train_runtime": 10307.5423, "train_tokens_per_second": 100457.39 }, { "epoch": 0.159, "grad_norm": 0.14616088569164276, "learning_rate": 0.0002835449286033182, "loss": 0.3178, "num_input_tokens_seen": 1042022400, "step": 15900, "train_runtime": 10378.0909, "train_tokens_per_second": 100405.982 }, { "epoch": 0.16, "grad_norm": 0.1539888232946396, "learning_rate": 0.0002833274983842518, "loss": 0.3156, "num_input_tokens_seen": 1048576000, "step": 16000, "train_runtime": 10441.484, "train_tokens_per_second": 100424.039 }, { "epoch": 0.161, "grad_norm": 0.15786372125148773, "learning_rate": 0.0002831087255587569, "loss": 0.318, "num_input_tokens_seen": 1055129600, "step": 16100, "train_runtime": 10505.72, "train_tokens_per_second": 100433.821 }, { "epoch": 0.162, "grad_norm": 0.14359760284423828, "learning_rate": 0.0002828886123298734, "loss": 0.3179, "num_input_tokens_seen": 1061683200, "step": 16200, "train_runtime": 10570.7713, "train_tokens_per_second": 100435.736 }, { "epoch": 0.163, "grad_norm": 0.1415397673845291, "learning_rate": 0.00028266716091413906, "loss": 0.32, "num_input_tokens_seen": 1068236800, "step": 16300, "train_runtime": 10635.2645, "train_tokens_per_second": 100442.899 }, { "epoch": 0.164, "grad_norm": 0.1199110895395279, "learning_rate": 0.0002824443735415673, "loss": 0.3188, "num_input_tokens_seen": 1074790400, "step": 16400, "train_runtime": 10704.7074, "train_tokens_per_second": 100403.529 }, { "epoch": 0.165, "grad_norm": 0.18369431793689728, "learning_rate": 0.0002822202524556243, "loss": 0.3208, "num_input_tokens_seen": 1081344000, "step": 16500, "train_runtime": 10770.1863, "train_tokens_per_second": 100401.606 }, { "epoch": 0.166, "grad_norm": 0.2615172266960144, "learning_rate": 0.00028199479991320695, "loss": 0.3224, "num_input_tokens_seen": 1087897600, "step": 16600, "train_runtime": 10834.6749, "train_tokens_per_second": 100408.883 }, { "epoch": 0.167, "grad_norm": 0.1250002384185791, "learning_rate": 0.00028176801818461994, "loss": 0.3171, "num_input_tokens_seen": 1094451200, "step": 16700, "train_runtime": 10899.3075, "train_tokens_per_second": 100414.747 }, { "epoch": 0.168, "grad_norm": 0.14198775589466095, "learning_rate": 0.00028153990955355273, "loss": 0.3194, "num_input_tokens_seen": 1101004800, "step": 16800, "train_runtime": 10964.3423, "train_tokens_per_second": 100416.858 }, { "epoch": 0.169, "grad_norm": 0.14076939225196838, "learning_rate": 0.00028131047631705665, "loss": 0.3189, "num_input_tokens_seen": 1107558400, "step": 16900, "train_runtime": 11033.6033, "train_tokens_per_second": 100380.48 }, { "epoch": 0.17, "grad_norm": 0.13334921002388, "learning_rate": 0.00028107972078552187, "loss": 0.3198, "num_input_tokens_seen": 1114112000, "step": 17000, "train_runtime": 11098.612, "train_tokens_per_second": 100383.003 }, { "epoch": 0.171, "grad_norm": 0.13615840673446655, "learning_rate": 0.0002808476452826541, "loss": 0.3168, "num_input_tokens_seen": 1120665600, "step": 17100, "train_runtime": 11161.3832, "train_tokens_per_second": 100405.62 }, { "epoch": 0.172, "grad_norm": 0.14747090637683868, "learning_rate": 0.00028061425214545094, "loss": 0.3163, "num_input_tokens_seen": 1127219200, "step": 17200, "train_runtime": 11231.5954, "train_tokens_per_second": 100361.45 }, { "epoch": 0.173, "grad_norm": 0.15957149863243103, "learning_rate": 0.00028037954372417883, "loss": 0.317, "num_input_tokens_seen": 1133772800, "step": 17300, "train_runtime": 11295.5019, "train_tokens_per_second": 100373.831 }, { "epoch": 0.174, "grad_norm": 0.20420241355895996, "learning_rate": 0.0002801435223823488, "loss": 0.3207, "num_input_tokens_seen": 1140326400, "step": 17400, "train_runtime": 11360.8649, "train_tokens_per_second": 100373.203 }, { "epoch": 0.175, "grad_norm": 0.20070046186447144, "learning_rate": 0.00027990619049669336, "loss": 0.3206, "num_input_tokens_seen": 1146880000, "step": 17500, "train_runtime": 11424.854, "train_tokens_per_second": 100384.652 }, { "epoch": 0.176, "grad_norm": 0.13903649151325226, "learning_rate": 0.00027966755045714177, "loss": 0.3227, "num_input_tokens_seen": 1153433600, "step": 17600, "train_runtime": 11488.6874, "train_tokens_per_second": 100397.336 }, { "epoch": 0.177, "grad_norm": 0.15853877365589142, "learning_rate": 0.00027942760466679673, "loss": 0.3168, "num_input_tokens_seen": 1159987200, "step": 17700, "train_runtime": 11559.2862, "train_tokens_per_second": 100351.11 }, { "epoch": 0.178, "grad_norm": 0.14262589812278748, "learning_rate": 0.00027918635554190956, "loss": 0.3235, "num_input_tokens_seen": 1166540800, "step": 17800, "train_runtime": 11622.4751, "train_tokens_per_second": 100369.395 }, { "epoch": 0.179, "grad_norm": 0.14338357746601105, "learning_rate": 0.00027894380551185636, "loss": 0.3204, "num_input_tokens_seen": 1173094400, "step": 17900, "train_runtime": 11687.9668, "train_tokens_per_second": 100367.705 }, { "epoch": 0.18, "grad_norm": 0.12374505400657654, "learning_rate": 0.00027869995701911314, "loss": 0.3156, "num_input_tokens_seen": 1179648000, "step": 18000, "train_runtime": 11751.6619, "train_tokens_per_second": 100381.377 }, { "epoch": 0.181, "grad_norm": 0.11708634346723557, "learning_rate": 0.0002784548125192316, "loss": 0.3145, "num_input_tokens_seen": 1186201600, "step": 18100, "train_runtime": 11816.0633, "train_tokens_per_second": 100388.9 }, { "epoch": 0.182, "grad_norm": 0.1318449079990387, "learning_rate": 0.0002782083744808141, "loss": 0.3159, "num_input_tokens_seen": 1192755200, "step": 18200, "train_runtime": 11887.7736, "train_tokens_per_second": 100334.616 }, { "epoch": 0.183, "grad_norm": 0.3383175730705261, "learning_rate": 0.000277960645385489, "loss": 0.3191, "num_input_tokens_seen": 1199308800, "step": 18300, "train_runtime": 11953.3207, "train_tokens_per_second": 100332.688 }, { "epoch": 0.184, "grad_norm": 0.13779285550117493, "learning_rate": 0.00027771162772788544, "loss": 0.3168, "num_input_tokens_seen": 1205862400, "step": 18400, "train_runtime": 12016.7432, "train_tokens_per_second": 100348.521 }, { "epoch": 0.185, "grad_norm": 0.15161630511283875, "learning_rate": 0.00027746132401560857, "loss": 0.3146, "num_input_tokens_seen": 1212416000, "step": 18500, "train_runtime": 12081.3443, "train_tokens_per_second": 100354.395 }, { "epoch": 0.186, "grad_norm": 0.1523953378200531, "learning_rate": 0.0002772097367692139, "loss": 0.3172, "num_input_tokens_seen": 1218969600, "step": 18600, "train_runtime": 12145.9663, "train_tokens_per_second": 100360.035 }, { "epoch": 0.187, "grad_norm": 0.12802754342556, "learning_rate": 0.00027695686852218226, "loss": 0.3198, "num_input_tokens_seen": 1225523200, "step": 18700, "train_runtime": 12215.5887, "train_tokens_per_second": 100324.53 }, { "epoch": 0.188, "grad_norm": 0.13653679192066193, "learning_rate": 0.00027670272182089416, "loss": 0.319, "num_input_tokens_seen": 1232076800, "step": 18800, "train_runtime": 12280.146, "train_tokens_per_second": 100330.794 }, { "epoch": 0.189, "grad_norm": 0.15152159333229065, "learning_rate": 0.0002764472992246039, "loss": 0.3165, "num_input_tokens_seen": 1238630400, "step": 18900, "train_runtime": 12344.6292, "train_tokens_per_second": 100337.594 }, { "epoch": 0.19, "grad_norm": 0.13211041688919067, "learning_rate": 0.0002761906033054143, "loss": 0.3161, "num_input_tokens_seen": 1245184000, "step": 19000, "train_runtime": 12407.4556, "train_tokens_per_second": 100357.724 }, { "epoch": 0.191, "grad_norm": 0.19933822751045227, "learning_rate": 0.00027593263664825045, "loss": 0.3173, "num_input_tokens_seen": 1251737600, "step": 19100, "train_runtime": 12472.5241, "train_tokens_per_second": 100359.606 }, { "epoch": 0.192, "grad_norm": 0.1472938358783722, "learning_rate": 0.00027567340185083363, "loss": 0.3157, "num_input_tokens_seen": 1258291200, "step": 19200, "train_runtime": 12542.0532, "train_tokens_per_second": 100325.774 }, { "epoch": 0.193, "grad_norm": 0.1466071903705597, "learning_rate": 0.00027541290152365537, "loss": 0.3188, "num_input_tokens_seen": 1264844800, "step": 19300, "train_runtime": 12606.5735, "train_tokens_per_second": 100332.164 }, { "epoch": 0.194, "grad_norm": 0.1384386122226715, "learning_rate": 0.00027515113828995117, "loss": 0.318, "num_input_tokens_seen": 1271398400, "step": 19400, "train_runtime": 12672.5058, "train_tokens_per_second": 100327.309 }, { "epoch": 0.195, "grad_norm": 0.16287657618522644, "learning_rate": 0.00027488811478567374, "loss": 0.3153, "num_input_tokens_seen": 1277952000, "step": 19500, "train_runtime": 12735.4985, "train_tokens_per_second": 100345.66 }, { "epoch": 0.196, "grad_norm": 0.14955779910087585, "learning_rate": 0.0002746238336594671, "loss": 0.3144, "num_input_tokens_seen": 1284505600, "step": 19600, "train_runtime": 12804.8911, "train_tokens_per_second": 100313.669 }, { "epoch": 0.197, "grad_norm": 0.15176887810230255, "learning_rate": 0.00027435829757263894, "loss": 0.3172, "num_input_tokens_seen": 1291059200, "step": 19700, "train_runtime": 12869.0984, "train_tokens_per_second": 100322.428 }, { "epoch": 0.198, "grad_norm": 0.12215608358383179, "learning_rate": 0.0002740915091991349, "loss": 0.3182, "num_input_tokens_seen": 1297612800, "step": 19800, "train_runtime": 12932.8746, "train_tokens_per_second": 100334.446 }, { "epoch": 0.199, "grad_norm": 0.248954638838768, "learning_rate": 0.0002738234712255109, "loss": 0.3171, "num_input_tokens_seen": 1304166400, "step": 19900, "train_runtime": 13003.7739, "train_tokens_per_second": 100291.378 }, { "epoch": 0.2, "grad_norm": 0.18855011463165283, "learning_rate": 0.00027355418635090635, "loss": 0.3181, "num_input_tokens_seen": 1310720000, "step": 20000, "train_runtime": 13068.3505, "train_tokens_per_second": 100297.279 }, { "epoch": 0.201, "grad_norm": 0.17624643445014954, "learning_rate": 0.000273283657287017, "loss": 0.3147, "num_input_tokens_seen": 1317273600, "step": 20100, "train_runtime": 13133.7291, "train_tokens_per_second": 100296.998 }, { "epoch": 0.202, "grad_norm": 0.12586164474487305, "learning_rate": 0.00027301188675806745, "loss": 0.3203, "num_input_tokens_seen": 1323827200, "step": 20200, "train_runtime": 13197.5369, "train_tokens_per_second": 100308.657 }, { "epoch": 0.203, "grad_norm": 0.13073797523975372, "learning_rate": 0.0002727388775007839, "loss": 0.3149, "num_input_tokens_seen": 1330380800, "step": 20300, "train_runtime": 13261.8266, "train_tokens_per_second": 100316.558 }, { "epoch": 0.204, "grad_norm": 0.12983232736587524, "learning_rate": 0.0002724646322643666, "loss": 0.3157, "num_input_tokens_seen": 1336934400, "step": 20400, "train_runtime": 13325.295, "train_tokens_per_second": 100330.567 }, { "epoch": 0.205, "grad_norm": 0.2400187999010086, "learning_rate": 0.000272189153810462, "loss": 0.3178, "num_input_tokens_seen": 1343488000, "step": 20500, "train_runtime": 13395.2424, "train_tokens_per_second": 100295.908 }, { "epoch": 0.206, "grad_norm": 0.11757266521453857, "learning_rate": 0.0002719124449131351, "loss": 0.3164, "num_input_tokens_seen": 1350041600, "step": 20600, "train_runtime": 13459.4754, "train_tokens_per_second": 100304.177 }, { "epoch": 0.207, "grad_norm": 0.1606636494398117, "learning_rate": 0.00027163450835884144, "loss": 0.3146, "num_input_tokens_seen": 1356595200, "step": 20700, "train_runtime": 13524.1715, "train_tokens_per_second": 100308.932 }, { "epoch": 0.208, "grad_norm": 0.1295078545808792, "learning_rate": 0.00027135534694639894, "loss": 0.3175, "num_input_tokens_seen": 1363148800, "step": 20800, "train_runtime": 13588.4538, "train_tokens_per_second": 100316.697 }, { "epoch": 0.209, "grad_norm": 0.18409083783626556, "learning_rate": 0.00027107496348696003, "loss": 0.3189, "num_input_tokens_seen": 1369702400, "step": 20900, "train_runtime": 13653.2417, "train_tokens_per_second": 100320.673 }, { "epoch": 0.21, "grad_norm": 0.12083840370178223, "learning_rate": 0.00027079336080398296, "loss": 0.3139, "num_input_tokens_seen": 1376256000, "step": 21000, "train_runtime": 13723.0075, "train_tokens_per_second": 100288.22 }, { "epoch": 0.211, "grad_norm": 0.16270384192466736, "learning_rate": 0.00027051054173320366, "loss": 0.3147, "num_input_tokens_seen": 1382809600, "step": 21100, "train_runtime": 13787.7693, "train_tokens_per_second": 100292.482 }, { "epoch": 0.212, "grad_norm": 0.12299864739179611, "learning_rate": 0.000270226509122607, "loss": 0.3137, "num_input_tokens_seen": 1389363200, "step": 21200, "train_runtime": 13851.6298, "train_tokens_per_second": 100303.229 }, { "epoch": 0.213, "grad_norm": 0.12248677760362625, "learning_rate": 0.0002699412658323983, "loss": 0.3177, "num_input_tokens_seen": 1395916800, "step": 21300, "train_runtime": 13915.8434, "train_tokens_per_second": 100311.333 }, { "epoch": 0.214, "grad_norm": 0.13090935349464417, "learning_rate": 0.00026965481473497423, "loss": 0.3146, "num_input_tokens_seen": 1402470400, "step": 21400, "train_runtime": 13985.645, "train_tokens_per_second": 100279.28 }, { "epoch": 0.215, "grad_norm": 0.1279245913028717, "learning_rate": 0.0002693671587148942, "loss": 0.3128, "num_input_tokens_seen": 1409024000, "step": 21500, "train_runtime": 14050.4506, "train_tokens_per_second": 100283.19 }, { "epoch": 0.216, "grad_norm": 0.15504342317581177, "learning_rate": 0.0002690783006688511, "loss": 0.3145, "num_input_tokens_seen": 1415577600, "step": 21600, "train_runtime": 14115.855, "train_tokens_per_second": 100282.81 }, { "epoch": 0.217, "grad_norm": 0.1325046420097351, "learning_rate": 0.0002687882435056423, "loss": 0.3138, "num_input_tokens_seen": 1422131200, "step": 21700, "train_runtime": 14179.61, "train_tokens_per_second": 100294.098 }, { "epoch": 0.218, "grad_norm": 0.17374184727668762, "learning_rate": 0.0002684969901461402, "loss": 0.3179, "num_input_tokens_seen": 1428684800, "step": 21800, "train_runtime": 14245.0199, "train_tokens_per_second": 100293.633 }, { "epoch": 0.219, "grad_norm": 0.16908228397369385, "learning_rate": 0.000268204543523263, "loss": 0.3182, "num_input_tokens_seen": 1435238400, "step": 21900, "train_runtime": 14310.1147, "train_tokens_per_second": 100295.381 }, { "epoch": 0.22, "grad_norm": 0.15052039921283722, "learning_rate": 0.0002679109065819447, "loss": 0.3148, "num_input_tokens_seen": 1441792000, "step": 22000, "train_runtime": 14374.221, "train_tokens_per_second": 100304.01 }, { "epoch": 0.221, "grad_norm": 0.1661474108695984, "learning_rate": 0.0002676160822791062, "loss": 0.3142, "num_input_tokens_seen": 1448345600, "step": 22100, "train_runtime": 14445.9108, "train_tokens_per_second": 100259.902 }, { "epoch": 0.222, "grad_norm": 0.16423378884792328, "learning_rate": 0.00026732007358362496, "loss": 0.323, "num_input_tokens_seen": 1454899200, "step": 22200, "train_runtime": 14510.5733, "train_tokens_per_second": 100264.763 }, { "epoch": 0.223, "grad_norm": 0.14868460595607758, "learning_rate": 0.0002670228834763052, "loss": 0.3155, "num_input_tokens_seen": 1461452800, "step": 22300, "train_runtime": 14575.7382, "train_tokens_per_second": 100266.126 }, { "epoch": 0.224, "grad_norm": 0.1287386268377304, "learning_rate": 0.00026672451494984804, "loss": 0.3152, "num_input_tokens_seen": 1468006400, "step": 22400, "train_runtime": 14639.7379, "train_tokens_per_second": 100275.456 }, { "epoch": 0.225, "grad_norm": 0.14276720583438873, "learning_rate": 0.0002664249710088213, "loss": 0.3131, "num_input_tokens_seen": 1474560000, "step": 22500, "train_runtime": 14703.588, "train_tokens_per_second": 100285.726 }, { "epoch": 0.226, "grad_norm": 0.1419740915298462, "learning_rate": 0.00026612425466962893, "loss": 0.3112, "num_input_tokens_seen": 1481113600, "step": 22600, "train_runtime": 14773.1939, "train_tokens_per_second": 100256.83 }, { "epoch": 0.227, "grad_norm": 0.12067803740501404, "learning_rate": 0.00026582236896048134, "loss": 0.3122, "num_input_tokens_seen": 1487667200, "step": 22700, "train_runtime": 14837.1829, "train_tokens_per_second": 100266.15 }, { "epoch": 0.228, "grad_norm": 0.1338939219713211, "learning_rate": 0.00026551931692136413, "loss": 0.3128, "num_input_tokens_seen": 1494220800, "step": 22800, "train_runtime": 14900.9562, "train_tokens_per_second": 100276.84 }, { "epoch": 0.229, "grad_norm": 0.16754469275474548, "learning_rate": 0.00026521510160400804, "loss": 0.3133, "num_input_tokens_seen": 1500774400, "step": 22900, "train_runtime": 14965.1238, "train_tokens_per_second": 100284.797 }, { "epoch": 0.23, "grad_norm": 0.12648451328277588, "learning_rate": 0.00026490972607185793, "loss": 0.311, "num_input_tokens_seen": 1507328000, "step": 23000, "train_runtime": 15034.861, "train_tokens_per_second": 100255.533 }, { "epoch": 0.231, "grad_norm": 0.12040221691131592, "learning_rate": 0.0002646031934000421, "loss": 0.3166, "num_input_tokens_seen": 1513881600, "step": 23100, "train_runtime": 15099.2676, "train_tokens_per_second": 100261.922 }, { "epoch": 0.232, "grad_norm": 0.12486282736063004, "learning_rate": 0.00026429550667534095, "loss": 0.3151, "num_input_tokens_seen": 1520435200, "step": 23200, "train_runtime": 15164.1184, "train_tokens_per_second": 100265.321 }, { "epoch": 0.233, "grad_norm": 0.18211719393730164, "learning_rate": 0.0002639866689961565, "loss": 0.3117, "num_input_tokens_seen": 1526988800, "step": 23300, "train_runtime": 15229.7058, "train_tokens_per_second": 100263.841 }, { "epoch": 0.234, "grad_norm": 0.13128802180290222, "learning_rate": 0.00026367668347248083, "loss": 0.3125, "num_input_tokens_seen": 1533542400, "step": 23400, "train_runtime": 15293.6404, "train_tokens_per_second": 100273.209 }, { "epoch": 0.235, "grad_norm": 0.11493753641843796, "learning_rate": 0.0002633655532258646, "loss": 0.317, "num_input_tokens_seen": 1540096000, "step": 23500, "train_runtime": 15365.113, "train_tokens_per_second": 100233.301 }, { "epoch": 0.236, "grad_norm": 0.15309779345989227, "learning_rate": 0.000263053281389386, "loss": 0.3136, "num_input_tokens_seen": 1546649600, "step": 23600, "train_runtime": 15428.6523, "train_tokens_per_second": 100245.282 }, { "epoch": 0.237, "grad_norm": 0.15829730033874512, "learning_rate": 0.0002627398711076189, "loss": 0.3098, "num_input_tokens_seen": 1553203200, "step": 23700, "train_runtime": 15493.1944, "train_tokens_per_second": 100250.675 }, { "epoch": 0.238, "grad_norm": 0.13252806663513184, "learning_rate": 0.0002624253255366014, "loss": 0.3096, "num_input_tokens_seen": 1559756800, "step": 23800, "train_runtime": 15556.5037, "train_tokens_per_second": 100263.969 }, { "epoch": 0.239, "grad_norm": 0.18889528512954712, "learning_rate": 0.0002621096478438039, "loss": 0.3146, "num_input_tokens_seen": 1566310400, "step": 23900, "train_runtime": 15621.7412, "train_tokens_per_second": 100264.777 }, { "epoch": 0.24, "grad_norm": 0.16285447776317596, "learning_rate": 0.00026179284120809727, "loss": 0.3168, "num_input_tokens_seen": 1572864000, "step": 24000, "train_runtime": 15687.4424, "train_tokens_per_second": 100262.615 }, { "epoch": 0.241, "grad_norm": 0.14852070808410645, "learning_rate": 0.0002614749088197208, "loss": 0.3115, "num_input_tokens_seen": 1579417600, "step": 24100, "train_runtime": 15752.1472, "train_tokens_per_second": 100266.813 }, { "epoch": 0.242, "grad_norm": 0.22735795378684998, "learning_rate": 0.00026115585388025015, "loss": 0.3099, "num_input_tokens_seen": 1585971200, "step": 24200, "train_runtime": 15823.0117, "train_tokens_per_second": 100231.943 }, { "epoch": 0.243, "grad_norm": 0.16086964309215546, "learning_rate": 0.00026083567960256493, "loss": 0.3107, "num_input_tokens_seen": 1592524800, "step": 24300, "train_runtime": 15889.3517, "train_tokens_per_second": 100225.914 }, { "epoch": 0.244, "grad_norm": 0.15085358917713165, "learning_rate": 0.00026051438921081667, "loss": 0.3112, "num_input_tokens_seen": 1599078400, "step": 24400, "train_runtime": 15954.2137, "train_tokens_per_second": 100229.22 }, { "epoch": 0.245, "grad_norm": 0.14889656007289886, "learning_rate": 0.00026019198594039595, "loss": 0.3147, "num_input_tokens_seen": 1605632000, "step": 24500, "train_runtime": 16020.1883, "train_tokens_per_second": 100225.539 }, { "epoch": 0.246, "grad_norm": 0.15055876970291138, "learning_rate": 0.00025986847303790026, "loss": 0.3125, "num_input_tokens_seen": 1612185600, "step": 24600, "train_runtime": 16084.1346, "train_tokens_per_second": 100234.525 }, { "epoch": 0.247, "grad_norm": 0.14507324993610382, "learning_rate": 0.00025954385376110076, "loss": 0.3115, "num_input_tokens_seen": 1618739200, "step": 24700, "train_runtime": 16148.9618, "train_tokens_per_second": 100237.973 }, { "epoch": 0.248, "grad_norm": 0.1229107677936554, "learning_rate": 0.00025921813137891005, "loss": 0.3147, "num_input_tokens_seen": 1625292800, "step": 24800, "train_runtime": 16214.7466, "train_tokens_per_second": 100235.473 }, { "epoch": 0.249, "grad_norm": 0.1423114389181137, "learning_rate": 0.000258891309171349, "loss": 0.3127, "num_input_tokens_seen": 1631846400, "step": 24900, "train_runtime": 16278.9968, "train_tokens_per_second": 100242.442 }, { "epoch": 0.25, "grad_norm": 0.15807275474071503, "learning_rate": 0.00025856339042951344, "loss": 0.3088, "num_input_tokens_seen": 1638400000, "step": 25000, "train_runtime": 16343.5944, "train_tokens_per_second": 100247.226 }, { "epoch": 0.251, "grad_norm": 0.15635885298252106, "learning_rate": 0.0002582343784555415, "loss": 0.3105, "num_input_tokens_seen": 1644953600, "step": 25100, "train_runtime": 16414.1861, "train_tokens_per_second": 100215.362 }, { "epoch": 0.252, "grad_norm": 0.13579483330249786, "learning_rate": 0.00025790427656258017, "loss": 0.3159, "num_input_tokens_seen": 1651507200, "step": 25200, "train_runtime": 16478.0373, "train_tokens_per_second": 100224.752 }, { "epoch": 0.253, "grad_norm": 0.14977572858333588, "learning_rate": 0.00025757308807475185, "loss": 0.3115, "num_input_tokens_seen": 1658060800, "step": 25300, "train_runtime": 16542.7006, "train_tokens_per_second": 100229.149 }, { "epoch": 0.254, "grad_norm": 0.1324361115694046, "learning_rate": 0.00025724081632712086, "loss": 0.3108, "num_input_tokens_seen": 1664614400, "step": 25400, "train_runtime": 16607.2591, "train_tokens_per_second": 100234.144 }, { "epoch": 0.255, "grad_norm": 0.12053392827510834, "learning_rate": 0.0002569074646656601, "loss": 0.3081, "num_input_tokens_seen": 1671168000, "step": 25500, "train_runtime": 16676.4765, "train_tokens_per_second": 100211.096 }, { "epoch": 0.256, "grad_norm": 0.16214688122272491, "learning_rate": 0.00025657303644721695, "loss": 0.3154, "num_input_tokens_seen": 1677721600, "step": 25600, "train_runtime": 16741.4269, "train_tokens_per_second": 100213.776 }, { "epoch": 0.257, "grad_norm": 0.13730435073375702, "learning_rate": 0.00025623753503948004, "loss": 0.3159, "num_input_tokens_seen": 1684275200, "step": 25700, "train_runtime": 16805.4849, "train_tokens_per_second": 100221.755 }, { "epoch": 0.258, "grad_norm": 0.16218283772468567, "learning_rate": 0.00025590096382094475, "loss": 0.3111, "num_input_tokens_seen": 1690828800, "step": 25800, "train_runtime": 16869.8548, "train_tokens_per_second": 100227.821 }, { "epoch": 0.259, "grad_norm": 0.15016646683216095, "learning_rate": 0.00025556332618087945, "loss": 0.3106, "num_input_tokens_seen": 1697382400, "step": 25900, "train_runtime": 16938.0105, "train_tokens_per_second": 100211.439 }, { "epoch": 0.26, "grad_norm": 0.1398506760597229, "learning_rate": 0.00025522462551929155, "loss": 0.313, "num_input_tokens_seen": 1703936000, "step": 26000, "train_runtime": 17003.6995, "train_tokens_per_second": 100209.722 }, { "epoch": 0.261, "grad_norm": 0.12380320578813553, "learning_rate": 0.00025488486524689283, "loss": 0.3133, "num_input_tokens_seen": 1710489600, "step": 26100, "train_runtime": 17069.3522, "train_tokens_per_second": 100208.232 }, { "epoch": 0.262, "grad_norm": 0.14536257088184357, "learning_rate": 0.00025454404878506555, "loss": 0.3115, "num_input_tokens_seen": 1717043200, "step": 26200, "train_runtime": 17132.7395, "train_tokens_per_second": 100220.003 }, { "epoch": 0.263, "grad_norm": 0.14442390203475952, "learning_rate": 0.0002542021795658276, "loss": 0.311, "num_input_tokens_seen": 1723596800, "step": 26300, "train_runtime": 17196.4745, "train_tokens_per_second": 100229.66 }, { "epoch": 0.264, "grad_norm": 0.12595972418785095, "learning_rate": 0.0002538592610317984, "loss": 0.3118, "num_input_tokens_seen": 1730150400, "step": 26400, "train_runtime": 17266.9358, "train_tokens_per_second": 100200.199 }, { "epoch": 0.265, "grad_norm": 0.1587669402360916, "learning_rate": 0.00025351529663616355, "loss": 0.3132, "num_input_tokens_seen": 1736704000, "step": 26500, "train_runtime": 17331.5833, "train_tokens_per_second": 100204.578 }, { "epoch": 0.266, "grad_norm": 0.1406719982624054, "learning_rate": 0.00025317028984264087, "loss": 0.3099, "num_input_tokens_seen": 1743257600, "step": 26600, "train_runtime": 17395.5945, "train_tokens_per_second": 100212.591 }, { "epoch": 0.267, "grad_norm": 0.1677832007408142, "learning_rate": 0.0002528242441254448, "loss": 0.309, "num_input_tokens_seen": 1749811200, "step": 26700, "train_runtime": 17459.1185, "train_tokens_per_second": 100223.342 }, { "epoch": 0.268, "grad_norm": 0.13640043139457703, "learning_rate": 0.000252477162969252, "loss": 0.3112, "num_input_tokens_seen": 1756364800, "step": 26800, "train_runtime": 17523.2088, "train_tokens_per_second": 100230.775 }, { "epoch": 0.269, "grad_norm": 0.12981313467025757, "learning_rate": 0.00025212904986916584, "loss": 0.3124, "num_input_tokens_seen": 1762918400, "step": 26900, "train_runtime": 17587.6922, "train_tokens_per_second": 100235.914 }, { "epoch": 0.27, "grad_norm": 0.14338868856430054, "learning_rate": 0.00025177990833068133, "loss": 0.3124, "num_input_tokens_seen": 1769472000, "step": 27000, "train_runtime": 17658.758, "train_tokens_per_second": 100203.649 }, { "epoch": 0.271, "grad_norm": 0.17518877983093262, "learning_rate": 0.0002514297418696499, "loss": 0.3076, "num_input_tokens_seen": 1776025600, "step": 27100, "train_runtime": 17723.3886, "train_tokens_per_second": 100208.016 }, { "epoch": 0.272, "grad_norm": 0.1369880735874176, "learning_rate": 0.0002510785540122439, "loss": 0.3114, "num_input_tokens_seen": 1782579200, "step": 27200, "train_runtime": 17786.611, "train_tokens_per_second": 100220.283 }, { "epoch": 0.273, "grad_norm": 0.15111377835273743, "learning_rate": 0.0002507263482949212, "loss": 0.3144, "num_input_tokens_seen": 1789132800, "step": 27300, "train_runtime": 17852.1418, "train_tokens_per_second": 100219.504 }, { "epoch": 0.274, "grad_norm": 0.140447199344635, "learning_rate": 0.0002503731282643894, "loss": 0.3103, "num_input_tokens_seen": 1795686400, "step": 27400, "train_runtime": 17917.1236, "train_tokens_per_second": 100221.801 }, { "epoch": 0.275, "grad_norm": 0.1373315006494522, "learning_rate": 0.0002500188974775704, "loss": 0.3095, "num_input_tokens_seen": 1802240000, "step": 27500, "train_runtime": 17981.4799, "train_tokens_per_second": 100227.568 }, { "epoch": 0.276, "grad_norm": 0.1453147530555725, "learning_rate": 0.00024966365950156416, "loss": 0.3085, "num_input_tokens_seen": 1808793600, "step": 27600, "train_runtime": 18052.109, "train_tokens_per_second": 100198.464 }, { "epoch": 0.277, "grad_norm": 0.19097484648227692, "learning_rate": 0.00024930741791361326, "loss": 0.3128, "num_input_tokens_seen": 1815347200, "step": 27700, "train_runtime": 18117.9773, "train_tokens_per_second": 100195.909 }, { "epoch": 0.278, "grad_norm": 0.2222718745470047, "learning_rate": 0.0002489501763010664, "loss": 0.3107, "num_input_tokens_seen": 1821900800, "step": 27800, "train_runtime": 18178.1946, "train_tokens_per_second": 100224.519 }, { "epoch": 0.279, "grad_norm": 0.16960225999355316, "learning_rate": 0.00024859193826134285, "loss": 0.3093, "num_input_tokens_seen": 1828454400, "step": 27900, "train_runtime": 18248.1866, "train_tokens_per_second": 100199.238 }, { "epoch": 0.28, "grad_norm": 0.15540289878845215, "learning_rate": 0.00024823270740189556, "loss": 0.3084, "num_input_tokens_seen": 1835008000, "step": 28000, "train_runtime": 18313.0722, "train_tokens_per_second": 100202.084 }, { "epoch": 0.281, "grad_norm": 0.1421203911304474, "learning_rate": 0.00024787248734017527, "loss": 0.3119, "num_input_tokens_seen": 1841561600, "step": 28100, "train_runtime": 18377.039, "train_tokens_per_second": 100209.919 }, { "epoch": 0.282, "grad_norm": 0.131204292178154, "learning_rate": 0.0002475112817035941, "loss": 0.3127, "num_input_tokens_seen": 1848115200, "step": 28200, "train_runtime": 18441.4656, "train_tokens_per_second": 100215.202 }, { "epoch": 0.283, "grad_norm": 0.1507508009672165, "learning_rate": 0.0002471490941294887, "loss": 0.3118, "num_input_tokens_seen": 1854668800, "step": 28300, "train_runtime": 18511.3095, "train_tokens_per_second": 100191.118 }, { "epoch": 0.284, "grad_norm": 0.12522923946380615, "learning_rate": 0.000246785928265084, "loss": 0.3104, "num_input_tokens_seen": 1861222400, "step": 28400, "train_runtime": 18574.4697, "train_tokens_per_second": 100203.259 }, { "epoch": 0.285, "grad_norm": 0.2087126076221466, "learning_rate": 0.0002464217877674562, "loss": 0.3132, "num_input_tokens_seen": 1867776000, "step": 28500, "train_runtime": 18638.8332, "train_tokens_per_second": 100208.848 }, { "epoch": 0.286, "grad_norm": 0.1495303064584732, "learning_rate": 0.0002460566763034961, "loss": 0.3159, "num_input_tokens_seen": 1874329600, "step": 28600, "train_runtime": 18703.8924, "train_tokens_per_second": 100210.671 }, { "epoch": 0.287, "grad_norm": 0.14563380181789398, "learning_rate": 0.00024569059754987196, "loss": 0.3116, "num_input_tokens_seen": 1880883200, "step": 28700, "train_runtime": 18774.7813, "train_tokens_per_second": 100181.364 }, { "epoch": 0.288, "grad_norm": 0.12803615629673004, "learning_rate": 0.00024532355519299296, "loss": 0.3099, "num_input_tokens_seen": 1887436800, "step": 28800, "train_runtime": 18838.435, "train_tokens_per_second": 100190.743 }, { "epoch": 0.289, "grad_norm": 0.5618897676467896, "learning_rate": 0.0002449555529289714, "loss": 0.3129, "num_input_tokens_seen": 1893990400, "step": 28900, "train_runtime": 18901.8999, "train_tokens_per_second": 100201.06 }, { "epoch": 0.29, "grad_norm": 0.15488959848880768, "learning_rate": 0.0002445865944635861, "loss": 0.3155, "num_input_tokens_seen": 1900544000, "step": 29000, "train_runtime": 18967.9894, "train_tokens_per_second": 100197.441 }, { "epoch": 0.291, "grad_norm": 0.13676992058753967, "learning_rate": 0.0002442166835122446, "loss": 0.3101, "num_input_tokens_seen": 1907097600, "step": 29100, "train_runtime": 19031.1664, "train_tokens_per_second": 100209.181 }, { "epoch": 0.292, "grad_norm": 0.11402736604213715, "learning_rate": 0.00024384582379994614, "loss": 0.3094, "num_input_tokens_seen": 1913651200, "step": 29200, "train_runtime": 19096.1775, "train_tokens_per_second": 100211.218 }, { "epoch": 0.293, "grad_norm": 0.1358448714017868, "learning_rate": 0.00024347401906124388, "loss": 0.309, "num_input_tokens_seen": 1920204800, "step": 29300, "train_runtime": 19165.3098, "train_tokens_per_second": 100191.691 }, { "epoch": 0.294, "grad_norm": 0.14608891308307648, "learning_rate": 0.0002431012730402075, "loss": 0.3119, "num_input_tokens_seen": 1926758400, "step": 29400, "train_runtime": 19230.3069, "train_tokens_per_second": 100193.845 }, { "epoch": 0.295, "grad_norm": 0.1501711755990982, "learning_rate": 0.00024272758949038517, "loss": 0.3091, "num_input_tokens_seen": 1933312000, "step": 29500, "train_runtime": 19294.7627, "train_tokens_per_second": 100198.796 }, { "epoch": 0.296, "grad_norm": 0.1614496409893036, "learning_rate": 0.00024235297217476616, "loss": 0.3104, "num_input_tokens_seen": 1939865600, "step": 29600, "train_runtime": 19364.7415, "train_tokens_per_second": 100175.135 }, { "epoch": 0.297, "grad_norm": 0.11902807652950287, "learning_rate": 0.00024197742486574268, "loss": 0.3126, "num_input_tokens_seen": 1946419200, "step": 29700, "train_runtime": 19429.1038, "train_tokens_per_second": 100180.596 }, { "epoch": 0.298, "grad_norm": 0.12998123466968536, "learning_rate": 0.0002416009513450719, "loss": 0.3102, "num_input_tokens_seen": 1952972800, "step": 29800, "train_runtime": 19494.2244, "train_tokens_per_second": 100182.124 }, { "epoch": 0.299, "grad_norm": 0.2079559862613678, "learning_rate": 0.00024122355540383806, "loss": 0.311, "num_input_tokens_seen": 1959526400, "step": 29900, "train_runtime": 19559.2072, "train_tokens_per_second": 100184.347 }, { "epoch": 0.3, "grad_norm": 0.15128397941589355, "learning_rate": 0.00024084524084241405, "loss": 0.3076, "num_input_tokens_seen": 1966080000, "step": 30000, "train_runtime": 19623.3669, "train_tokens_per_second": 100190.758 }, { "epoch": 0.301, "grad_norm": 0.13512304425239563, "learning_rate": 0.00024046601147042332, "loss": 0.3119, "num_input_tokens_seen": 1972633600, "step": 30100, "train_runtime": 19688.91, "train_tokens_per_second": 100190.086 }, { "epoch": 0.302, "grad_norm": 0.12716713547706604, "learning_rate": 0.0002400858711067015, "loss": 0.3093, "num_input_tokens_seen": 1979187200, "step": 30200, "train_runtime": 19753.5863, "train_tokens_per_second": 100193.816 }, { "epoch": 0.303, "grad_norm": 0.1301889717578888, "learning_rate": 0.00023970482357925772, "loss": 0.31, "num_input_tokens_seen": 1985740800, "step": 30300, "train_runtime": 19823.6081, "train_tokens_per_second": 100170.503 }, { "epoch": 0.304, "grad_norm": 0.13871292769908905, "learning_rate": 0.00023932287272523646, "loss": 0.3084, "num_input_tokens_seen": 1992294400, "step": 30400, "train_runtime": 19887.7656, "train_tokens_per_second": 100176.885 }, { "epoch": 0.305, "grad_norm": 0.12449346482753754, "learning_rate": 0.00023894002239087847, "loss": 0.3276, "num_input_tokens_seen": 1998848000, "step": 30500, "train_runtime": 19952.5714, "train_tokens_per_second": 100179.97 }, { "epoch": 0.306, "grad_norm": 0.1523977369070053, "learning_rate": 0.0002385562764314825, "loss": 0.3097, "num_input_tokens_seen": 2005401600, "step": 30600, "train_runtime": 20017.8352, "train_tokens_per_second": 100180.743 }, { "epoch": 0.307, "grad_norm": 0.1439458280801773, "learning_rate": 0.00023817163871136596, "loss": 0.3048, "num_input_tokens_seen": 2011955200, "step": 30700, "train_runtime": 20081.8889, "train_tokens_per_second": 100187.548 }, { "epoch": 0.308, "grad_norm": 0.12756380438804626, "learning_rate": 0.00023778611310382652, "loss": 0.3075, "num_input_tokens_seen": 2018508800, "step": 30800, "train_runtime": 20145.6107, "train_tokens_per_second": 100195.96 }, { "epoch": 0.309, "grad_norm": 0.14607320725917816, "learning_rate": 0.0002373997034911027, "loss": 0.3139, "num_input_tokens_seen": 2025062400, "step": 30900, "train_runtime": 20210.9796, "train_tokens_per_second": 100196.153 }, { "epoch": 0.31, "grad_norm": 0.12456675618886948, "learning_rate": 0.00023701241376433506, "loss": 0.3089, "num_input_tokens_seen": 2031616000, "step": 31000, "train_runtime": 20281.0675, "train_tokens_per_second": 100173.031 }, { "epoch": 0.311, "grad_norm": 0.13834626972675323, "learning_rate": 0.0002366242478235268, "loss": 0.3066, "num_input_tokens_seen": 2038169600, "step": 31100, "train_runtime": 20346.0263, "train_tokens_per_second": 100175.315 }, { "epoch": 0.312, "grad_norm": 0.1534184068441391, "learning_rate": 0.00023623520957750471, "loss": 0.3082, "num_input_tokens_seen": 2044723200, "step": 31200, "train_runtime": 20409.76, "train_tokens_per_second": 100183.598 }, { "epoch": 0.313, "grad_norm": 0.12966671586036682, "learning_rate": 0.00023584530294387953, "loss": 0.3126, "num_input_tokens_seen": 2051276800, "step": 31300, "train_runtime": 20475.6348, "train_tokens_per_second": 100181.353 }, { "epoch": 0.314, "grad_norm": 0.14474999904632568, "learning_rate": 0.00023545453184900682, "loss": 0.3091, "num_input_tokens_seen": 2057830400, "step": 31400, "train_runtime": 20539.196, "train_tokens_per_second": 100190.407 }, { "epoch": 0.315, "grad_norm": 0.13208946585655212, "learning_rate": 0.00023506290022794706, "loss": 0.3095, "num_input_tokens_seen": 2064384000, "step": 31500, "train_runtime": 20604.221, "train_tokens_per_second": 100192.286 }, { "epoch": 0.316, "grad_norm": 0.15090374648571014, "learning_rate": 0.00023467041202442643, "loss": 0.3073, "num_input_tokens_seen": 2070937600, "step": 31600, "train_runtime": 20674.5759, "train_tokens_per_second": 100168.323 }, { "epoch": 0.317, "grad_norm": 0.18638543784618378, "learning_rate": 0.00023427707119079669, "loss": 0.312, "num_input_tokens_seen": 2077491200, "step": 31700, "train_runtime": 20738.8671, "train_tokens_per_second": 100173.804 }, { "epoch": 0.318, "grad_norm": 0.1385478377342224, "learning_rate": 0.0002338828816879957, "loss": 0.3095, "num_input_tokens_seen": 2084044800, "step": 31800, "train_runtime": 20802.7906, "train_tokens_per_second": 100181.021 }, { "epoch": 0.319, "grad_norm": 0.15265443921089172, "learning_rate": 0.00023348784748550744, "loss": 0.3103, "num_input_tokens_seen": 2090598400, "step": 31900, "train_runtime": 20868.0311, "train_tokens_per_second": 100181.871 }, { "epoch": 0.32, "grad_norm": 0.15918248891830444, "learning_rate": 0.00023309197256132184, "loss": 0.3102, "num_input_tokens_seen": 2097152000, "step": 32000, "train_runtime": 20937.8931, "train_tokens_per_second": 100160.603 }, { "epoch": 0.321, "grad_norm": 0.14801020920276642, "learning_rate": 0.00023269526090189505, "loss": 0.3147, "num_input_tokens_seen": 2103705600, "step": 32100, "train_runtime": 21002.9142, "train_tokens_per_second": 100162.557 }, { "epoch": 0.322, "grad_norm": 0.18616679310798645, "learning_rate": 0.00023229771650210907, "loss": 0.3099, "num_input_tokens_seen": 2110259200, "step": 32200, "train_runtime": 21067.872, "train_tokens_per_second": 100164.801 }, { "epoch": 0.323, "grad_norm": 0.13931268453598022, "learning_rate": 0.00023189934336523163, "loss": 0.3115, "num_input_tokens_seen": 2116812800, "step": 32300, "train_runtime": 21131.2256, "train_tokens_per_second": 100174.634 }, { "epoch": 0.324, "grad_norm": 0.1734631061553955, "learning_rate": 0.00023150014550287574, "loss": 0.3112, "num_input_tokens_seen": 2123366400, "step": 32400, "train_runtime": 21201.6285, "train_tokens_per_second": 100151.099 }, { "epoch": 0.325, "grad_norm": 0.13876596093177795, "learning_rate": 0.00023110012693495943, "loss": 0.31, "num_input_tokens_seen": 2129920000, "step": 32500, "train_runtime": 21265.8205, "train_tokens_per_second": 100156.963 }, { "epoch": 0.326, "grad_norm": 0.20441171526908875, "learning_rate": 0.00023069929168966527, "loss": 0.3095, "num_input_tokens_seen": 2136473600, "step": 32600, "train_runtime": 21329.6315, "train_tokens_per_second": 100164.581 }, { "epoch": 0.327, "grad_norm": 0.12022672593593597, "learning_rate": 0.0002302976438033997, "loss": 0.3089, "num_input_tokens_seen": 2143027200, "step": 32700, "train_runtime": 21394.0086, "train_tokens_per_second": 100169.502 }, { "epoch": 0.328, "grad_norm": 0.23158074915409088, "learning_rate": 0.0002298951873207525, "loss": 0.3121, "num_input_tokens_seen": 2149580800, "step": 32800, "train_runtime": 21459.8938, "train_tokens_per_second": 100167.355 }, { "epoch": 0.329, "grad_norm": 0.11978685855865479, "learning_rate": 0.00022949192629445606, "loss": 0.308, "num_input_tokens_seen": 2156134400, "step": 32900, "train_runtime": 21524.2825, "train_tokens_per_second": 100172.185 }, { "epoch": 0.33, "grad_norm": 0.16882842779159546, "learning_rate": 0.0002290878647853443, "loss": 0.3076, "num_input_tokens_seen": 2162688000, "step": 33000, "train_runtime": 21595.0222, "train_tokens_per_second": 100147.524 }, { "epoch": 0.331, "grad_norm": 0.1368299126625061, "learning_rate": 0.00022868300686231224, "loss": 0.3078, "num_input_tokens_seen": 2169241600, "step": 33100, "train_runtime": 21659.0361, "train_tokens_per_second": 100154.115 }, { "epoch": 0.332, "grad_norm": 0.13301041722297668, "learning_rate": 0.00022827735660227457, "loss": 0.3103, "num_input_tokens_seen": 2175795200, "step": 33200, "train_runtime": 21723.8934, "train_tokens_per_second": 100156.779 }, { "epoch": 0.333, "grad_norm": 0.13545189797878265, "learning_rate": 0.000227870918090125, "loss": 0.3068, "num_input_tokens_seen": 2182348800, "step": 33300, "train_runtime": 21788.4359, "train_tokens_per_second": 100160.875 }, { "epoch": 0.334, "grad_norm": 0.2138141542673111, "learning_rate": 0.00022746369541869476, "loss": 0.3059, "num_input_tokens_seen": 2188902400, "step": 33400, "train_runtime": 21853.4857, "train_tokens_per_second": 100162.621 }, { "epoch": 0.335, "grad_norm": 0.1255991905927658, "learning_rate": 0.00022705569268871163, "loss": 0.3099, "num_input_tokens_seen": 2195456000, "step": 33500, "train_runtime": 21918.1728, "train_tokens_per_second": 100166.014 }, { "epoch": 0.336, "grad_norm": 0.1330287754535675, "learning_rate": 0.00022664691400875865, "loss": 0.3093, "num_input_tokens_seen": 2202009600, "step": 33600, "train_runtime": 21987.6743, "train_tokens_per_second": 100147.454 }, { "epoch": 0.337, "grad_norm": 0.1321260631084442, "learning_rate": 0.00022623736349523254, "loss": 0.3109, "num_input_tokens_seen": 2208563200, "step": 33700, "train_runtime": 22052.5483, "train_tokens_per_second": 100150.022 }, { "epoch": 0.338, "grad_norm": 0.13865865767002106, "learning_rate": 0.00022582704527230238, "loss": 0.3068, "num_input_tokens_seen": 2215116800, "step": 33800, "train_runtime": 22117.0958, "train_tokens_per_second": 100154.054 }, { "epoch": 0.339, "grad_norm": 0.13597998023033142, "learning_rate": 0.0002254159634718682, "loss": 0.3061, "num_input_tokens_seen": 2221670400, "step": 33900, "train_runtime": 22180.0605, "train_tokens_per_second": 100165.209 }, { "epoch": 0.34, "grad_norm": 0.14176584780216217, "learning_rate": 0.00022500412223351915, "loss": 0.3114, "num_input_tokens_seen": 2228224000, "step": 34000, "train_runtime": 22251.2759, "train_tokens_per_second": 100139.157 }, { "epoch": 0.341, "grad_norm": 0.13006241619586945, "learning_rate": 0.0002245915257044919, "loss": 0.3071, "num_input_tokens_seen": 2234777600, "step": 34100, "train_runtime": 22315.7056, "train_tokens_per_second": 100143.712 }, { "epoch": 0.342, "grad_norm": 0.186634823679924, "learning_rate": 0.00022417817803962892, "loss": 0.3032, "num_input_tokens_seen": 2241331200, "step": 34200, "train_runtime": 22380.1064, "train_tokens_per_second": 100148.371 }, { "epoch": 0.343, "grad_norm": 0.1767393946647644, "learning_rate": 0.0002237640834013366, "loss": 0.3085, "num_input_tokens_seen": 2247884800, "step": 34300, "train_runtime": 22444.6012, "train_tokens_per_second": 100152.584 }, { "epoch": 0.344, "grad_norm": 0.15075454115867615, "learning_rate": 0.0002233492459595434, "loss": 0.3099, "num_input_tokens_seen": 2254438400, "step": 34400, "train_runtime": 22509.6493, "train_tokens_per_second": 100154.31 }, { "epoch": 0.345, "grad_norm": 0.15754783153533936, "learning_rate": 0.00022293366989165772, "loss": 0.307, "num_input_tokens_seen": 2260992000, "step": 34500, "train_runtime": 22579.4848, "train_tokens_per_second": 100134.791 }, { "epoch": 0.346, "grad_norm": 0.13372038304805756, "learning_rate": 0.00022251735938252587, "loss": 0.3066, "num_input_tokens_seen": 2267545600, "step": 34600, "train_runtime": 22643.953, "train_tokens_per_second": 100139.123 }, { "epoch": 0.347, "grad_norm": 0.17753738164901733, "learning_rate": 0.0002221003186243902, "loss": 0.3087, "num_input_tokens_seen": 2274099200, "step": 34700, "train_runtime": 22708.6869, "train_tokens_per_second": 100142.259 }, { "epoch": 0.348, "grad_norm": 0.1375788450241089, "learning_rate": 0.00022168255181684643, "loss": 0.3064, "num_input_tokens_seen": 2280652800, "step": 34800, "train_runtime": 22774.2018, "train_tokens_per_second": 100141.942 }, { "epoch": 0.349, "grad_norm": 0.14929898083209991, "learning_rate": 0.00022126406316680172, "loss": 0.3108, "num_input_tokens_seen": 2287206400, "step": 34900, "train_runtime": 22839.776, "train_tokens_per_second": 100141.367 }, { "epoch": 0.35, "grad_norm": 0.15789327025413513, "learning_rate": 0.00022084485688843208, "loss": 0.3082, "num_input_tokens_seen": 2293760000, "step": 35000, "train_runtime": 22904.3853, "train_tokens_per_second": 100145.015 }, { "epoch": 0.351, "grad_norm": 0.1339723765850067, "learning_rate": 0.00022042493720314003, "loss": 0.3127, "num_input_tokens_seen": 2300313600, "step": 35100, "train_runtime": 22968.8594, "train_tokens_per_second": 100149.231 }, { "epoch": 0.352, "grad_norm": 0.14159700274467468, "learning_rate": 0.00022000430833951228, "loss": 0.3096, "num_input_tokens_seen": 2306867200, "step": 35200, "train_runtime": 23033.0283, "train_tokens_per_second": 100154.751 }, { "epoch": 0.353, "grad_norm": 0.17289403080940247, "learning_rate": 0.00021958297453327673, "loss": 0.3058, "num_input_tokens_seen": 2313420800, "step": 35300, "train_runtime": 23103.5037, "train_tokens_per_second": 100132.899 }, { "epoch": 0.354, "grad_norm": 0.1353076845407486, "learning_rate": 0.00021916094002726012, "loss": 0.3048, "num_input_tokens_seen": 2319974400, "step": 35400, "train_runtime": 23166.8292, "train_tokens_per_second": 100142.077 }, { "epoch": 0.355, "grad_norm": 0.12303294241428375, "learning_rate": 0.00021873820907134534, "loss": 0.3102, "num_input_tokens_seen": 2326528000, "step": 35500, "train_runtime": 23232.6655, "train_tokens_per_second": 100140.382 }, { "epoch": 0.356, "grad_norm": 0.14765286445617676, "learning_rate": 0.0002183147859224283, "loss": 0.3106, "num_input_tokens_seen": 2333081600, "step": 35600, "train_runtime": 23296.4196, "train_tokens_per_second": 100147.647 }, { "epoch": 0.357, "grad_norm": 0.13833215832710266, "learning_rate": 0.00021789067484437544, "loss": 0.3055, "num_input_tokens_seen": 2339635200, "step": 35700, "train_runtime": 23361.5704, "train_tokens_per_second": 100148.884 }, { "epoch": 0.358, "grad_norm": 0.13157132267951965, "learning_rate": 0.00021746588010798068, "loss": 0.3081, "num_input_tokens_seen": 2346188800, "step": 35800, "train_runtime": 23430.7927, "train_tokens_per_second": 100132.711 }, { "epoch": 0.359, "grad_norm": 0.12913836538791656, "learning_rate": 0.00021704040599092216, "loss": 0.3094, "num_input_tokens_seen": 2352742400, "step": 35900, "train_runtime": 23495.4052, "train_tokens_per_second": 100136.277 }, { "epoch": 0.36, "grad_norm": 0.13528013229370117, "learning_rate": 0.00021661425677771965, "loss": 0.3061, "num_input_tokens_seen": 2359296000, "step": 36000, "train_runtime": 23559.8424, "train_tokens_per_second": 100140.568 }, { "epoch": 0.361, "grad_norm": 0.15519119799137115, "learning_rate": 0.00021618743675969095, "loss": 0.3065, "num_input_tokens_seen": 2365849600, "step": 36100, "train_runtime": 23624.7603, "train_tokens_per_second": 100142.798 }, { "epoch": 0.362, "grad_norm": 0.14744772017002106, "learning_rate": 0.0002157599502349089, "loss": 0.3068, "num_input_tokens_seen": 2372403200, "step": 36200, "train_runtime": 23688.8845, "train_tokens_per_second": 100148.371 }, { "epoch": 0.363, "grad_norm": 0.13838911056518555, "learning_rate": 0.00021533180150815802, "loss": 0.3097, "num_input_tokens_seen": 2378956800, "step": 36300, "train_runtime": 23759.9908, "train_tokens_per_second": 100124.483 }, { "epoch": 0.364, "grad_norm": 0.12536117434501648, "learning_rate": 0.00021490299489089132, "loss": 0.3067, "num_input_tokens_seen": 2385510400, "step": 36400, "train_runtime": 23823.7123, "train_tokens_per_second": 100131.767 }, { "epoch": 0.365, "grad_norm": 0.14205192029476166, "learning_rate": 0.00021447353470118656, "loss": 0.3049, "num_input_tokens_seen": 2392064000, "step": 36500, "train_runtime": 23887.5453, "train_tokens_per_second": 100138.544 }, { "epoch": 0.366, "grad_norm": 0.11950815469026566, "learning_rate": 0.00021404342526370326, "loss": 0.3072, "num_input_tokens_seen": 2398617600, "step": 36600, "train_runtime": 23951.3108, "train_tokens_per_second": 100145.567 }, { "epoch": 0.367, "grad_norm": 0.1286599189043045, "learning_rate": 0.00021361267090963846, "loss": 0.3096, "num_input_tokens_seen": 2405171200, "step": 36700, "train_runtime": 24016.5354, "train_tokens_per_second": 100146.468 }, { "epoch": 0.368, "grad_norm": 0.12663663923740387, "learning_rate": 0.0002131812759766839, "loss": 0.3054, "num_input_tokens_seen": 2411724800, "step": 36800, "train_runtime": 24085.8974, "train_tokens_per_second": 100130.162 }, { "epoch": 0.369, "grad_norm": 0.16495896875858307, "learning_rate": 0.00021274924480898169, "loss": 0.3037, "num_input_tokens_seen": 2418278400, "step": 36900, "train_runtime": 24149.4634, "train_tokens_per_second": 100137.977 }, { "epoch": 0.37, "grad_norm": 0.13351881504058838, "learning_rate": 0.00021231658175708087, "loss": 0.309, "num_input_tokens_seen": 2424832000, "step": 37000, "train_runtime": 24214.3635, "train_tokens_per_second": 100140.233 }, { "epoch": 0.371, "grad_norm": 0.13137440383434296, "learning_rate": 0.00021188329117789357, "loss": 0.3061, "num_input_tokens_seen": 2431385600, "step": 37100, "train_runtime": 24284.8537, "train_tokens_per_second": 100119.426 }, { "epoch": 0.372, "grad_norm": 0.17069390416145325, "learning_rate": 0.0002114493774346512, "loss": 0.3075, "num_input_tokens_seen": 2437939200, "step": 37200, "train_runtime": 24349.7441, "train_tokens_per_second": 100121.759 }, { "epoch": 0.373, "grad_norm": 0.13554754853248596, "learning_rate": 0.00021101484489686025, "loss": 0.3056, "num_input_tokens_seen": 2444492800, "step": 37300, "train_runtime": 24413.4106, "train_tokens_per_second": 100129.099 }, { "epoch": 0.374, "grad_norm": 0.24161159992218018, "learning_rate": 0.00021057969794025866, "loss": 0.3084, "num_input_tokens_seen": 2451046400, "step": 37400, "train_runtime": 24479.2787, "train_tokens_per_second": 100127.395 }, { "epoch": 0.375, "grad_norm": 0.11480960994958878, "learning_rate": 0.00021014394094677128, "loss": 0.3065, "num_input_tokens_seen": 2457600000, "step": 37500, "train_runtime": 24543.1085, "train_tokens_per_second": 100134.015 }, { "epoch": 0.376, "grad_norm": 0.1333978921175003, "learning_rate": 0.00020970757830446633, "loss": 0.3047, "num_input_tokens_seen": 2464153600, "step": 37600, "train_runtime": 24612.4036, "train_tokens_per_second": 100118.365 }, { "epoch": 0.377, "grad_norm": 0.1306515485048294, "learning_rate": 0.00020927061440751072, "loss": 0.3039, "num_input_tokens_seen": 2470707200, "step": 37700, "train_runtime": 24676.7406, "train_tokens_per_second": 100122.915 }, { "epoch": 0.378, "grad_norm": 0.19177651405334473, "learning_rate": 0.00020883305365612602, "loss": 0.3091, "num_input_tokens_seen": 2477260800, "step": 37800, "train_runtime": 24742.4612, "train_tokens_per_second": 100121.842 }, { "epoch": 0.379, "grad_norm": 0.14794479310512543, "learning_rate": 0.00020839490045654425, "loss": 0.3103, "num_input_tokens_seen": 2483814400, "step": 37900, "train_runtime": 24807.833, "train_tokens_per_second": 100122.183 }, { "epoch": 0.38, "grad_norm": 0.1391579508781433, "learning_rate": 0.00020795615922096313, "loss": 0.305, "num_input_tokens_seen": 2490368000, "step": 38000, "train_runtime": 24871.0815, "train_tokens_per_second": 100131.07 }, { "epoch": 0.381, "grad_norm": 0.14466038346290588, "learning_rate": 0.00020751683436750207, "loss": 0.3066, "num_input_tokens_seen": 2496921600, "step": 38100, "train_runtime": 24941.5584, "train_tokens_per_second": 100110.89 }, { "epoch": 0.382, "grad_norm": 0.14706650376319885, "learning_rate": 0.00020707693032015752, "loss": 0.3131, "num_input_tokens_seen": 2503475200, "step": 38200, "train_runtime": 25006.658, "train_tokens_per_second": 100112.346 }, { "epoch": 0.383, "grad_norm": 0.1455349326133728, "learning_rate": 0.00020663645150875834, "loss": 0.3058, "num_input_tokens_seen": 2510028800, "step": 38300, "train_runtime": 25070.3473, "train_tokens_per_second": 100119.427 }, { "epoch": 0.384, "grad_norm": 0.13858123123645782, "learning_rate": 0.00020619540236892125, "loss": 0.3066, "num_input_tokens_seen": 2516582400, "step": 38400, "train_runtime": 25135.6982, "train_tokens_per_second": 100119.853 }, { "epoch": 0.385, "grad_norm": 0.17408473789691925, "learning_rate": 0.00020575378734200616, "loss": 0.3068, "num_input_tokens_seen": 2523136000, "step": 38500, "train_runtime": 25206.1351, "train_tokens_per_second": 100100.075 }, { "epoch": 0.386, "grad_norm": 0.12729153037071228, "learning_rate": 0.0002053116108750715, "loss": 0.3062, "num_input_tokens_seen": 2529689600, "step": 38600, "train_runtime": 25270.823, "train_tokens_per_second": 100103.174 }, { "epoch": 0.387, "grad_norm": 0.15452224016189575, "learning_rate": 0.0002048688774208294, "loss": 0.3029, "num_input_tokens_seen": 2536243200, "step": 38700, "train_runtime": 25334.6018, "train_tokens_per_second": 100109.851 }, { "epoch": 0.388, "grad_norm": 0.11749983578920364, "learning_rate": 0.0002044255914376009, "loss": 0.3055, "num_input_tokens_seen": 2542796800, "step": 38800, "train_runtime": 25398.9456, "train_tokens_per_second": 100114.266 }, { "epoch": 0.389, "grad_norm": 0.12558670341968536, "learning_rate": 0.00020398175738927082, "loss": 0.307, "num_input_tokens_seen": 2549350400, "step": 38900, "train_runtime": 25469.3443, "train_tokens_per_second": 100094.858 }, { "epoch": 0.39, "grad_norm": 0.11652723699808121, "learning_rate": 0.00020353737974524312, "loss": 0.3059, "num_input_tokens_seen": 2555904000, "step": 39000, "train_runtime": 25534.1962, "train_tokens_per_second": 100097.296 }, { "epoch": 0.391, "grad_norm": 0.14530417323112488, "learning_rate": 0.00020309246298039584, "loss": 0.3043, "num_input_tokens_seen": 2562457600, "step": 39100, "train_runtime": 25597.7668, "train_tokens_per_second": 100104.733 }, { "epoch": 0.392, "grad_norm": 0.2145591825246811, "learning_rate": 0.0002026470115750357, "loss": 0.3097, "num_input_tokens_seen": 2569011200, "step": 39200, "train_runtime": 25662.2383, "train_tokens_per_second": 100108.618 }, { "epoch": 0.393, "grad_norm": 0.13407446444034576, "learning_rate": 0.0002022010300148535, "loss": 0.3072, "num_input_tokens_seen": 2575564800, "step": 39300, "train_runtime": 25726.7635, "train_tokens_per_second": 100112.274 }, { "epoch": 0.394, "grad_norm": 0.20070548355579376, "learning_rate": 0.0002017545227908786, "loss": 0.3042, "num_input_tokens_seen": 2582118400, "step": 39400, "train_runtime": 25798.3829, "train_tokens_per_second": 100088.382 }, { "epoch": 0.395, "grad_norm": 0.12969562411308289, "learning_rate": 0.00020130749439943376, "loss": 0.3025, "num_input_tokens_seen": 2588672000, "step": 39500, "train_runtime": 25861.9837, "train_tokens_per_second": 100095.647 }, { "epoch": 0.396, "grad_norm": 0.22430787980556488, "learning_rate": 0.00020085994934208998, "loss": 0.3075, "num_input_tokens_seen": 2595225600, "step": 39600, "train_runtime": 25927.1388, "train_tokens_per_second": 100096.876 }, { "epoch": 0.397, "grad_norm": 0.1543964445590973, "learning_rate": 0.00020041189212562094, "loss": 0.3061, "num_input_tokens_seen": 2601779200, "step": 39700, "train_runtime": 25990.8084, "train_tokens_per_second": 100103.82 }, { "epoch": 0.398, "grad_norm": 0.17474599182605743, "learning_rate": 0.0001999633272619579, "loss": 0.3026, "num_input_tokens_seen": 2608332800, "step": 39800, "train_runtime": 26055.1661, "train_tokens_per_second": 100108.086 }, { "epoch": 0.399, "grad_norm": 0.12200487405061722, "learning_rate": 0.00019951425926814404, "loss": 0.3051, "num_input_tokens_seen": 2614886400, "step": 39900, "train_runtime": 26125.5167, "train_tokens_per_second": 100089.366 }, { "epoch": 0.4, "grad_norm": 0.12909364700317383, "learning_rate": 0.00019906469266628904, "loss": 0.3083, "num_input_tokens_seen": 2621440000, "step": 40000, "train_runtime": 26189.9855, "train_tokens_per_second": 100093.221 }, { "epoch": 0.401, "grad_norm": 0.14507311582565308, "learning_rate": 0.0001986146319835236, "loss": 0.3063, "num_input_tokens_seen": 2627993600, "step": 40100, "train_runtime": 26254.1189, "train_tokens_per_second": 100098.335 }, { "epoch": 0.402, "grad_norm": 0.15015749633312225, "learning_rate": 0.00019816408175195383, "loss": 0.3024, "num_input_tokens_seen": 2634547200, "step": 40200, "train_runtime": 26317.4656, "train_tokens_per_second": 100106.417 }, { "epoch": 0.403, "grad_norm": 0.1793050467967987, "learning_rate": 0.0001977130465086155, "loss": 0.3058, "num_input_tokens_seen": 2641100800, "step": 40300, "train_runtime": 26387.6285, "train_tokens_per_second": 100088.6 }, { "epoch": 0.404, "grad_norm": 0.13494957983493805, "learning_rate": 0.0001972615307954286, "loss": 0.3058, "num_input_tokens_seen": 2647654400, "step": 40400, "train_runtime": 26452.3646, "train_tokens_per_second": 100091.407 }, { "epoch": 0.405, "grad_norm": 0.15225248038768768, "learning_rate": 0.00019680953915915124, "loss": 0.3032, "num_input_tokens_seen": 2654208000, "step": 40500, "train_runtime": 26516.6796, "train_tokens_per_second": 100095.79 }, { "epoch": 0.406, "grad_norm": 0.15482735633850098, "learning_rate": 0.00019635707615133427, "loss": 0.3061, "num_input_tokens_seen": 2660761600, "step": 40600, "train_runtime": 26585.3848, "train_tokens_per_second": 100083.622 }, { "epoch": 0.407, "grad_norm": 0.15725013613700867, "learning_rate": 0.00019590414632827513, "loss": 0.3101, "num_input_tokens_seen": 2667315200, "step": 40700, "train_runtime": 26649.9092, "train_tokens_per_second": 100087.215 }, { "epoch": 0.408, "grad_norm": 0.16835036873817444, "learning_rate": 0.00019545075425097204, "loss": 0.3049, "num_input_tokens_seen": 2673868800, "step": 40800, "train_runtime": 26714.9814, "train_tokens_per_second": 100088.739 }, { "epoch": 0.409, "grad_norm": 0.167361319065094, "learning_rate": 0.00019499690448507827, "loss": 0.3027, "num_input_tokens_seen": 2680422400, "step": 40900, "train_runtime": 26779.2716, "train_tokens_per_second": 100093.178 }, { "epoch": 0.41, "grad_norm": 0.1781291663646698, "learning_rate": 0.00019454260160085588, "loss": 0.3005, "num_input_tokens_seen": 2686976000, "step": 41000, "train_runtime": 26843.9197, "train_tokens_per_second": 100096.261 }, { "epoch": 0.411, "grad_norm": 0.1289975345134735, "learning_rate": 0.0001940878501731299, "loss": 0.3085, "num_input_tokens_seen": 2693529600, "step": 41100, "train_runtime": 26914.2047, "train_tokens_per_second": 100078.365 }, { "epoch": 0.412, "grad_norm": 0.12804220616817474, "learning_rate": 0.00019363265478124214, "loss": 0.3062, "num_input_tokens_seen": 2700083200, "step": 41200, "train_runtime": 26979.3069, "train_tokens_per_second": 100079.784 }, { "epoch": 0.413, "grad_norm": 0.14838483929634094, "learning_rate": 0.00019317702000900516, "loss": 0.3065, "num_input_tokens_seen": 2706636800, "step": 41300, "train_runtime": 27043.7101, "train_tokens_per_second": 100083.783 }, { "epoch": 0.414, "grad_norm": 0.3049434423446655, "learning_rate": 0.000192720950444656, "loss": 0.3075, "num_input_tokens_seen": 2713190400, "step": 41400, "train_runtime": 27108.2869, "train_tokens_per_second": 100087.121 }, { "epoch": 0.415, "grad_norm": 0.16474822163581848, "learning_rate": 0.00019226445068081018, "loss": 0.3087, "num_input_tokens_seen": 2719744000, "step": 41500, "train_runtime": 27173.4382, "train_tokens_per_second": 100088.328 }, { "epoch": 0.416, "grad_norm": 0.18445253372192383, "learning_rate": 0.00019180752531441523, "loss": 0.3065, "num_input_tokens_seen": 2726297600, "step": 41600, "train_runtime": 27237.7945, "train_tokens_per_second": 100092.45 }, { "epoch": 0.417, "grad_norm": 0.1226682960987091, "learning_rate": 0.00019135017894670456, "loss": 0.3062, "num_input_tokens_seen": 2732851200, "step": 41700, "train_runtime": 27307.5255, "train_tokens_per_second": 100076.852 }, { "epoch": 0.418, "grad_norm": 0.12846247851848602, "learning_rate": 0.0001908924161831509, "loss": 0.3064, "num_input_tokens_seen": 2739404800, "step": 41800, "train_runtime": 27371.4125, "train_tokens_per_second": 100082.698 }, { "epoch": 0.419, "grad_norm": 0.14241133630275726, "learning_rate": 0.0001904342416334203, "loss": 0.3048, "num_input_tokens_seen": 2745958400, "step": 41900, "train_runtime": 27436.5912, "train_tokens_per_second": 100083.803 }, { "epoch": 0.42, "grad_norm": 0.19496770203113556, "learning_rate": 0.00018997565991132532, "loss": 0.3046, "num_input_tokens_seen": 2752512000, "step": 42000, "train_runtime": 27500.5131, "train_tokens_per_second": 100089.478 }, { "epoch": 0.421, "grad_norm": 0.16859756410121918, "learning_rate": 0.0001895166756347789, "loss": 0.3082, "num_input_tokens_seen": 2759065600, "step": 42100, "train_runtime": 27570.8932, "train_tokens_per_second": 100071.68 }, { "epoch": 0.422, "grad_norm": 0.13300351798534393, "learning_rate": 0.0001890572934257475, "loss": 0.3065, "num_input_tokens_seen": 2765619200, "step": 42200, "train_runtime": 27634.6434, "train_tokens_per_second": 100077.977 }, { "epoch": 0.423, "grad_norm": 0.14460822939872742, "learning_rate": 0.00018859751791020497, "loss": 0.3055, "num_input_tokens_seen": 2772172800, "step": 42300, "train_runtime": 27700.3395, "train_tokens_per_second": 100077.214 }, { "epoch": 0.424, "grad_norm": 0.1369091421365738, "learning_rate": 0.0001881373537180856, "loss": 0.3026, "num_input_tokens_seen": 2778726400, "step": 42400, "train_runtime": 27764.0211, "train_tokens_per_second": 100083.716 }, { "epoch": 0.425, "grad_norm": 0.15593157708644867, "learning_rate": 0.00018767680548323766, "loss": 0.3014, "num_input_tokens_seen": 2785280000, "step": 42500, "train_runtime": 27828.3317, "train_tokens_per_second": 100087.926 }, { "epoch": 0.426, "grad_norm": 0.18689674139022827, "learning_rate": 0.0001872158778433768, "loss": 0.3041, "num_input_tokens_seen": 2791833600, "step": 42600, "train_runtime": 27897.9539, "train_tokens_per_second": 100073.059 }, { "epoch": 0.427, "grad_norm": 0.1532142609357834, "learning_rate": 0.0001867545754400392, "loss": 0.3041, "num_input_tokens_seen": 2798387200, "step": 42700, "train_runtime": 27964.2157, "train_tokens_per_second": 100070.291 }, { "epoch": 0.428, "grad_norm": 0.12894967198371887, "learning_rate": 0.000186292902918535, "loss": 0.3047, "num_input_tokens_seen": 2804940800, "step": 42800, "train_runtime": 28028.1798, "train_tokens_per_second": 100075.739 }, { "epoch": 0.429, "grad_norm": 0.14526289701461792, "learning_rate": 0.00018583086492790136, "loss": 0.3097, "num_input_tokens_seen": 2811494400, "step": 42900, "train_runtime": 28093.2724, "train_tokens_per_second": 100077.142 }, { "epoch": 0.43, "grad_norm": 0.15546266734600067, "learning_rate": 0.00018536846612085566, "loss": 0.3066, "num_input_tokens_seen": 2818048000, "step": 43000, "train_runtime": 28157.8145, "train_tokens_per_second": 100080.495 }, { "epoch": 0.431, "grad_norm": 0.16307438910007477, "learning_rate": 0.00018490571115374878, "loss": 0.3073, "num_input_tokens_seen": 2824601600, "step": 43100, "train_runtime": 28227.9591, "train_tokens_per_second": 100063.968 }, { "epoch": 0.432, "grad_norm": 0.1360054761171341, "learning_rate": 0.00018444260468651816, "loss": 0.3013, "num_input_tokens_seen": 2831155200, "step": 43200, "train_runtime": 28291.3921, "train_tokens_per_second": 100071.258 }, { "epoch": 0.433, "grad_norm": 0.1404498666524887, "learning_rate": 0.00018397915138264068, "loss": 0.3066, "num_input_tokens_seen": 2837708800, "step": 43300, "train_runtime": 28355.3195, "train_tokens_per_second": 100076.771 }, { "epoch": 0.434, "grad_norm": 0.1926499307155609, "learning_rate": 0.00018351535590908606, "loss": 0.3012, "num_input_tokens_seen": 2844262400, "step": 43400, "train_runtime": 28420.6726, "train_tokens_per_second": 100077.237 }, { "epoch": 0.435, "grad_norm": 0.13713879883289337, "learning_rate": 0.00018305122293626948, "loss": 0.3029, "num_input_tokens_seen": 2850816000, "step": 43500, "train_runtime": 28490.1826, "train_tokens_per_second": 100063.1 }, { "epoch": 0.436, "grad_norm": 0.1541578322649002, "learning_rate": 0.00018258675713800492, "loss": 0.3061, "num_input_tokens_seen": 2857369600, "step": 43600, "train_runtime": 28555.7903, "train_tokens_per_second": 100062.704 }, { "epoch": 0.437, "grad_norm": 0.14117270708084106, "learning_rate": 0.00018212196319145773, "loss": 0.3053, "num_input_tokens_seen": 2863923200, "step": 43700, "train_runtime": 28622.1811, "train_tokens_per_second": 100059.572 }, { "epoch": 0.438, "grad_norm": 0.14943140745162964, "learning_rate": 0.00018165684577709778, "loss": 0.3043, "num_input_tokens_seen": 2870476800, "step": 43800, "train_runtime": 28686.5648, "train_tokens_per_second": 100063.455 }, { "epoch": 0.439, "grad_norm": 0.14043770730495453, "learning_rate": 0.0001811914095786524, "loss": 0.3048, "num_input_tokens_seen": 2877030400, "step": 43900, "train_runtime": 28751.3532, "train_tokens_per_second": 100065.913 }, { "epoch": 0.44, "grad_norm": 0.17811591923236847, "learning_rate": 0.0001807256592830588, "loss": 0.3088, "num_input_tokens_seen": 2883584000, "step": 44000, "train_runtime": 28815.5193, "train_tokens_per_second": 100070.52 }, { "epoch": 0.441, "grad_norm": 0.14588113129138947, "learning_rate": 0.00018025959958041732, "loss": 0.3017, "num_input_tokens_seen": 2890137600, "step": 44100, "train_runtime": 28880.019, "train_tokens_per_second": 100073.951 }, { "epoch": 0.442, "grad_norm": 0.22986213862895966, "learning_rate": 0.00017979323516394407, "loss": 0.3049, "num_input_tokens_seen": 2896691200, "step": 44200, "train_runtime": 28945.7871, "train_tokens_per_second": 100072.981 }, { "epoch": 0.443, "grad_norm": 0.853501558303833, "learning_rate": 0.00017932657072992344, "loss": 0.3081, "num_input_tokens_seen": 2903244800, "step": 44300, "train_runtime": 29016.3509, "train_tokens_per_second": 100055.476 }, { "epoch": 0.444, "grad_norm": 0.15835335850715637, "learning_rate": 0.00017885961097766117, "loss": 0.3035, "num_input_tokens_seen": 2909798400, "step": 44400, "train_runtime": 29079.9877, "train_tokens_per_second": 100061.886 }, { "epoch": 0.445, "grad_norm": 0.25418880581855774, "learning_rate": 0.00017839236060943674, "loss": 0.3014, "num_input_tokens_seen": 2916352000, "step": 44500, "train_runtime": 29144.3776, "train_tokens_per_second": 100065.681 }, { "epoch": 0.446, "grad_norm": 0.14922253787517548, "learning_rate": 0.0001779248243304562, "loss": 0.3038, "num_input_tokens_seen": 2922905600, "step": 44600, "train_runtime": 29208.2393, "train_tokens_per_second": 100071.27 }, { "epoch": 0.447, "grad_norm": 0.14103923738002777, "learning_rate": 0.00017745700684880465, "loss": 0.3064, "num_input_tokens_seen": 2929459200, "step": 44700, "train_runtime": 29273.1105, "train_tokens_per_second": 100073.383 }, { "epoch": 0.448, "grad_norm": 0.15813007950782776, "learning_rate": 0.000176988912875399, "loss": 0.3049, "num_input_tokens_seen": 2936012800, "step": 44800, "train_runtime": 29342.9224, "train_tokens_per_second": 100058.636 }, { "epoch": 0.449, "grad_norm": 0.1471075564622879, "learning_rate": 0.00017652054712394028, "loss": 0.3029, "num_input_tokens_seen": 2942566400, "step": 44900, "train_runtime": 29408.1792, "train_tokens_per_second": 100059.455 }, { "epoch": 0.45, "grad_norm": 0.16910097002983093, "learning_rate": 0.0001760519143108665, "loss": 0.3026, "num_input_tokens_seen": 2949120000, "step": 45000, "train_runtime": 29472.6802, "train_tokens_per_second": 100062.837 }, { "epoch": 0.451, "grad_norm": 0.15087512135505676, "learning_rate": 0.00017558301915530483, "loss": 0.305, "num_input_tokens_seen": 2955673600, "step": 45100, "train_runtime": 29537.0324, "train_tokens_per_second": 100066.708 }, { "epoch": 0.452, "grad_norm": 0.16292531788349152, "learning_rate": 0.00017511386637902428, "loss": 0.305, "num_input_tokens_seen": 2962227200, "step": 45200, "train_runtime": 29600.4356, "train_tokens_per_second": 100073.77 }, { "epoch": 0.453, "grad_norm": 0.14504611492156982, "learning_rate": 0.00017464446070638814, "loss": 0.3061, "num_input_tokens_seen": 2968780800, "step": 45300, "train_runtime": 29670.2849, "train_tokens_per_second": 100059.06 }, { "epoch": 0.454, "grad_norm": 0.14068329334259033, "learning_rate": 0.00017417480686430622, "loss": 0.3096, "num_input_tokens_seen": 2975334400, "step": 45400, "train_runtime": 29735.31, "train_tokens_per_second": 100060.648 }, { "epoch": 0.455, "grad_norm": 0.139748677611351, "learning_rate": 0.00017370490958218765, "loss": 0.3027, "num_input_tokens_seen": 2981888000, "step": 45500, "train_runtime": 29800.4491, "train_tokens_per_second": 100061.848 }, { "epoch": 0.456, "grad_norm": 0.1487821340560913, "learning_rate": 0.00017323477359189272, "loss": 0.3023, "num_input_tokens_seen": 2988441600, "step": 45600, "train_runtime": 29869.053, "train_tokens_per_second": 100051.434 }, { "epoch": 0.457, "grad_norm": 0.15015476942062378, "learning_rate": 0.00017276440362768564, "loss": 0.3028, "num_input_tokens_seen": 2994995200, "step": 45700, "train_runtime": 29933.644, "train_tokens_per_second": 100054.481 }, { "epoch": 0.458, "grad_norm": 0.1298416256904602, "learning_rate": 0.0001722938044261868, "loss": 0.3058, "num_input_tokens_seen": 3001548800, "step": 45800, "train_runtime": 29997.6813, "train_tokens_per_second": 100059.36 }, { "epoch": 0.459, "grad_norm": 0.1956530213356018, "learning_rate": 0.0001718229807263249, "loss": 0.3033, "num_input_tokens_seen": 3008102400, "step": 45900, "train_runtime": 30067.1877, "train_tokens_per_second": 100046.018 }, { "epoch": 0.46, "grad_norm": 0.15267929434776306, "learning_rate": 0.0001713519372692894, "loss": 0.3028, "num_input_tokens_seen": 3014656000, "step": 46000, "train_runtime": 30131.0143, "train_tokens_per_second": 100051.594 }, { "epoch": 0.461, "grad_norm": 0.13846905529499054, "learning_rate": 0.0001708806787984826, "loss": 0.3036, "num_input_tokens_seen": 3021209600, "step": 46100, "train_runtime": 30195.5066, "train_tokens_per_second": 100054.94 }, { "epoch": 0.462, "grad_norm": 0.13704828917980194, "learning_rate": 0.00017040921005947212, "loss": 0.3094, "num_input_tokens_seen": 3027763200, "step": 46200, "train_runtime": 30260.3523, "train_tokens_per_second": 100057.104 }, { "epoch": 0.463, "grad_norm": 0.15288543701171875, "learning_rate": 0.0001699375357999429, "loss": 0.3014, "num_input_tokens_seen": 3034316800, "step": 46300, "train_runtime": 30325.5675, "train_tokens_per_second": 100058.039 }, { "epoch": 0.464, "grad_norm": 0.19963988661766052, "learning_rate": 0.0001694656607696496, "loss": 0.3061, "num_input_tokens_seen": 3040870400, "step": 46400, "train_runtime": 30399.8434, "train_tokens_per_second": 100029.147 }, { "epoch": 0.465, "grad_norm": 0.14533430337905884, "learning_rate": 0.0001689935897203684, "loss": 0.3056, "num_input_tokens_seen": 3047424000, "step": 46500, "train_runtime": 30464.3563, "train_tokens_per_second": 100032.443 }, { "epoch": 0.466, "grad_norm": 0.14005503058433533, "learning_rate": 0.0001685213274058496, "loss": 0.3016, "num_input_tokens_seen": 3053977600, "step": 46600, "train_runtime": 30528.7292, "train_tokens_per_second": 100036.185 }, { "epoch": 0.467, "grad_norm": 0.17612388730049133, "learning_rate": 0.00016804887858176944, "loss": 0.3006, "num_input_tokens_seen": 3060531200, "step": 46700, "train_runtime": 30592.7142, "train_tokens_per_second": 100041.179 }, { "epoch": 0.468, "grad_norm": 0.13526348769664764, "learning_rate": 0.00016757624800568238, "loss": 0.3001, "num_input_tokens_seen": 3067084800, "step": 46800, "train_runtime": 30656.5144, "train_tokens_per_second": 100046.755 }, { "epoch": 0.469, "grad_norm": 0.6205772161483765, "learning_rate": 0.00016710344043697301, "loss": 0.3016, "num_input_tokens_seen": 3073638400, "step": 46900, "train_runtime": 30727.0215, "train_tokens_per_second": 100030.47 }, { "epoch": 0.47, "grad_norm": 0.15328101813793182, "learning_rate": 0.0001666304606368083, "loss": 0.3049, "num_input_tokens_seen": 3080192000, "step": 47000, "train_runtime": 30792.0203, "train_tokens_per_second": 100032.15 }, { "epoch": 0.471, "grad_norm": 0.1804981380701065, "learning_rate": 0.00016615731336808962, "loss": 0.3008, "num_input_tokens_seen": 3086745600, "step": 47100, "train_runtime": 30856.1119, "train_tokens_per_second": 100036.764 }, { "epoch": 0.472, "grad_norm": 0.1460595428943634, "learning_rate": 0.0001656840033954047, "loss": 0.2996, "num_input_tokens_seen": 3093299200, "step": 47200, "train_runtime": 30922.3293, "train_tokens_per_second": 100034.482 }, { "epoch": 0.473, "grad_norm": 0.17493313550949097, "learning_rate": 0.00016521053548497973, "loss": 0.3005, "num_input_tokens_seen": 3099852800, "step": 47300, "train_runtime": 30985.6891, "train_tokens_per_second": 100041.435 }, { "epoch": 0.474, "grad_norm": 0.11990969628095627, "learning_rate": 0.0001647369144046313, "loss": 0.2995, "num_input_tokens_seen": 3106406400, "step": 47400, "train_runtime": 31056.5152, "train_tokens_per_second": 100024.307 }, { "epoch": 0.475, "grad_norm": 0.15634778141975403, "learning_rate": 0.00016426314492371842, "loss": 0.3054, "num_input_tokens_seen": 3112960000, "step": 47500, "train_runtime": 31121.0302, "train_tokens_per_second": 100027.537 }, { "epoch": 0.476, "grad_norm": 0.14218732714653015, "learning_rate": 0.0001637892318130945, "loss": 0.3036, "num_input_tokens_seen": 3119513600, "step": 47600, "train_runtime": 31185.6411, "train_tokens_per_second": 100030.446 }, { "epoch": 0.477, "grad_norm": 0.147688090801239, "learning_rate": 0.00016331517984505934, "loss": 0.3003, "num_input_tokens_seen": 3126067200, "step": 47700, "train_runtime": 31250.7507, "train_tokens_per_second": 100031.748 }, { "epoch": 0.478, "grad_norm": 0.1728331595659256, "learning_rate": 0.00016284099379331092, "loss": 0.2997, "num_input_tokens_seen": 3132620800, "step": 47800, "train_runtime": 31321.2751, "train_tokens_per_second": 100015.749 }, { "epoch": 0.479, "grad_norm": 0.12835726141929626, "learning_rate": 0.00016236667843289759, "loss": 0.2989, "num_input_tokens_seen": 3139174400, "step": 47900, "train_runtime": 31386.2974, "train_tokens_per_second": 100017.353 }, { "epoch": 0.48, "grad_norm": 0.13368946313858032, "learning_rate": 0.00016189223854016973, "loss": 0.3078, "num_input_tokens_seen": 3145728000, "step": 48000, "train_runtime": 31451.659, "train_tokens_per_second": 100017.872 }, { "epoch": 0.481, "grad_norm": 0.12727653980255127, "learning_rate": 0.00016141767889273182, "loss": 0.3017, "num_input_tokens_seen": 3152281600, "step": 48100, "train_runtime": 31516.5086, "train_tokens_per_second": 100020.013 }, { "epoch": 0.482, "grad_norm": 0.16222263872623444, "learning_rate": 0.00016094300426939417, "loss": 0.3009, "num_input_tokens_seen": 3158835200, "step": 48200, "train_runtime": 31581.3453, "train_tokens_per_second": 100022.186 }, { "epoch": 0.483, "grad_norm": 0.15287387371063232, "learning_rate": 0.00016046821945012505, "loss": 0.2975, "num_input_tokens_seen": 3165388800, "step": 48300, "train_runtime": 31645.8484, "train_tokens_per_second": 100025.405 }, { "epoch": 0.484, "grad_norm": 0.13035738468170166, "learning_rate": 0.00015999332921600226, "loss": 0.3046, "num_input_tokens_seen": 3171942400, "step": 48400, "train_runtime": 31716.5254, "train_tokens_per_second": 100009.139 }, { "epoch": 0.485, "grad_norm": 0.16508948802947998, "learning_rate": 0.00015951833834916532, "loss": 0.3061, "num_input_tokens_seen": 3178496000, "step": 48500, "train_runtime": 31781.7614, "train_tokens_per_second": 100010.064 }, { "epoch": 0.486, "grad_norm": 0.1543286293745041, "learning_rate": 0.00015904325163276672, "loss": 0.2995, "num_input_tokens_seen": 3185049600, "step": 48600, "train_runtime": 31847.2029, "train_tokens_per_second": 100010.34 }, { "epoch": 0.487, "grad_norm": 0.13470540940761566, "learning_rate": 0.00015856807385092466, "loss": 0.3067, "num_input_tokens_seen": 3191603200, "step": 48700, "train_runtime": 31911.0411, "train_tokens_per_second": 100015.64 }, { "epoch": 0.488, "grad_norm": 0.15521059930324554, "learning_rate": 0.00015809280978867405, "loss": 0.3009, "num_input_tokens_seen": 3198156800, "step": 48800, "train_runtime": 31975.3091, "train_tokens_per_second": 100019.574 }, { "epoch": 0.489, "grad_norm": 0.16505663096904755, "learning_rate": 0.0001576174642319187, "loss": 0.3019, "num_input_tokens_seen": 3204710400, "step": 48900, "train_runtime": 32039.3359, "train_tokens_per_second": 100024.246 }, { "epoch": 0.49, "grad_norm": 0.15701062977313995, "learning_rate": 0.0001571420419673831, "loss": 0.3025, "num_input_tokens_seen": 3211264000, "step": 49000, "train_runtime": 32104.9123, "train_tokens_per_second": 100024.07 }, { "epoch": 0.491, "grad_norm": 0.22376379370689392, "learning_rate": 0.0001566665477825642, "loss": 0.3035, "num_input_tokens_seen": 3217817600, "step": 49100, "train_runtime": 32177.5739, "train_tokens_per_second": 100001.871 }, { "epoch": 0.492, "grad_norm": 0.1716614067554474, "learning_rate": 0.0001561909864656831, "loss": 0.3046, "num_input_tokens_seen": 3224371200, "step": 49200, "train_runtime": 32241.8903, "train_tokens_per_second": 100005.65 }, { "epoch": 0.493, "grad_norm": 0.17557290196418762, "learning_rate": 0.00015571536280563705, "loss": 0.2987, "num_input_tokens_seen": 3230924800, "step": 49300, "train_runtime": 32307.4373, "train_tokens_per_second": 100005.605 }, { "epoch": 0.494, "grad_norm": 0.16884572803974152, "learning_rate": 0.000155239681591951, "loss": 0.2986, "num_input_tokens_seen": 3237478400, "step": 49400, "train_runtime": 32371.4412, "train_tokens_per_second": 100010.326 }, { "epoch": 0.495, "grad_norm": 0.15279650688171387, "learning_rate": 0.00015476394761472953, "loss": 0.2982, "num_input_tokens_seen": 3244032000, "step": 49500, "train_runtime": 32436.5241, "train_tokens_per_second": 100011.702 }, { "epoch": 0.496, "grad_norm": 0.1866491436958313, "learning_rate": 0.00015428816566460843, "loss": 0.3038, "num_input_tokens_seen": 3250585600, "step": 49600, "train_runtime": 32508.3167, "train_tokens_per_second": 99992.43 }, { "epoch": 0.497, "grad_norm": 0.14084835350513458, "learning_rate": 0.00015381234053270669, "loss": 0.3027, "num_input_tokens_seen": 3257139200, "step": 49700, "train_runtime": 32572.1194, "train_tokens_per_second": 99997.767 }, { "epoch": 0.498, "grad_norm": 0.16111333668231964, "learning_rate": 0.0001533364770105781, "loss": 0.3015, "num_input_tokens_seen": 3263692800, "step": 49800, "train_runtime": 32637.2501, "train_tokens_per_second": 99999.013 }, { "epoch": 0.499, "grad_norm": 0.14655210077762604, "learning_rate": 0.0001528605798901631, "loss": 0.3012, "num_input_tokens_seen": 3270246400, "step": 49900, "train_runtime": 32707.4201, "train_tokens_per_second": 99984.847 }, { "epoch": 0.5, "grad_norm": 0.1385914832353592, "learning_rate": 0.00015238465396374027, "loss": 0.3027, "num_input_tokens_seen": 3276800000, "step": 50000, "train_runtime": 32772.7798, "train_tokens_per_second": 99985.415 }, { "epoch": 0.501, "grad_norm": 0.1433262825012207, "learning_rate": 0.00015190870402387858, "loss": 0.3006, "num_input_tokens_seen": 3283353600, "step": 50100, "train_runtime": 32837.3412, "train_tokens_per_second": 99988.412 }, { "epoch": 0.502, "grad_norm": 0.15529057383537292, "learning_rate": 0.00015143273486338857, "loss": 0.2995, "num_input_tokens_seen": 3289907200, "step": 50200, "train_runtime": 32902.1033, "train_tokens_per_second": 99990.787 }, { "epoch": 0.503, "grad_norm": 0.1301671862602234, "learning_rate": 0.00015095675127527438, "loss": 0.3055, "num_input_tokens_seen": 3296460800, "step": 50300, "train_runtime": 32967.0743, "train_tokens_per_second": 99992.519 }, { "epoch": 0.504, "grad_norm": 0.1454419493675232, "learning_rate": 0.00015048075805268547, "loss": 0.3036, "num_input_tokens_seen": 3303014400, "step": 50400, "train_runtime": 33033.1243, "train_tokens_per_second": 99990.978 }, { "epoch": 0.505, "grad_norm": 0.1473357379436493, "learning_rate": 0.00015000475998886825, "loss": 0.3018, "num_input_tokens_seen": 3309568000, "step": 50500, "train_runtime": 33105.2406, "train_tokens_per_second": 99971.121 }, { "epoch": 0.506, "grad_norm": 0.13996386528015137, "learning_rate": 0.00014952876187711804, "loss": 0.2974, "num_input_tokens_seen": 3316121600, "step": 50600, "train_runtime": 33169.1198, "train_tokens_per_second": 99976.171 }, { "epoch": 0.507, "grad_norm": 0.14000660181045532, "learning_rate": 0.00014905276851073053, "loss": 0.2992, "num_input_tokens_seen": 3322675200, "step": 50700, "train_runtime": 33234.0005, "train_tokens_per_second": 99978.19 }, { "epoch": 0.508, "grad_norm": 0.14661286771297455, "learning_rate": 0.00014857678468295352, "loss": 0.3045, "num_input_tokens_seen": 3329228800, "step": 50800, "train_runtime": 33299.7758, "train_tokens_per_second": 99977.514 }, { "epoch": 0.509, "grad_norm": 0.15111635625362396, "learning_rate": 0.00014810081518693902, "loss": 0.3006, "num_input_tokens_seen": 3335782400, "step": 50900, "train_runtime": 33370.9097, "train_tokens_per_second": 99960.787 }, { "epoch": 0.51, "grad_norm": 0.12965109944343567, "learning_rate": 0.0001476248648156945, "loss": 0.2986, "num_input_tokens_seen": 3342336000, "step": 51000, "train_runtime": 33435.7602, "train_tokens_per_second": 99962.913 }, { "epoch": 0.511, "grad_norm": 0.13791891932487488, "learning_rate": 0.00014714893836203485, "loss": 0.2994, "num_input_tokens_seen": 3348889600, "step": 51100, "train_runtime": 33500.2878, "train_tokens_per_second": 99965.995 }, { "epoch": 0.512, "grad_norm": 0.1420348435640335, "learning_rate": 0.0001466730406185343, "loss": 0.2996, "num_input_tokens_seen": 3355443200, "step": 51200, "train_runtime": 33564.5521, "train_tokens_per_second": 99969.849 }, { "epoch": 0.513, "grad_norm": 0.1938745528459549, "learning_rate": 0.0001461971763774778, "loss": 0.3007, "num_input_tokens_seen": 3361996800, "step": 51300, "train_runtime": 33630.8004, "train_tokens_per_second": 99967.79 }, { "epoch": 0.514, "grad_norm": 0.1449531763792038, "learning_rate": 0.0001457213504308129, "loss": 0.3011, "num_input_tokens_seen": 3368550400, "step": 51400, "train_runtime": 33696.4447, "train_tokens_per_second": 99967.532 }, { "epoch": 0.515, "grad_norm": 0.16473324596881866, "learning_rate": 0.00014524556757010177, "loss": 0.3005, "num_input_tokens_seen": 3375104000, "step": 51500, "train_runtime": 33766.6492, "train_tokens_per_second": 99953.773 }, { "epoch": 0.516, "grad_norm": 0.1542610377073288, "learning_rate": 0.00014476983258647234, "loss": 0.3012, "num_input_tokens_seen": 3381657600, "step": 51600, "train_runtime": 33832.0917, "train_tokens_per_second": 99954.139 }, { "epoch": 0.517, "grad_norm": 0.1388223022222519, "learning_rate": 0.0001442941502705707, "loss": 0.3031, "num_input_tokens_seen": 3388211200, "step": 51700, "train_runtime": 33896.7212, "train_tokens_per_second": 99956.901 }, { "epoch": 0.518, "grad_norm": 0.19452647864818573, "learning_rate": 0.0001438185254125125, "loss": 0.3011, "num_input_tokens_seen": 3394764800, "step": 51800, "train_runtime": 33962.0557, "train_tokens_per_second": 99957.577 }, { "epoch": 0.519, "grad_norm": 0.16043786704540253, "learning_rate": 0.00014334296280183473, "loss": 0.2997, "num_input_tokens_seen": 3401318400, "step": 51900, "train_runtime": 34027.5551, "train_tokens_per_second": 99957.766 }, { "epoch": 0.52, "grad_norm": 0.19769923388957977, "learning_rate": 0.00014286746722744768, "loss": 0.3007, "num_input_tokens_seen": 3407872000, "step": 52000, "train_runtime": 34098.2307, "train_tokens_per_second": 99942.781 }, { "epoch": 0.521, "grad_norm": 0.1524592489004135, "learning_rate": 0.00014239204347758647, "loss": 0.299, "num_input_tokens_seen": 3414425600, "step": 52100, "train_runtime": 34164.2522, "train_tokens_per_second": 99941.47 }, { "epoch": 0.522, "grad_norm": 0.14221727848052979, "learning_rate": 0.00014191669633976294, "loss": 0.3029, "num_input_tokens_seen": 3420979200, "step": 52200, "train_runtime": 34227.7165, "train_tokens_per_second": 99947.632 }, { "epoch": 0.523, "grad_norm": 0.15958262979984283, "learning_rate": 0.00014144143060071756, "loss": 0.3005, "num_input_tokens_seen": 3427532800, "step": 52300, "train_runtime": 34292.9446, "train_tokens_per_second": 99948.629 }, { "epoch": 0.524, "grad_norm": 0.1545192301273346, "learning_rate": 0.000140966251046371, "loss": 0.3024, "num_input_tokens_seen": 3434086400, "step": 52400, "train_runtime": 34357.5392, "train_tokens_per_second": 99951.466 }, { "epoch": 0.525, "grad_norm": 0.14636173844337463, "learning_rate": 0.0001404911624617761, "loss": 0.2967, "num_input_tokens_seen": 3440640000, "step": 52500, "train_runtime": 34423.9361, "train_tokens_per_second": 99949.058 }, { "epoch": 0.526, "grad_norm": 0.26764926314353943, "learning_rate": 0.00014001616963106966, "loss": 0.2982, "num_input_tokens_seen": 3447193600, "step": 52600, "train_runtime": 34489.4544, "train_tokens_per_second": 99949.206 }, { "epoch": 0.527, "grad_norm": 0.20636320114135742, "learning_rate": 0.00013954127733742416, "loss": 0.3011, "num_input_tokens_seen": 3453747200, "step": 52700, "train_runtime": 34559.9071, "train_tokens_per_second": 99935.083 }, { "epoch": 0.528, "grad_norm": 0.1523534059524536, "learning_rate": 0.0001390664903629998, "loss": 0.3042, "num_input_tokens_seen": 3460300800, "step": 52800, "train_runtime": 34624.4507, "train_tokens_per_second": 99938.071 }, { "epoch": 0.529, "grad_norm": 0.15213948488235474, "learning_rate": 0.0001385918134888961, "loss": 0.3024, "num_input_tokens_seen": 3466854400, "step": 52900, "train_runtime": 34690.2273, "train_tokens_per_second": 99937.495 }, { "epoch": 0.53, "grad_norm": 0.14115960896015167, "learning_rate": 0.00013811725149510387, "loss": 0.2999, "num_input_tokens_seen": 3473408000, "step": 53000, "train_runtime": 34756.5786, "train_tokens_per_second": 99935.268 }, { "epoch": 0.531, "grad_norm": 0.16747893393039703, "learning_rate": 0.0001376428091604572, "loss": 0.3011, "num_input_tokens_seen": 3479961600, "step": 53100, "train_runtime": 34823.0381, "train_tokens_per_second": 99932.74 }, { "epoch": 0.532, "grad_norm": 0.1266140639781952, "learning_rate": 0.00013716849126258512, "loss": 0.2985, "num_input_tokens_seen": 3486515200, "step": 53200, "train_runtime": 34892.7557, "train_tokens_per_second": 99920.89 }, { "epoch": 0.533, "grad_norm": 0.14753171801567078, "learning_rate": 0.00013669430257786354, "loss": 0.2996, "num_input_tokens_seen": 3493068800, "step": 53300, "train_runtime": 34957.0461, "train_tokens_per_second": 99924.599 }, { "epoch": 0.534, "grad_norm": 0.2617182731628418, "learning_rate": 0.00013622024788136728, "loss": 0.3027, "num_input_tokens_seen": 3499622400, "step": 53400, "train_runtime": 35022.8837, "train_tokens_per_second": 99923.879 }, { "epoch": 0.535, "grad_norm": 0.17150761187076569, "learning_rate": 0.00013574633194682185, "loss": 0.3027, "num_input_tokens_seen": 3506176000, "step": 53500, "train_runtime": 35088.2396, "train_tokens_per_second": 99924.534 }, { "epoch": 0.536, "grad_norm": 0.16566570103168488, "learning_rate": 0.0001352725595465555, "loss": 0.2999, "num_input_tokens_seen": 3512729600, "step": 53600, "train_runtime": 35153.6189, "train_tokens_per_second": 99925.12 }, { "epoch": 0.537, "grad_norm": 0.13577675819396973, "learning_rate": 0.000134798935451451, "loss": 0.2969, "num_input_tokens_seen": 3519283200, "step": 53700, "train_runtime": 35225.0068, "train_tokens_per_second": 99908.659 }, { "epoch": 0.538, "grad_norm": 0.20843537151813507, "learning_rate": 0.00013432546443089768, "loss": 0.2967, "num_input_tokens_seen": 3525836800, "step": 53800, "train_runtime": 35288.0858, "train_tokens_per_second": 99915.785 }, { "epoch": 0.539, "grad_norm": 0.15664201974868774, "learning_rate": 0.0001338521512527436, "loss": 0.3007, "num_input_tokens_seen": 3532390400, "step": 53900, "train_runtime": 35353.7477, "train_tokens_per_second": 99915.586 }, { "epoch": 0.54, "grad_norm": 0.14205297827720642, "learning_rate": 0.00013337900068324712, "loss": 0.3001, "num_input_tokens_seen": 3538944000, "step": 54000, "train_runtime": 35423.5891, "train_tokens_per_second": 99903.598 }, { "epoch": 0.541, "grad_norm": 0.13229498267173767, "learning_rate": 0.00013290601748702918, "loss": 0.2931, "num_input_tokens_seen": 3545497600, "step": 54100, "train_runtime": 35489.6646, "train_tokens_per_second": 99902.257 }, { "epoch": 0.542, "grad_norm": 0.1380510926246643, "learning_rate": 0.00013243320642702543, "loss": 0.3116, "num_input_tokens_seen": 3552051200, "step": 54200, "train_runtime": 35554.9224, "train_tokens_per_second": 99903.219 }, { "epoch": 0.543, "grad_norm": 0.16735288500785828, "learning_rate": 0.0001319605722644379, "loss": 0.2998, "num_input_tokens_seen": 3558604800, "step": 54300, "train_runtime": 35619.4728, "train_tokens_per_second": 99906.161 }, { "epoch": 0.544, "grad_norm": 0.17502574622631073, "learning_rate": 0.0001314881197586874, "loss": 0.3004, "num_input_tokens_seen": 3565158400, "step": 54400, "train_runtime": 35685.8161, "train_tokens_per_second": 99904.074 }, { "epoch": 0.545, "grad_norm": 0.14805424213409424, "learning_rate": 0.0001310158536673654, "loss": 0.2983, "num_input_tokens_seen": 3571712000, "step": 54500, "train_runtime": 35750.1467, "train_tokens_per_second": 99907.618 }, { "epoch": 0.546, "grad_norm": 0.1533045917749405, "learning_rate": 0.0001305437787461862, "loss": 0.2976, "num_input_tokens_seen": 3578265600, "step": 54600, "train_runtime": 35816.4973, "train_tokens_per_second": 99905.515 }, { "epoch": 0.547, "grad_norm": 0.18475773930549622, "learning_rate": 0.00013007189974893903, "loss": 0.2951, "num_input_tokens_seen": 3584819200, "step": 54700, "train_runtime": 35886.6478, "train_tokens_per_second": 99892.841 }, { "epoch": 0.548, "grad_norm": 0.13913068175315857, "learning_rate": 0.00012960022142744016, "loss": 0.297, "num_input_tokens_seen": 3591372800, "step": 54800, "train_runtime": 35950.7798, "train_tokens_per_second": 99896.937 }, { "epoch": 0.549, "grad_norm": 0.15448203682899475, "learning_rate": 0.00012912874853148506, "loss": 0.303, "num_input_tokens_seen": 3597926400, "step": 54900, "train_runtime": 36015.8762, "train_tokens_per_second": 99898.344 }, { "epoch": 0.55, "grad_norm": 0.15416036546230316, "learning_rate": 0.00012865748580880053, "loss": 0.2979, "num_input_tokens_seen": 3604480000, "step": 55000, "train_runtime": 36080.201, "train_tokens_per_second": 99901.883 }, { "epoch": 0.551, "grad_norm": 0.14506150782108307, "learning_rate": 0.0001281864380049969, "loss": 0.2983, "num_input_tokens_seen": 3611033600, "step": 55100, "train_runtime": 36150.2521, "train_tokens_per_second": 99889.583 }, { "epoch": 0.552, "grad_norm": 0.17357710003852844, "learning_rate": 0.00012771560986352042, "loss": 0.2986, "num_input_tokens_seen": 3617587200, "step": 55200, "train_runtime": 36215.2659, "train_tokens_per_second": 99891.223 }, { "epoch": 0.553, "grad_norm": 0.16711916029453278, "learning_rate": 0.0001272450061256052, "loss": 0.2979, "num_input_tokens_seen": 3624140800, "step": 55300, "train_runtime": 36279.3222, "train_tokens_per_second": 99895.494 }, { "epoch": 0.554, "grad_norm": 0.1502256691455841, "learning_rate": 0.00012677463153022565, "loss": 0.3007, "num_input_tokens_seen": 3630694400, "step": 55400, "train_runtime": 36345.9552, "train_tokens_per_second": 99892.667 }, { "epoch": 0.555, "grad_norm": 0.15480037033557892, "learning_rate": 0.0001263044908140488, "loss": 0.2975, "num_input_tokens_seen": 3637248000, "step": 55500, "train_runtime": 36415.9598, "train_tokens_per_second": 99880.602 }, { "epoch": 0.556, "grad_norm": 0.15693609416484833, "learning_rate": 0.00012583458871138632, "loss": 0.2978, "num_input_tokens_seen": 3643801600, "step": 55600, "train_runtime": 36480.5541, "train_tokens_per_second": 99883.395 }, { "epoch": 0.557, "grad_norm": 0.147445410490036, "learning_rate": 0.00012536492995414723, "loss": 0.2991, "num_input_tokens_seen": 3650355200, "step": 55700, "train_runtime": 36545.2319, "train_tokens_per_second": 99885.95 }, { "epoch": 0.558, "grad_norm": 0.13640980422496796, "learning_rate": 0.00012489551927179007, "loss": 0.2987, "num_input_tokens_seen": 3656908800, "step": 55800, "train_runtime": 36611.0993, "train_tokens_per_second": 99885.25 }, { "epoch": 0.559, "grad_norm": 0.14373840391635895, "learning_rate": 0.00012442636139127508, "loss": 0.3, "num_input_tokens_seen": 3663462400, "step": 55900, "train_runtime": 36676.4606, "train_tokens_per_second": 99885.931 }, { "epoch": 0.56, "grad_norm": 0.14679211378097534, "learning_rate": 0.00012395746103701695, "loss": 0.2996, "num_input_tokens_seen": 3670016000, "step": 56000, "train_runtime": 36748.2938, "train_tokens_per_second": 99869.018 }, { "epoch": 0.561, "grad_norm": 0.15536077320575714, "learning_rate": 0.00012348882293083708, "loss": 0.2953, "num_input_tokens_seen": 3676569600, "step": 56100, "train_runtime": 36813.4246, "train_tokens_per_second": 99870.35 }, { "epoch": 0.562, "grad_norm": 0.16678054630756378, "learning_rate": 0.00012302045179191594, "loss": 0.2969, "num_input_tokens_seen": 3683123200, "step": 56200, "train_runtime": 36877.8431, "train_tokens_per_second": 99873.607 }, { "epoch": 0.563, "grad_norm": 0.15781697630882263, "learning_rate": 0.00012255235233674572, "loss": 0.2972, "num_input_tokens_seen": 3689676800, "step": 56300, "train_runtime": 36943.2178, "train_tokens_per_second": 99874.267 }, { "epoch": 0.564, "grad_norm": 0.13541863858699799, "learning_rate": 0.00012208452927908278, "loss": 0.302, "num_input_tokens_seen": 3696230400, "step": 56400, "train_runtime": 37008.8029, "train_tokens_per_second": 99874.357 }, { "epoch": 0.565, "grad_norm": 0.1400034874677658, "learning_rate": 0.00012161698732990003, "loss": 0.3, "num_input_tokens_seen": 3702784000, "step": 56500, "train_runtime": 37078.9889, "train_tokens_per_second": 99862.054 }, { "epoch": 0.566, "grad_norm": 0.1511828452348709, "learning_rate": 0.00012114973119733987, "loss": 0.3017, "num_input_tokens_seen": 3709337600, "step": 56600, "train_runtime": 37144.0507, "train_tokens_per_second": 99863.573 }, { "epoch": 0.567, "grad_norm": 0.15576902031898499, "learning_rate": 0.00012068276558666616, "loss": 0.2981, "num_input_tokens_seen": 3715891200, "step": 56700, "train_runtime": 37206.97, "train_tokens_per_second": 99870.836 }, { "epoch": 0.568, "grad_norm": 0.24084219336509705, "learning_rate": 0.00012021609520021752, "loss": 0.3025, "num_input_tokens_seen": 3722444800, "step": 56800, "train_runtime": 37278.1305, "train_tokens_per_second": 99855.995 }, { "epoch": 0.569, "grad_norm": 0.16832643747329712, "learning_rate": 0.00011974972473735957, "loss": 0.301, "num_input_tokens_seen": 3728998400, "step": 56900, "train_runtime": 37343.2452, "train_tokens_per_second": 99857.374 }, { "epoch": 0.57, "grad_norm": 0.18326181173324585, "learning_rate": 0.00011928365889443764, "loss": 0.2987, "num_input_tokens_seen": 3735552000, "step": 57000, "train_runtime": 37407.594, "train_tokens_per_second": 99860.793 }, { "epoch": 0.571, "grad_norm": 0.15526984632015228, "learning_rate": 0.00011881790236472966, "loss": 0.2991, "num_input_tokens_seen": 3742105600, "step": 57100, "train_runtime": 37474.3952, "train_tokens_per_second": 99857.665 }, { "epoch": 0.572, "grad_norm": 0.18177416920661926, "learning_rate": 0.00011835245983839869, "loss": 0.3002, "num_input_tokens_seen": 3748659200, "step": 57200, "train_runtime": 37538.8922, "train_tokens_per_second": 99860.677 }, { "epoch": 0.573, "grad_norm": 0.1915498822927475, "learning_rate": 0.00011788733600244575, "loss": 0.2986, "num_input_tokens_seen": 3755212800, "step": 57300, "train_runtime": 37605.3867, "train_tokens_per_second": 99858.375 }, { "epoch": 0.574, "grad_norm": 0.15175184607505798, "learning_rate": 0.00011742253554066278, "loss": 0.3015, "num_input_tokens_seen": 3761766400, "step": 57400, "train_runtime": 37678.0051, "train_tokens_per_second": 99839.851 }, { "epoch": 0.575, "grad_norm": 0.16369026899337769, "learning_rate": 0.00011695806313358523, "loss": 0.3003, "num_input_tokens_seen": 3768320000, "step": 57500, "train_runtime": 37742.0245, "train_tokens_per_second": 99844.141 }, { "epoch": 0.576, "grad_norm": 0.16646848618984222, "learning_rate": 0.00011649392345844506, "loss": 0.2972, "num_input_tokens_seen": 3774873600, "step": 57600, "train_runtime": 37807.5481, "train_tokens_per_second": 99844.444 }, { "epoch": 0.577, "grad_norm": 0.14035099744796753, "learning_rate": 0.00011603012118912372, "loss": 0.2985, "num_input_tokens_seen": 3781427200, "step": 57700, "train_runtime": 37871.8826, "train_tokens_per_second": 99847.88 }, { "epoch": 0.578, "grad_norm": 0.14899714291095734, "learning_rate": 0.00011556666099610485, "loss": 0.3008, "num_input_tokens_seen": 3787980800, "step": 57800, "train_runtime": 37943.2827, "train_tokens_per_second": 99832.712 }, { "epoch": 0.579, "grad_norm": 0.15600667893886566, "learning_rate": 0.00011510354754642745, "loss": 0.303, "num_input_tokens_seen": 3794534400, "step": 57900, "train_runtime": 38008.9332, "train_tokens_per_second": 99832.699 }, { "epoch": 0.58, "grad_norm": 0.1631072610616684, "learning_rate": 0.00011464078550363887, "loss": 0.2978, "num_input_tokens_seen": 3801088000, "step": 58000, "train_runtime": 38073.7575, "train_tokens_per_second": 99834.853 }, { "epoch": 0.581, "grad_norm": 0.1560899019241333, "learning_rate": 0.0001141783795277477, "loss": 0.299, "num_input_tokens_seen": 3807641600, "step": 58100, "train_runtime": 38139.694, "train_tokens_per_second": 99834.089 }, { "epoch": 0.582, "grad_norm": 0.1506076604127884, "learning_rate": 0.00011371633427517696, "loss": 0.2985, "num_input_tokens_seen": 3814195200, "step": 58200, "train_runtime": 38209.9556, "train_tokens_per_second": 99822.026 }, { "epoch": 0.583, "grad_norm": 0.16049940884113312, "learning_rate": 0.00011325465439871731, "loss": 0.2998, "num_input_tokens_seen": 3820748800, "step": 58300, "train_runtime": 38274.5015, "train_tokens_per_second": 99824.913 }, { "epoch": 0.584, "grad_norm": 0.15604519844055176, "learning_rate": 0.00011279334454747989, "loss": 0.2969, "num_input_tokens_seen": 3827302400, "step": 58400, "train_runtime": 38341.4547, "train_tokens_per_second": 99821.523 }, { "epoch": 0.585, "grad_norm": 0.15963351726531982, "learning_rate": 0.00011233240936684981, "loss": 0.2988, "num_input_tokens_seen": 3833856000, "step": 58500, "train_runtime": 38406.0222, "train_tokens_per_second": 99824.345 }, { "epoch": 0.586, "grad_norm": 0.15443411469459534, "learning_rate": 0.00011187185349843916, "loss": 0.298, "num_input_tokens_seen": 3840409600, "step": 58600, "train_runtime": 38472.0656, "train_tokens_per_second": 99823.327 }, { "epoch": 0.587, "grad_norm": 0.15459220111370087, "learning_rate": 0.00011141168158004053, "loss": 0.3004, "num_input_tokens_seen": 3846963200, "step": 58700, "train_runtime": 38542.0532, "train_tokens_per_second": 99812.098 }, { "epoch": 0.588, "grad_norm": 0.16199928522109985, "learning_rate": 0.00011095189824557998, "loss": 0.2985, "num_input_tokens_seen": 3853516800, "step": 58800, "train_runtime": 38609.4411, "train_tokens_per_second": 99807.63 }, { "epoch": 0.589, "grad_norm": 0.2209610939025879, "learning_rate": 0.00011049250812507054, "loss": 0.3005, "num_input_tokens_seen": 3860070400, "step": 58900, "train_runtime": 38675.4402, "train_tokens_per_second": 99806.761 }, { "epoch": 0.59, "grad_norm": 0.22285670042037964, "learning_rate": 0.00011003351584456571, "loss": 0.298, "num_input_tokens_seen": 3866624000, "step": 59000, "train_runtime": 38740.3065, "train_tokens_per_second": 99808.813 }, { "epoch": 0.591, "grad_norm": 0.2148812711238861, "learning_rate": 0.0001095749260261126, "loss": 0.2966, "num_input_tokens_seen": 3873177600, "step": 59100, "train_runtime": 38806.3344, "train_tokens_per_second": 99807.871 }, { "epoch": 0.592, "grad_norm": 0.21284043788909912, "learning_rate": 0.00010911674328770559, "loss": 0.3009, "num_input_tokens_seen": 3879731200, "step": 59200, "train_runtime": 38871.8466, "train_tokens_per_second": 99808.256 }, { "epoch": 0.593, "grad_norm": 0.1655593365430832, "learning_rate": 0.00010865897224323979, "loss": 0.2981, "num_input_tokens_seen": 3886284800, "step": 59300, "train_runtime": 38937.7196, "train_tokens_per_second": 99807.714 }, { "epoch": 0.594, "grad_norm": 0.17153207957744598, "learning_rate": 0.00010820161750246453, "loss": 0.3042, "num_input_tokens_seen": 3892838400, "step": 59400, "train_runtime": 39004.8582, "train_tokens_per_second": 99803.937 }, { "epoch": 0.595, "grad_norm": 0.15362666547298431, "learning_rate": 0.00010774468367093696, "loss": 0.3001, "num_input_tokens_seen": 3899392000, "step": 59500, "train_runtime": 39068.7475, "train_tokens_per_second": 99808.472 }, { "epoch": 0.596, "grad_norm": 0.15481388568878174, "learning_rate": 0.00010728817534997573, "loss": 0.2973, "num_input_tokens_seen": 3905945600, "step": 59600, "train_runtime": 39137.2916, "train_tokens_per_second": 99801.122 }, { "epoch": 0.597, "grad_norm": 0.1292748749256134, "learning_rate": 0.00010683209713661453, "loss": 0.2993, "num_input_tokens_seen": 3912499200, "step": 59700, "train_runtime": 39198.2818, "train_tokens_per_second": 99813.028 }, { "epoch": 0.598, "grad_norm": 0.14853951334953308, "learning_rate": 0.00010637645362355589, "loss": 0.2967, "num_input_tokens_seen": 3919052800, "step": 59800, "train_runtime": 39262.6162, "train_tokens_per_second": 99816.395 }, { "epoch": 0.599, "grad_norm": 0.13745439052581787, "learning_rate": 0.00010592124939912497, "loss": 0.3023, "num_input_tokens_seen": 3925606400, "step": 59900, "train_runtime": 39328.4755, "train_tokens_per_second": 99815.88 }, { "epoch": 0.6, "grad_norm": 0.14352121949195862, "learning_rate": 0.00010546648904722326, "loss": 0.2973, "num_input_tokens_seen": 3932160000, "step": 60000, "train_runtime": 39393.6967, "train_tokens_per_second": 99816.984 }, { "epoch": 0.601, "grad_norm": 0.16375063359737396, "learning_rate": 0.0001050121771472824, "loss": 0.2934, "num_input_tokens_seen": 3938713600, "step": 60100, "train_runtime": 39465.7876, "train_tokens_per_second": 99800.709 }, { "epoch": 0.602, "grad_norm": 0.144679456949234, "learning_rate": 0.0001045583182742182, "loss": 0.2983, "num_input_tokens_seen": 3945267200, "step": 60200, "train_runtime": 39531.166, "train_tokens_per_second": 99801.438 }, { "epoch": 0.603, "grad_norm": 0.33903974294662476, "learning_rate": 0.00010410491699838448, "loss": 0.2981, "num_input_tokens_seen": 3951820800, "step": 60300, "train_runtime": 39596.8662, "train_tokens_per_second": 99801.352 }, { "epoch": 0.604, "grad_norm": 0.1823410987854004, "learning_rate": 0.00010365197788552707, "loss": 0.2986, "num_input_tokens_seen": 3958374400, "step": 60400, "train_runtime": 39664.1206, "train_tokens_per_second": 99797.357 }, { "epoch": 0.605, "grad_norm": 0.18758277595043182, "learning_rate": 0.00010319950549673778, "loss": 0.2967, "num_input_tokens_seen": 3964928000, "step": 60500, "train_runtime": 39728.4695, "train_tokens_per_second": 99800.673 }, { "epoch": 0.606, "grad_norm": 0.173909991979599, "learning_rate": 0.00010274750438840855, "loss": 0.2981, "num_input_tokens_seen": 3971481600, "step": 60600, "train_runtime": 39794.5098, "train_tokens_per_second": 99799.737 }, { "epoch": 0.607, "grad_norm": 0.14504651725292206, "learning_rate": 0.00010229597911218554, "loss": 0.2967, "num_input_tokens_seen": 3978035200, "step": 60700, "train_runtime": 39864.8024, "train_tokens_per_second": 99788.158 }, { "epoch": 0.608, "grad_norm": 0.1418026238679886, "learning_rate": 0.00010184493421492324, "loss": 0.2976, "num_input_tokens_seen": 3984588800, "step": 60800, "train_runtime": 39931.2064, "train_tokens_per_second": 99786.337 }, { "epoch": 0.609, "grad_norm": 0.18415790796279907, "learning_rate": 0.0001013943742386388, "loss": 0.2997, "num_input_tokens_seen": 3991142400, "step": 60900, "train_runtime": 39996.7127, "train_tokens_per_second": 99786.761 }, { "epoch": 0.61, "grad_norm": 0.14107364416122437, "learning_rate": 0.00010094430372046616, "loss": 0.2979, "num_input_tokens_seen": 3997696000, "step": 61000, "train_runtime": 40068.6157, "train_tokens_per_second": 99771.253 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 3997696000, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.5963643723776e+16, "train_batch_size": 256, "trial_name": null, "trial_params": null }