diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6134 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.61, + "eval_steps": 500, + "global_step": 61000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 17.39519691467285, + "learning_rate": 2.97e-05, + "loss": 9.7941, + "num_input_tokens_seen": 6553600, + "step": 100, + "train_runtime": 74.0623, + "train_tokens_per_second": 88487.632 + }, + { + "epoch": 0.002, + "grad_norm": 10.212440490722656, + "learning_rate": 5.97e-05, + "loss": 1.0389, + "num_input_tokens_seen": 13107200, + "step": 200, + "train_runtime": 135.0365, + "train_tokens_per_second": 97064.126 + }, + { + "epoch": 0.003, + "grad_norm": 6.982235908508301, + "learning_rate": 8.969999999999998e-05, + "loss": 0.7951, + "num_input_tokens_seen": 19660800, + "step": 300, + "train_runtime": 196.4342, + "train_tokens_per_second": 100088.472 + }, + { + "epoch": 0.004, + "grad_norm": 2.089735507965088, + "learning_rate": 0.0001197, + "loss": 0.6341, + "num_input_tokens_seen": 26214400, + "step": 400, + "train_runtime": 257.5653, + "train_tokens_per_second": 101777.682 + }, + { + "epoch": 0.005, + "grad_norm": 2.6269969940185547, + "learning_rate": 0.00014969999999999998, + "loss": 0.5353, + "num_input_tokens_seen": 32768000, + "step": 500, + "train_runtime": 323.599, + "train_tokens_per_second": 101261.143 + }, + { + "epoch": 0.006, + "grad_norm": 0.9126470685005188, + "learning_rate": 0.00017969999999999998, + "loss": 0.4822, + "num_input_tokens_seen": 39321600, + "step": 600, + "train_runtime": 385.3073, + "train_tokens_per_second": 102052.566 + }, + { + "epoch": 0.007, + "grad_norm": 0.7452394366264343, + "learning_rate": 0.00020969999999999997, + "loss": 0.4534, + "num_input_tokens_seen": 45875200, + "step": 700, + "train_runtime": 447.534, + "train_tokens_per_second": 102506.63 + }, + { + "epoch": 0.008, + "grad_norm": 0.6909123659133911, + "learning_rate": 0.0002397, + "loss": 0.4323, + "num_input_tokens_seen": 52428800, + "step": 800, + "train_runtime": 510.1043, + "train_tokens_per_second": 102780.558 + }, + { + "epoch": 0.009, + "grad_norm": 0.5689504146575928, + "learning_rate": 0.0002697, + "loss": 0.4262, + "num_input_tokens_seen": 58982400, + "step": 900, + "train_runtime": 571.3595, + "train_tokens_per_second": 103231.669 + }, + { + "epoch": 0.01, + "grad_norm": 0.42208704352378845, + "learning_rate": 0.00029969999999999997, + "loss": 0.4158, + "num_input_tokens_seen": 65536000, + "step": 1000, + "train_runtime": 638.5123, + "train_tokens_per_second": 102638.586 + }, + { + "epoch": 0.011, + "grad_norm": 0.4542798399925232, + "learning_rate": 0.00029999925978027874, + "loss": 0.4127, + "num_input_tokens_seen": 72089600, + "step": 1100, + "train_runtime": 698.6527, + "train_tokens_per_second": 103183.742 + }, + { + "epoch": 0.012, + "grad_norm": 0.4086480736732483, + "learning_rate": 0.0002999970091452017, + "loss": 0.4018, + "num_input_tokens_seen": 78643200, + "step": 1200, + "train_runtime": 761.7182, + "train_tokens_per_second": 103244.485 + }, + { + "epoch": 0.013, + "grad_norm": 0.37623685598373413, + "learning_rate": 0.00029999324804190795, + "loss": 0.3969, + "num_input_tokens_seen": 85196800, + "step": 1300, + "train_runtime": 827.9033, + "train_tokens_per_second": 102906.7 + }, + { + "epoch": 0.014, + "grad_norm": 0.3346163332462311, + "learning_rate": 0.0002999879765082716, + "loss": 0.3906, + "num_input_tokens_seen": 91750400, + "step": 1400, + "train_runtime": 889.5401, + "train_tokens_per_second": 103143.635 + }, + { + "epoch": 0.015, + "grad_norm": 0.4093320369720459, + "learning_rate": 0.000299981194597377, + "loss": 0.3852, + "num_input_tokens_seen": 98304000, + "step": 1500, + "train_runtime": 950.9359, + "train_tokens_per_second": 103376.055 + }, + { + "epoch": 0.016, + "grad_norm": 0.3808560371398926, + "learning_rate": 0.0002999729023775179, + "loss": 0.3819, + "num_input_tokens_seen": 104857600, + "step": 1600, + "train_runtime": 1017.4047, + "train_tokens_per_second": 103063.807 + }, + { + "epoch": 0.017, + "grad_norm": 0.3014701306819916, + "learning_rate": 0.0002999630999321969, + "loss": 0.387, + "num_input_tokens_seen": 111411200, + "step": 1700, + "train_runtime": 1075.027, + "train_tokens_per_second": 103635.721 + }, + { + "epoch": 0.018, + "grad_norm": 0.25073230266571045, + "learning_rate": 0.00029995178736012443, + "loss": 0.382, + "num_input_tokens_seen": 117964800, + "step": 1800, + "train_runtime": 1141.6684, + "train_tokens_per_second": 103326.671 + }, + { + "epoch": 0.019, + "grad_norm": 0.2569698989391327, + "learning_rate": 0.0002999389647752181, + "loss": 0.3745, + "num_input_tokens_seen": 124518400, + "step": 1900, + "train_runtime": 1202.9974, + "train_tokens_per_second": 103506.793 + }, + { + "epoch": 0.02, + "grad_norm": 0.2895148694515228, + "learning_rate": 0.00029992463230660104, + "loss": 0.3747, + "num_input_tokens_seen": 131072000, + "step": 2000, + "train_runtime": 1271.272, + "train_tokens_per_second": 103103.035 + }, + { + "epoch": 0.021, + "grad_norm": 0.28352853655815125, + "learning_rate": 0.00029990879009860117, + "loss": 0.3701, + "num_input_tokens_seen": 137625600, + "step": 2100, + "train_runtime": 1335.8501, + "train_tokens_per_second": 103024.736 + }, + { + "epoch": 0.022, + "grad_norm": 0.2598542273044586, + "learning_rate": 0.0002998914383107493, + "loss": 0.3715, + "num_input_tokens_seen": 144179200, + "step": 2200, + "train_runtime": 1400.0516, + "train_tokens_per_second": 102981.347 + }, + { + "epoch": 0.023, + "grad_norm": 0.300857275724411, + "learning_rate": 0.0002998725771177778, + "loss": 0.3723, + "num_input_tokens_seen": 150732800, + "step": 2300, + "train_runtime": 1465.03, + "train_tokens_per_second": 102887.178 + }, + { + "epoch": 0.024, + "grad_norm": 0.19827991724014282, + "learning_rate": 0.00029985220670961847, + "loss": 0.3654, + "num_input_tokens_seen": 157286400, + "step": 2400, + "train_runtime": 1534.4652, + "train_tokens_per_second": 102502.423 + }, + { + "epoch": 0.025, + "grad_norm": 0.36876365542411804, + "learning_rate": 0.0002998303272914014, + "loss": 0.368, + "num_input_tokens_seen": 163840000, + "step": 2500, + "train_runtime": 1598.5928, + "train_tokens_per_second": 102490.141 + }, + { + "epoch": 0.026, + "grad_norm": 0.23755036294460297, + "learning_rate": 0.00029980693908345185, + "loss": 0.3648, + "num_input_tokens_seen": 170393600, + "step": 2600, + "train_runtime": 1661.9675, + "train_tokens_per_second": 102525.227 + }, + { + "epoch": 0.027, + "grad_norm": 0.3921568691730499, + "learning_rate": 0.00029978204232128895, + "loss": 0.3633, + "num_input_tokens_seen": 176947200, + "step": 2700, + "train_runtime": 1731.9606, + "train_tokens_per_second": 102165.837 + }, + { + "epoch": 0.028, + "grad_norm": 0.1964094191789627, + "learning_rate": 0.0002997556372556227, + "loss": 0.365, + "num_input_tokens_seen": 183500800, + "step": 2800, + "train_runtime": 1796.4926, + "train_tokens_per_second": 102143.922 + }, + { + "epoch": 0.029, + "grad_norm": 0.2469199150800705, + "learning_rate": 0.0002997277241523519, + "loss": 0.364, + "num_input_tokens_seen": 190054400, + "step": 2900, + "train_runtime": 1860.3342, + "train_tokens_per_second": 102161.428 + }, + { + "epoch": 0.03, + "grad_norm": 0.19437766075134277, + "learning_rate": 0.00029969830329256125, + "loss": 0.3574, + "num_input_tokens_seen": 196608000, + "step": 3000, + "train_runtime": 1924.7283, + "train_tokens_per_second": 102148.444 + }, + { + "epoch": 0.031, + "grad_norm": 0.23198598623275757, + "learning_rate": 0.00029966737497251836, + "loss": 0.3599, + "num_input_tokens_seen": 203161600, + "step": 3100, + "train_runtime": 1993.345, + "train_tokens_per_second": 101919.94 + }, + { + "epoch": 0.032, + "grad_norm": 0.22857527434825897, + "learning_rate": 0.0002996349395036711, + "loss": 0.3579, + "num_input_tokens_seen": 209715200, + "step": 3200, + "train_runtime": 2057.8023, + "train_tokens_per_second": 101912.218 + }, + { + "epoch": 0.033, + "grad_norm": 0.24812710285186768, + "learning_rate": 0.00029960099721264435, + "loss": 0.3612, + "num_input_tokens_seen": 216268800, + "step": 3300, + "train_runtime": 2121.9536, + "train_tokens_per_second": 101919.666 + }, + { + "epoch": 0.034, + "grad_norm": 0.21982239186763763, + "learning_rate": 0.0002995655484412365, + "loss": 0.3554, + "num_input_tokens_seen": 222822400, + "step": 3400, + "train_runtime": 2186.6347, + "train_tokens_per_second": 101901.979 + }, + { + "epoch": 0.035, + "grad_norm": 0.3460980951786041, + "learning_rate": 0.00029952859354641636, + "loss": 0.3568, + "num_input_tokens_seen": 229376000, + "step": 3500, + "train_runtime": 2256.5384, + "train_tokens_per_second": 101649.502 + }, + { + "epoch": 0.036, + "grad_norm": 0.25577911734580994, + "learning_rate": 0.00029949013290031924, + "loss": 0.354, + "num_input_tokens_seen": 235929600, + "step": 3600, + "train_runtime": 2320.5776, + "train_tokens_per_second": 101668.483 + }, + { + "epoch": 0.037, + "grad_norm": 0.16108086705207825, + "learning_rate": 0.00029945016689024353, + "loss": 0.3509, + "num_input_tokens_seen": 242483200, + "step": 3700, + "train_runtime": 2383.8992, + "train_tokens_per_second": 101717.051 + }, + { + "epoch": 0.038, + "grad_norm": 0.2431662529706955, + "learning_rate": 0.0002994086959186464, + "loss": 0.3527, + "num_input_tokens_seen": 249036800, + "step": 3800, + "train_runtime": 2448.8427, + "train_tokens_per_second": 101695.71 + }, + { + "epoch": 0.039, + "grad_norm": 0.18574966490268707, + "learning_rate": 0.00029936572040314014, + "loss": 0.3546, + "num_input_tokens_seen": 255590400, + "step": 3900, + "train_runtime": 2518.1288, + "train_tokens_per_second": 101500.13 + }, + { + "epoch": 0.04, + "grad_norm": 0.15902996063232422, + "learning_rate": 0.0002993212407764877, + "loss": 0.3519, + "num_input_tokens_seen": 262144000, + "step": 4000, + "train_runtime": 2581.8809, + "train_tokens_per_second": 101532.18 + }, + { + "epoch": 0.041, + "grad_norm": 0.21019065380096436, + "learning_rate": 0.00029927525748659834, + "loss": 0.3567, + "num_input_tokens_seen": 268697600, + "step": 4100, + "train_runtime": 2646.5068, + "train_tokens_per_second": 101529.154 + }, + { + "epoch": 0.042, + "grad_norm": 0.18648174405097961, + "learning_rate": 0.0002992277709965234, + "loss": 0.3512, + "num_input_tokens_seen": 275251200, + "step": 4200, + "train_runtime": 2710.4754, + "train_tokens_per_second": 101550.895 + }, + { + "epoch": 0.043, + "grad_norm": 0.21123889088630676, + "learning_rate": 0.0002991787817844513, + "loss": 0.3521, + "num_input_tokens_seen": 281804800, + "step": 4300, + "train_runtime": 2780.6173, + "train_tokens_per_second": 101346.13 + }, + { + "epoch": 0.044, + "grad_norm": 0.22183509171009064, + "learning_rate": 0.0002991282903437028, + "loss": 0.3486, + "num_input_tokens_seen": 288358400, + "step": 4400, + "train_runtime": 2843.584, + "train_tokens_per_second": 101406.674 + }, + { + "epoch": 0.045, + "grad_norm": 0.19213925302028656, + "learning_rate": 0.0002990762971827262, + "loss": 0.3481, + "num_input_tokens_seen": 294912000, + "step": 4500, + "train_runtime": 2906.5309, + "train_tokens_per_second": 101465.29 + }, + { + "epoch": 0.046, + "grad_norm": 0.16215530037879944, + "learning_rate": 0.00029902280282509197, + "loss": 0.3506, + "num_input_tokens_seen": 301465600, + "step": 4600, + "train_runtime": 2977.8135, + "train_tokens_per_second": 101237.232 + }, + { + "epoch": 0.047, + "grad_norm": 0.17120705544948578, + "learning_rate": 0.0002989678078094878, + "loss": 0.3433, + "num_input_tokens_seen": 308019200, + "step": 4700, + "train_runtime": 3040.7538, + "train_tokens_per_second": 101296.988 + }, + { + "epoch": 0.048, + "grad_norm": 0.26389873027801514, + "learning_rate": 0.00029891131268971284, + "loss": 0.345, + "num_input_tokens_seen": 314572800, + "step": 4800, + "train_runtime": 3104.3446, + "train_tokens_per_second": 101333.081 + }, + { + "epoch": 0.049, + "grad_norm": 0.1639779806137085, + "learning_rate": 0.0002988533180346723, + "loss": 0.3431, + "num_input_tokens_seen": 321126400, + "step": 4900, + "train_runtime": 3172.6385, + "train_tokens_per_second": 101217.457 + }, + { + "epoch": 0.05, + "grad_norm": 0.21486082673072815, + "learning_rate": 0.0002987938244283717, + "loss": 0.3413, + "num_input_tokens_seen": 327680000, + "step": 5000, + "train_runtime": 3237.5961, + "train_tokens_per_second": 101210.896 + }, + { + "epoch": 0.051, + "grad_norm": 0.20326170325279236, + "learning_rate": 0.00029873283246991105, + "loss": 0.3457, + "num_input_tokens_seen": 334233600, + "step": 5100, + "train_runtime": 3302.3096, + "train_tokens_per_second": 101212.074 + }, + { + "epoch": 0.052, + "grad_norm": 0.171161487698555, + "learning_rate": 0.0002986703427734787, + "loss": 0.345, + "num_input_tokens_seen": 340787200, + "step": 5200, + "train_runtime": 3367.4928, + "train_tokens_per_second": 101199.089 + }, + { + "epoch": 0.053, + "grad_norm": 0.19781792163848877, + "learning_rate": 0.00029860635596834517, + "loss": 0.3455, + "num_input_tokens_seen": 347340800, + "step": 5300, + "train_runtime": 3430.9148, + "train_tokens_per_second": 101238.538 + }, + { + "epoch": 0.054, + "grad_norm": 0.1795511543750763, + "learning_rate": 0.0002985408726988569, + "loss": 0.3439, + "num_input_tokens_seen": 353894400, + "step": 5400, + "train_runtime": 3498.4556, + "train_tokens_per_second": 101157.322 + }, + { + "epoch": 0.055, + "grad_norm": 0.1671728938817978, + "learning_rate": 0.0002984738936244296, + "loss": 0.3422, + "num_input_tokens_seen": 360448000, + "step": 5500, + "train_runtime": 3561.4394, + "train_tokens_per_second": 101208.516 + }, + { + "epoch": 0.056, + "grad_norm": 0.17824003100395203, + "learning_rate": 0.0002984054194195419, + "loss": 0.3489, + "num_input_tokens_seen": 367001600, + "step": 5600, + "train_runtime": 3625.8956, + "train_tokens_per_second": 101216.814 + }, + { + "epoch": 0.057, + "grad_norm": 0.1654757708311081, + "learning_rate": 0.0002983354507737283, + "loss": 0.3463, + "num_input_tokens_seen": 373555200, + "step": 5700, + "train_runtime": 3690.173, + "train_tokens_per_second": 101229.725 + }, + { + "epoch": 0.058, + "grad_norm": 0.2033533751964569, + "learning_rate": 0.00029826398839157215, + "loss": 0.3462, + "num_input_tokens_seen": 380108800, + "step": 5800, + "train_runtime": 3759.2019, + "train_tokens_per_second": 101114.229 + }, + { + "epoch": 0.059, + "grad_norm": 0.19753150641918182, + "learning_rate": 0.000298191032992699, + "loss": 0.3436, + "num_input_tokens_seen": 386662400, + "step": 5900, + "train_runtime": 3822.1964, + "train_tokens_per_second": 101162.357 + }, + { + "epoch": 0.06, + "grad_norm": 0.13978537917137146, + "learning_rate": 0.0002981165853117688, + "loss": 0.3393, + "num_input_tokens_seen": 393216000, + "step": 6000, + "train_runtime": 3890.9859, + "train_tokens_per_second": 101058.192 + }, + { + "epoch": 0.061, + "grad_norm": 0.28539636731147766, + "learning_rate": 0.000298040646098469, + "loss": 0.3419, + "num_input_tokens_seen": 399769600, + "step": 6100, + "train_runtime": 3955.42, + "train_tokens_per_second": 101068.813 + }, + { + "epoch": 0.062, + "grad_norm": 0.14195021986961365, + "learning_rate": 0.0002979632161175064, + "loss": 0.3408, + "num_input_tokens_seen": 406323200, + "step": 6200, + "train_runtime": 4019.3462, + "train_tokens_per_second": 101091.865 + }, + { + "epoch": 0.063, + "grad_norm": 0.26058393716812134, + "learning_rate": 0.0002978842961486003, + "loss": 0.3411, + "num_input_tokens_seen": 412876800, + "step": 6300, + "train_runtime": 4082.619, + "train_tokens_per_second": 101130.379 + }, + { + "epoch": 0.064, + "grad_norm": 0.1645655333995819, + "learning_rate": 0.0002978038869864738, + "loss": 0.3392, + "num_input_tokens_seen": 419430400, + "step": 6400, + "train_runtime": 4152.2955, + "train_tokens_per_second": 101011.694 + }, + { + "epoch": 0.065, + "grad_norm": 0.1678280532360077, + "learning_rate": 0.0002977219894408463, + "loss": 0.338, + "num_input_tokens_seen": 425984000, + "step": 6500, + "train_runtime": 4215.8141, + "train_tokens_per_second": 101044.304 + }, + { + "epoch": 0.066, + "grad_norm": 0.19337573647499084, + "learning_rate": 0.0002976386043364251, + "loss": 0.3424, + "num_input_tokens_seen": 432537600, + "step": 6600, + "train_runtime": 4278.8465, + "train_tokens_per_second": 101087.432 + }, + { + "epoch": 0.067, + "grad_norm": 0.14295175671577454, + "learning_rate": 0.00029755373251289733, + "loss": 0.3443, + "num_input_tokens_seen": 439091200, + "step": 6700, + "train_runtime": 4348.6665, + "train_tokens_per_second": 100971.459 + }, + { + "epoch": 0.068, + "grad_norm": 0.22164900600910187, + "learning_rate": 0.0002974673748249213, + "loss": 0.339, + "num_input_tokens_seen": 445644800, + "step": 6800, + "train_runtime": 4413.12, + "train_tokens_per_second": 100981.799 + }, + { + "epoch": 0.069, + "grad_norm": 0.1831408590078354, + "learning_rate": 0.00029737953214211804, + "loss": 0.3398, + "num_input_tokens_seen": 452198400, + "step": 6900, + "train_runtime": 4477.6672, + "train_tokens_per_second": 100989.73 + }, + { + "epoch": 0.07, + "grad_norm": 0.21329298615455627, + "learning_rate": 0.0002972902053490623, + "loss": 0.3372, + "num_input_tokens_seen": 458752000, + "step": 7000, + "train_runtime": 4541.4752, + "train_tokens_per_second": 101013.873 + }, + { + "epoch": 0.071, + "grad_norm": 0.16601704061031342, + "learning_rate": 0.00029719939534527393, + "loss": 0.3436, + "num_input_tokens_seen": 465305600, + "step": 7100, + "train_runtime": 4607.1943, + "train_tokens_per_second": 100995.436 + }, + { + "epoch": 0.072, + "grad_norm": 0.2303948849439621, + "learning_rate": 0.00029710710304520866, + "loss": 0.339, + "num_input_tokens_seen": 471859200, + "step": 7200, + "train_runtime": 4672.0421, + "train_tokens_per_second": 100996.349 + }, + { + "epoch": 0.073, + "grad_norm": 0.21449029445648193, + "learning_rate": 0.00029701332937824885, + "loss": 0.336, + "num_input_tokens_seen": 478412800, + "step": 7300, + "train_runtime": 4742.0375, + "train_tokens_per_second": 100887.605 + }, + { + "epoch": 0.074, + "grad_norm": 0.1367533802986145, + "learning_rate": 0.0002969180752886944, + "loss": 0.3397, + "num_input_tokens_seen": 484966400, + "step": 7400, + "train_runtime": 4805.1341, + "train_tokens_per_second": 100926.716 + }, + { + "epoch": 0.075, + "grad_norm": 0.1852603256702423, + "learning_rate": 0.0002968213417357529, + "loss": 0.34, + "num_input_tokens_seen": 491520000, + "step": 7500, + "train_runtime": 4867.6611, + "train_tokens_per_second": 100976.628 + }, + { + "epoch": 0.076, + "grad_norm": 0.18590585887432098, + "learning_rate": 0.00029672312969353015, + "loss": 0.3375, + "num_input_tokens_seen": 498073600, + "step": 7600, + "train_runtime": 4938.9456, + "train_tokens_per_second": 100846.14 + }, + { + "epoch": 0.077, + "grad_norm": 0.17078232765197754, + "learning_rate": 0.00029662344015102027, + "loss": 0.3374, + "num_input_tokens_seen": 504627200, + "step": 7700, + "train_runtime": 5003.5948, + "train_tokens_per_second": 100852.931 + }, + { + "epoch": 0.078, + "grad_norm": 0.14574670791625977, + "learning_rate": 0.00029652227411209594, + "loss": 0.3369, + "num_input_tokens_seen": 511180800, + "step": 7800, + "train_runtime": 5067.2522, + "train_tokens_per_second": 100879.289 + }, + { + "epoch": 0.079, + "grad_norm": 0.1603483408689499, + "learning_rate": 0.0002964196325954979, + "loss": 0.3352, + "num_input_tokens_seen": 517734400, + "step": 7900, + "train_runtime": 5131.2908, + "train_tokens_per_second": 100897.497 + }, + { + "epoch": 0.08, + "grad_norm": 0.16576310992240906, + "learning_rate": 0.0002963155166348253, + "loss": 0.3376, + "num_input_tokens_seen": 524288000, + "step": 8000, + "train_runtime": 5200.6662, + "train_tokens_per_second": 100811.699 + }, + { + "epoch": 0.081, + "grad_norm": 0.31833919882774353, + "learning_rate": 0.0002962099272785246, + "loss": 0.3382, + "num_input_tokens_seen": 530841600, + "step": 8100, + "train_runtime": 5266.7639, + "train_tokens_per_second": 100790.849 + }, + { + "epoch": 0.082, + "grad_norm": 0.14755409955978394, + "learning_rate": 0.0002961028655898794, + "loss": 0.3348, + "num_input_tokens_seen": 537395200, + "step": 8200, + "train_runtime": 5331.3948, + "train_tokens_per_second": 100798.238 + }, + { + "epoch": 0.083, + "grad_norm": 0.2060171663761139, + "learning_rate": 0.0002959943326469998, + "loss": 0.3338, + "num_input_tokens_seen": 543948800, + "step": 8300, + "train_runtime": 5395.0396, + "train_tokens_per_second": 100823.876 + }, + { + "epoch": 0.084, + "grad_norm": 0.16461625695228577, + "learning_rate": 0.0002958843295428112, + "loss": 0.3326, + "num_input_tokens_seen": 550502400, + "step": 8400, + "train_runtime": 5458.2259, + "train_tokens_per_second": 100857.387 + }, + { + "epoch": 0.085, + "grad_norm": 0.15455660223960876, + "learning_rate": 0.0002957728573850438, + "loss": 0.3339, + "num_input_tokens_seen": 557056000, + "step": 8500, + "train_runtime": 5527.7417, + "train_tokens_per_second": 100774.607 + }, + { + "epoch": 0.086, + "grad_norm": 0.17872081696987152, + "learning_rate": 0.0002956599172962209, + "loss": 0.3404, + "num_input_tokens_seen": 563609600, + "step": 8600, + "train_runtime": 5593.3318, + "train_tokens_per_second": 100764.557 + }, + { + "epoch": 0.087, + "grad_norm": 0.19022491574287415, + "learning_rate": 0.0002955455104136479, + "loss": 0.3329, + "num_input_tokens_seen": 570163200, + "step": 8700, + "train_runtime": 5659.0887, + "train_tokens_per_second": 100751.77 + }, + { + "epoch": 0.088, + "grad_norm": 0.14710059762001038, + "learning_rate": 0.00029542963788940096, + "loss": 0.3323, + "num_input_tokens_seen": 576716800, + "step": 8800, + "train_runtime": 5722.168, + "train_tokens_per_second": 100786.415 + }, + { + "epoch": 0.089, + "grad_norm": 0.1998033970594406, + "learning_rate": 0.00029531230089031505, + "loss": 0.3378, + "num_input_tokens_seen": 583270400, + "step": 8900, + "train_runtime": 5787.7324, + "train_tokens_per_second": 100777.016 + }, + { + "epoch": 0.09, + "grad_norm": 0.125193253159523, + "learning_rate": 0.0002951935005979724, + "loss": 0.3325, + "num_input_tokens_seen": 589824000, + "step": 9000, + "train_runtime": 5855.8455, + "train_tokens_per_second": 100723.968 + }, + { + "epoch": 0.091, + "grad_norm": 0.19552631676197052, + "learning_rate": 0.0002950732382086907, + "loss": 0.3316, + "num_input_tokens_seen": 596377600, + "step": 9100, + "train_runtime": 5921.9714, + "train_tokens_per_second": 100705.923 + }, + { + "epoch": 0.092, + "grad_norm": 0.16468137502670288, + "learning_rate": 0.0002949515149335108, + "loss": 0.3349, + "num_input_tokens_seen": 602931200, + "step": 9200, + "train_runtime": 5986.1243, + "train_tokens_per_second": 100721.464 + }, + { + "epoch": 0.093, + "grad_norm": 0.1658785343170166, + "learning_rate": 0.0002948283319981848, + "loss": 0.3281, + "num_input_tokens_seen": 609484800, + "step": 9300, + "train_runtime": 6050.7028, + "train_tokens_per_second": 100729.588 + }, + { + "epoch": 0.094, + "grad_norm": 0.16668474674224854, + "learning_rate": 0.00029470369064316354, + "loss": 0.3301, + "num_input_tokens_seen": 616038400, + "step": 9400, + "train_runtime": 6115.0892, + "train_tokens_per_second": 100740.706 + }, + { + "epoch": 0.095, + "grad_norm": 0.16522246599197388, + "learning_rate": 0.00029457759212358397, + "loss": 0.3305, + "num_input_tokens_seen": 622592000, + "step": 9500, + "train_runtime": 6183.2082, + "train_tokens_per_second": 100690.77 + }, + { + "epoch": 0.096, + "grad_norm": 0.2229623645544052, + "learning_rate": 0.00029445003770925686, + "loss": 0.3289, + "num_input_tokens_seen": 629145600, + "step": 9600, + "train_runtime": 6247.5147, + "train_tokens_per_second": 100703.341 + }, + { + "epoch": 0.097, + "grad_norm": 0.16620689630508423, + "learning_rate": 0.00029432102868465367, + "loss": 0.3299, + "num_input_tokens_seen": 635699200, + "step": 9700, + "train_runtime": 6312.7504, + "train_tokens_per_second": 100700.829 + }, + { + "epoch": 0.098, + "grad_norm": 0.15970012545585632, + "learning_rate": 0.0002941905663488939, + "loss": 0.3292, + "num_input_tokens_seen": 642252800, + "step": 9800, + "train_runtime": 6382.1987, + "train_tokens_per_second": 100631.903 + }, + { + "epoch": 0.099, + "grad_norm": 0.14614014327526093, + "learning_rate": 0.0002940586520157318, + "loss": 0.3329, + "num_input_tokens_seen": 648806400, + "step": 9900, + "train_runtime": 6445.6924, + "train_tokens_per_second": 100657.362 + }, + { + "epoch": 0.1, + "grad_norm": 0.16558828949928284, + "learning_rate": 0.00029392528701354325, + "loss": 0.3286, + "num_input_tokens_seen": 655360000, + "step": 10000, + "train_runtime": 6509.151, + "train_tokens_per_second": 100682.87 + }, + { + "epoch": 0.101, + "grad_norm": 0.1442118138074875, + "learning_rate": 0.00029379047268531243, + "loss": 0.3314, + "num_input_tokens_seen": 661913600, + "step": 10100, + "train_runtime": 6575.3071, + "train_tokens_per_second": 100666.568 + }, + { + "epoch": 0.102, + "grad_norm": 0.16007182002067566, + "learning_rate": 0.00029365421038861795, + "loss": 0.3326, + "num_input_tokens_seen": 668467200, + "step": 10200, + "train_runtime": 6639.6314, + "train_tokens_per_second": 100678.359 + }, + { + "epoch": 0.103, + "grad_norm": 0.1417239010334015, + "learning_rate": 0.0002935165014956198, + "loss": 0.3292, + "num_input_tokens_seen": 675020800, + "step": 10300, + "train_runtime": 6704.2875, + "train_tokens_per_second": 100684.942 + }, + { + "epoch": 0.104, + "grad_norm": 0.20092202723026276, + "learning_rate": 0.0002933773473930448, + "loss": 0.3251, + "num_input_tokens_seen": 681574400, + "step": 10400, + "train_runtime": 6769.9733, + "train_tokens_per_second": 100676.083 + }, + { + "epoch": 0.105, + "grad_norm": 0.12387008965015411, + "learning_rate": 0.0002932367494821734, + "loss": 0.3302, + "num_input_tokens_seen": 688128000, + "step": 10500, + "train_runtime": 6840.7627, + "train_tokens_per_second": 100592.292 + }, + { + "epoch": 0.106, + "grad_norm": 0.17865417897701263, + "learning_rate": 0.00029309470917882497, + "loss": 0.328, + "num_input_tokens_seen": 694681600, + "step": 10600, + "train_runtime": 6905.9119, + "train_tokens_per_second": 100592.305 + }, + { + "epoch": 0.107, + "grad_norm": 0.14125974476337433, + "learning_rate": 0.0002929512279133437, + "loss": 0.3296, + "num_input_tokens_seen": 701235200, + "step": 10700, + "train_runtime": 6969.9941, + "train_tokens_per_second": 100607.718 + }, + { + "epoch": 0.108, + "grad_norm": 0.15725336968898773, + "learning_rate": 0.0002928063071305844, + "loss": 0.3279, + "num_input_tokens_seen": 707788800, + "step": 10800, + "train_runtime": 7032.9479, + "train_tokens_per_second": 100638.994 + }, + { + "epoch": 0.109, + "grad_norm": 0.15254800021648407, + "learning_rate": 0.0002926599482898978, + "loss": 0.3276, + "num_input_tokens_seen": 714342400, + "step": 10900, + "train_runtime": 7097.644, + "train_tokens_per_second": 100645.002 + }, + { + "epoch": 0.11, + "grad_norm": 0.23630526661872864, + "learning_rate": 0.00029251215286511573, + "loss": 0.3278, + "num_input_tokens_seen": 720896000, + "step": 11000, + "train_runtime": 7167.7206, + "train_tokens_per_second": 100575.348 + }, + { + "epoch": 0.111, + "grad_norm": 0.14799726009368896, + "learning_rate": 0.00029236292234453647, + "loss": 0.3264, + "num_input_tokens_seen": 727449600, + "step": 11100, + "train_runtime": 7232.1207, + "train_tokens_per_second": 100585.932 + }, + { + "epoch": 0.112, + "grad_norm": 0.17712198197841644, + "learning_rate": 0.0002922122582309097, + "loss": 0.3304, + "num_input_tokens_seen": 734003200, + "step": 11200, + "train_runtime": 7296.7016, + "train_tokens_per_second": 100593.835 + }, + { + "epoch": 0.113, + "grad_norm": 0.1620536595582962, + "learning_rate": 0.0002920601620414215, + "loss": 0.3266, + "num_input_tokens_seen": 740556800, + "step": 11300, + "train_runtime": 7359.3874, + "train_tokens_per_second": 100627.506 + }, + { + "epoch": 0.114, + "grad_norm": 0.1695978045463562, + "learning_rate": 0.0002919066353076786, + "loss": 0.3269, + "num_input_tokens_seen": 747110400, + "step": 11400, + "train_runtime": 7425.5624, + "train_tokens_per_second": 100613.308 + }, + { + "epoch": 0.115, + "grad_norm": 0.23728708922863007, + "learning_rate": 0.00029175167957569366, + "loss": 0.3269, + "num_input_tokens_seen": 753664000, + "step": 11500, + "train_runtime": 7489.1752, + "train_tokens_per_second": 100633.779 + }, + { + "epoch": 0.116, + "grad_norm": 0.14579418301582336, + "learning_rate": 0.0002915952964058691, + "loss": 0.3254, + "num_input_tokens_seen": 760217600, + "step": 11600, + "train_runtime": 7559.1466, + "train_tokens_per_second": 100569.237 + }, + { + "epoch": 0.117, + "grad_norm": 0.15569131076335907, + "learning_rate": 0.00029143748737298173, + "loss": 0.3309, + "num_input_tokens_seen": 766771200, + "step": 11700, + "train_runtime": 7625.7219, + "train_tokens_per_second": 100550.638 + }, + { + "epoch": 0.118, + "grad_norm": 0.15939873456954956, + "learning_rate": 0.00029127825406616677, + "loss": 0.3251, + "num_input_tokens_seen": 773324800, + "step": 11800, + "train_runtime": 7690.5664, + "train_tokens_per_second": 100554.987 + }, + { + "epoch": 0.119, + "grad_norm": 0.1355784833431244, + "learning_rate": 0.0002911175980889019, + "loss": 0.3287, + "num_input_tokens_seen": 779878400, + "step": 11900, + "train_runtime": 7753.5378, + "train_tokens_per_second": 100583.556 + }, + { + "epoch": 0.12, + "grad_norm": 0.19504176080226898, + "learning_rate": 0.00029095552105899095, + "loss": 0.325, + "num_input_tokens_seen": 786432000, + "step": 12000, + "train_runtime": 7817.9364, + "train_tokens_per_second": 100593.297 + }, + { + "epoch": 0.121, + "grad_norm": 0.1594318449497223, + "learning_rate": 0.0002907920246085478, + "loss": 0.3242, + "num_input_tokens_seen": 792985600, + "step": 12100, + "train_runtime": 7887.1116, + "train_tokens_per_second": 100541.953 + }, + { + "epoch": 0.122, + "grad_norm": 0.15172167122364044, + "learning_rate": 0.00029062711038397996, + "loss": 0.3325, + "num_input_tokens_seen": 799539200, + "step": 12200, + "train_runtime": 7952.1371, + "train_tokens_per_second": 100543.94 + }, + { + "epoch": 0.123, + "grad_norm": 0.13253241777420044, + "learning_rate": 0.00029046078004597175, + "loss": 0.3239, + "num_input_tokens_seen": 806092800, + "step": 12300, + "train_runtime": 8016.3597, + "train_tokens_per_second": 100555.966 + }, + { + "epoch": 0.124, + "grad_norm": 0.2943899929523468, + "learning_rate": 0.00029029303526946796, + "loss": 0.3238, + "num_input_tokens_seen": 812646400, + "step": 12400, + "train_runtime": 8079.6597, + "train_tokens_per_second": 100579.286 + }, + { + "epoch": 0.125, + "grad_norm": 0.1583172082901001, + "learning_rate": 0.0002901238777436565, + "loss": 0.3217, + "num_input_tokens_seen": 819200000, + "step": 12500, + "train_runtime": 8148.9297, + "train_tokens_per_second": 100528.539 + }, + { + "epoch": 0.126, + "grad_norm": 0.1598382592201233, + "learning_rate": 0.00028995330917195184, + "loss": 0.3245, + "num_input_tokens_seen": 825753600, + "step": 12600, + "train_runtime": 8213.0201, + "train_tokens_per_second": 100542.016 + }, + { + "epoch": 0.127, + "grad_norm": 0.13507018983364105, + "learning_rate": 0.00028978133127197765, + "loss": 0.3247, + "num_input_tokens_seen": 832307200, + "step": 12700, + "train_runtime": 8277.3925, + "train_tokens_per_second": 100551.859 + }, + { + "epoch": 0.128, + "grad_norm": 0.1688830703496933, + "learning_rate": 0.0002896079457755493, + "loss": 0.3258, + "num_input_tokens_seen": 838860800, + "step": 12800, + "train_runtime": 8342.3491, + "train_tokens_per_second": 100554.507 + }, + { + "epoch": 0.129, + "grad_norm": 0.2753322422504425, + "learning_rate": 0.000289433154428657, + "loss": 0.3249, + "num_input_tokens_seen": 845414400, + "step": 12900, + "train_runtime": 8406.9898, + "train_tokens_per_second": 100560.892 + }, + { + "epoch": 0.13, + "grad_norm": 0.20588786900043488, + "learning_rate": 0.0002892569589914476, + "loss": 0.3232, + "num_input_tokens_seen": 851968000, + "step": 13000, + "train_runtime": 8475.9626, + "train_tokens_per_second": 100515.781 + }, + { + "epoch": 0.131, + "grad_norm": 0.1462445855140686, + "learning_rate": 0.0002890793612382072, + "loss": 0.3239, + "num_input_tokens_seen": 858521600, + "step": 13100, + "train_runtime": 8539.9861, + "train_tokens_per_second": 100529.625 + }, + { + "epoch": 0.132, + "grad_norm": 0.11379440873861313, + "learning_rate": 0.0002889003629573432, + "loss": 0.3249, + "num_input_tokens_seen": 865075200, + "step": 13200, + "train_runtime": 8604.867, + "train_tokens_per_second": 100533.244 + }, + { + "epoch": 0.133, + "grad_norm": 0.12769202888011932, + "learning_rate": 0.00028871996595136626, + "loss": 0.327, + "num_input_tokens_seen": 871628800, + "step": 13300, + "train_runtime": 8669.3605, + "train_tokens_per_second": 100541.303 + }, + { + "epoch": 0.134, + "grad_norm": 0.14837151765823364, + "learning_rate": 0.0002885381720368723, + "loss": 0.321, + "num_input_tokens_seen": 878182400, + "step": 13400, + "train_runtime": 8738.2624, + "train_tokens_per_second": 100498.515 + }, + { + "epoch": 0.135, + "grad_norm": 0.1538904309272766, + "learning_rate": 0.000288354983044524, + "loss": 0.3207, + "num_input_tokens_seen": 884736000, + "step": 13500, + "train_runtime": 8802.2586, + "train_tokens_per_second": 100512.385 + }, + { + "epoch": 0.136, + "grad_norm": 0.12802962958812714, + "learning_rate": 0.00028817040081903245, + "loss": 0.3241, + "num_input_tokens_seen": 891289600, + "step": 13600, + "train_runtime": 8866.1163, + "train_tokens_per_second": 100527.624 + }, + { + "epoch": 0.137, + "grad_norm": 0.35466450452804565, + "learning_rate": 0.00028798442721913867, + "loss": 0.3214, + "num_input_tokens_seen": 897843200, + "step": 13700, + "train_runtime": 8930.5828, + "train_tokens_per_second": 100535.79 + }, + { + "epoch": 0.138, + "grad_norm": 0.13867586851119995, + "learning_rate": 0.00028779706411759465, + "loss": 0.3199, + "num_input_tokens_seen": 904396800, + "step": 13800, + "train_runtime": 9001.3287, + "train_tokens_per_second": 100473.7 + }, + { + "epoch": 0.139, + "grad_norm": 0.2114623785018921, + "learning_rate": 0.00028760831340114484, + "loss": 0.3234, + "num_input_tokens_seen": 910950400, + "step": 13900, + "train_runtime": 9066.3163, + "train_tokens_per_second": 100476.353 + }, + { + "epoch": 0.14, + "grad_norm": 0.14202618598937988, + "learning_rate": 0.00028741817697050683, + "loss": 0.3232, + "num_input_tokens_seen": 917504000, + "step": 14000, + "train_runtime": 9130.2003, + "train_tokens_per_second": 100491.114 + }, + { + "epoch": 0.141, + "grad_norm": 0.1686236560344696, + "learning_rate": 0.00028722665674035233, + "loss": 0.3203, + "num_input_tokens_seen": 924057600, + "step": 14100, + "train_runtime": 9195.1426, + "train_tokens_per_second": 100494.102 + }, + { + "epoch": 0.142, + "grad_norm": 0.14483292400836945, + "learning_rate": 0.0002870337546392879, + "loss": 0.3321, + "num_input_tokens_seen": 930611200, + "step": 14200, + "train_runtime": 9259.404, + "train_tokens_per_second": 100504.438 + }, + { + "epoch": 0.143, + "grad_norm": 0.12517394125461578, + "learning_rate": 0.00028683947260983576, + "loss": 0.3233, + "num_input_tokens_seen": 937164800, + "step": 14300, + "train_runtime": 9324.1454, + "train_tokens_per_second": 100509.458 + }, + { + "epoch": 0.144, + "grad_norm": 0.24776680767536163, + "learning_rate": 0.00028664381260841356, + "loss": 0.3192, + "num_input_tokens_seen": 943718400, + "step": 14400, + "train_runtime": 9393.645, + "train_tokens_per_second": 100463.494 + }, + { + "epoch": 0.145, + "grad_norm": 0.4200928807258606, + "learning_rate": 0.0002864467766053154, + "loss": 0.321, + "num_input_tokens_seen": 950272000, + "step": 14500, + "train_runtime": 9456.5857, + "train_tokens_per_second": 100487.853 + }, + { + "epoch": 0.146, + "grad_norm": 0.14573471248149872, + "learning_rate": 0.00028624836658469165, + "loss": 0.3198, + "num_input_tokens_seen": 956825600, + "step": 14600, + "train_runtime": 9525.9633, + "train_tokens_per_second": 100443.973 + }, + { + "epoch": 0.147, + "grad_norm": 0.1546989232301712, + "learning_rate": 0.00028604858454452906, + "loss": 0.3267, + "num_input_tokens_seen": 963379200, + "step": 14700, + "train_runtime": 9585.7512, + "train_tokens_per_second": 100501.169 + }, + { + "epoch": 0.148, + "grad_norm": 0.172988623380661, + "learning_rate": 0.00028584743249663057, + "loss": 0.3222, + "num_input_tokens_seen": 969932800, + "step": 14800, + "train_runtime": 9650.7111, + "train_tokens_per_second": 100503.765 + }, + { + "epoch": 0.149, + "grad_norm": 0.19345735013484955, + "learning_rate": 0.000285644912466595, + "loss": 0.3194, + "num_input_tokens_seen": 976486400, + "step": 14900, + "train_runtime": 9721.1196, + "train_tokens_per_second": 100449.994 + }, + { + "epoch": 0.15, + "grad_norm": 0.13317954540252686, + "learning_rate": 0.00028544102649379684, + "loss": 0.3236, + "num_input_tokens_seen": 983040000, + "step": 15000, + "train_runtime": 9784.7921, + "train_tokens_per_second": 100466.11 + }, + { + "epoch": 0.151, + "grad_norm": 0.17458604276180267, + "learning_rate": 0.00028523577663136556, + "loss": 0.3208, + "num_input_tokens_seen": 989593600, + "step": 15100, + "train_runtime": 9853.1273, + "train_tokens_per_second": 100434.468 + }, + { + "epoch": 0.152, + "grad_norm": 0.1358109712600708, + "learning_rate": 0.000285029164946165, + "loss": 0.3237, + "num_input_tokens_seen": 996147200, + "step": 15200, + "train_runtime": 9917.7044, + "train_tokens_per_second": 100441.307 + }, + { + "epoch": 0.153, + "grad_norm": 0.16100633144378662, + "learning_rate": 0.0002848211935187725, + "loss": 0.3267, + "num_input_tokens_seen": 1002700800, + "step": 15300, + "train_runtime": 9982.8922, + "train_tokens_per_second": 100441.914 + }, + { + "epoch": 0.154, + "grad_norm": 0.20419622957706451, + "learning_rate": 0.0002846118644434581, + "loss": 0.3193, + "num_input_tokens_seen": 1009254400, + "step": 15400, + "train_runtime": 10046.3454, + "train_tokens_per_second": 100459.855 + }, + { + "epoch": 0.155, + "grad_norm": 0.17805695533752441, + "learning_rate": 0.00028440117982816326, + "loss": 0.3159, + "num_input_tokens_seen": 1015808000, + "step": 15500, + "train_runtime": 10110.0124, + "train_tokens_per_second": 100475.446 + }, + { + "epoch": 0.156, + "grad_norm": 0.17533563077449799, + "learning_rate": 0.0002841891417944796, + "loss": 0.3216, + "num_input_tokens_seen": 1022361600, + "step": 15600, + "train_runtime": 10178.7469, + "train_tokens_per_second": 100440.812 + }, + { + "epoch": 0.157, + "grad_norm": 0.13143610954284668, + "learning_rate": 0.0002839757524776279, + "loss": 0.3234, + "num_input_tokens_seen": 1028915200, + "step": 15700, + "train_runtime": 10243.1395, + "train_tokens_per_second": 100449.203 + }, + { + "epoch": 0.158, + "grad_norm": 0.13563373684883118, + "learning_rate": 0.0002837610140264361, + "loss": 0.3194, + "num_input_tokens_seen": 1035468800, + "step": 15800, + "train_runtime": 10307.5423, + "train_tokens_per_second": 100457.39 + }, + { + "epoch": 0.159, + "grad_norm": 0.14616088569164276, + "learning_rate": 0.0002835449286033182, + "loss": 0.3178, + "num_input_tokens_seen": 1042022400, + "step": 15900, + "train_runtime": 10378.0909, + "train_tokens_per_second": 100405.982 + }, + { + "epoch": 0.16, + "grad_norm": 0.1539888232946396, + "learning_rate": 0.0002833274983842518, + "loss": 0.3156, + "num_input_tokens_seen": 1048576000, + "step": 16000, + "train_runtime": 10441.484, + "train_tokens_per_second": 100424.039 + }, + { + "epoch": 0.161, + "grad_norm": 0.15786372125148773, + "learning_rate": 0.0002831087255587569, + "loss": 0.318, + "num_input_tokens_seen": 1055129600, + "step": 16100, + "train_runtime": 10505.72, + "train_tokens_per_second": 100433.821 + }, + { + "epoch": 0.162, + "grad_norm": 0.14359760284423828, + "learning_rate": 0.0002828886123298734, + "loss": 0.3179, + "num_input_tokens_seen": 1061683200, + "step": 16200, + "train_runtime": 10570.7713, + "train_tokens_per_second": 100435.736 + }, + { + "epoch": 0.163, + "grad_norm": 0.1415397673845291, + "learning_rate": 0.00028266716091413906, + "loss": 0.32, + "num_input_tokens_seen": 1068236800, + "step": 16300, + "train_runtime": 10635.2645, + "train_tokens_per_second": 100442.899 + }, + { + "epoch": 0.164, + "grad_norm": 0.1199110895395279, + "learning_rate": 0.0002824443735415673, + "loss": 0.3188, + "num_input_tokens_seen": 1074790400, + "step": 16400, + "train_runtime": 10704.7074, + "train_tokens_per_second": 100403.529 + }, + { + "epoch": 0.165, + "grad_norm": 0.18369431793689728, + "learning_rate": 0.0002822202524556243, + "loss": 0.3208, + "num_input_tokens_seen": 1081344000, + "step": 16500, + "train_runtime": 10770.1863, + "train_tokens_per_second": 100401.606 + }, + { + "epoch": 0.166, + "grad_norm": 0.2615172266960144, + "learning_rate": 0.00028199479991320695, + "loss": 0.3224, + "num_input_tokens_seen": 1087897600, + "step": 16600, + "train_runtime": 10834.6749, + "train_tokens_per_second": 100408.883 + }, + { + "epoch": 0.167, + "grad_norm": 0.1250002384185791, + "learning_rate": 0.00028176801818461994, + "loss": 0.3171, + "num_input_tokens_seen": 1094451200, + "step": 16700, + "train_runtime": 10899.3075, + "train_tokens_per_second": 100414.747 + }, + { + "epoch": 0.168, + "grad_norm": 0.14198775589466095, + "learning_rate": 0.00028153990955355273, + "loss": 0.3194, + "num_input_tokens_seen": 1101004800, + "step": 16800, + "train_runtime": 10964.3423, + "train_tokens_per_second": 100416.858 + }, + { + "epoch": 0.169, + "grad_norm": 0.14076939225196838, + "learning_rate": 0.00028131047631705665, + "loss": 0.3189, + "num_input_tokens_seen": 1107558400, + "step": 16900, + "train_runtime": 11033.6033, + "train_tokens_per_second": 100380.48 + }, + { + "epoch": 0.17, + "grad_norm": 0.13334921002388, + "learning_rate": 0.00028107972078552187, + "loss": 0.3198, + "num_input_tokens_seen": 1114112000, + "step": 17000, + "train_runtime": 11098.612, + "train_tokens_per_second": 100383.003 + }, + { + "epoch": 0.171, + "grad_norm": 0.13615840673446655, + "learning_rate": 0.0002808476452826541, + "loss": 0.3168, + "num_input_tokens_seen": 1120665600, + "step": 17100, + "train_runtime": 11161.3832, + "train_tokens_per_second": 100405.62 + }, + { + "epoch": 0.172, + "grad_norm": 0.14747090637683868, + "learning_rate": 0.00028061425214545094, + "loss": 0.3163, + "num_input_tokens_seen": 1127219200, + "step": 17200, + "train_runtime": 11231.5954, + "train_tokens_per_second": 100361.45 + }, + { + "epoch": 0.173, + "grad_norm": 0.15957149863243103, + "learning_rate": 0.00028037954372417883, + "loss": 0.317, + "num_input_tokens_seen": 1133772800, + "step": 17300, + "train_runtime": 11295.5019, + "train_tokens_per_second": 100373.831 + }, + { + "epoch": 0.174, + "grad_norm": 0.20420241355895996, + "learning_rate": 0.0002801435223823488, + "loss": 0.3207, + "num_input_tokens_seen": 1140326400, + "step": 17400, + "train_runtime": 11360.8649, + "train_tokens_per_second": 100373.203 + }, + { + "epoch": 0.175, + "grad_norm": 0.20070046186447144, + "learning_rate": 0.00027990619049669336, + "loss": 0.3206, + "num_input_tokens_seen": 1146880000, + "step": 17500, + "train_runtime": 11424.854, + "train_tokens_per_second": 100384.652 + }, + { + "epoch": 0.176, + "grad_norm": 0.13903649151325226, + "learning_rate": 0.00027966755045714177, + "loss": 0.3227, + "num_input_tokens_seen": 1153433600, + "step": 17600, + "train_runtime": 11488.6874, + "train_tokens_per_second": 100397.336 + }, + { + "epoch": 0.177, + "grad_norm": 0.15853877365589142, + "learning_rate": 0.00027942760466679673, + "loss": 0.3168, + "num_input_tokens_seen": 1159987200, + "step": 17700, + "train_runtime": 11559.2862, + "train_tokens_per_second": 100351.11 + }, + { + "epoch": 0.178, + "grad_norm": 0.14262589812278748, + "learning_rate": 0.00027918635554190956, + "loss": 0.3235, + "num_input_tokens_seen": 1166540800, + "step": 17800, + "train_runtime": 11622.4751, + "train_tokens_per_second": 100369.395 + }, + { + "epoch": 0.179, + "grad_norm": 0.14338357746601105, + "learning_rate": 0.00027894380551185636, + "loss": 0.3204, + "num_input_tokens_seen": 1173094400, + "step": 17900, + "train_runtime": 11687.9668, + "train_tokens_per_second": 100367.705 + }, + { + "epoch": 0.18, + "grad_norm": 0.12374505400657654, + "learning_rate": 0.00027869995701911314, + "loss": 0.3156, + "num_input_tokens_seen": 1179648000, + "step": 18000, + "train_runtime": 11751.6619, + "train_tokens_per_second": 100381.377 + }, + { + "epoch": 0.181, + "grad_norm": 0.11708634346723557, + "learning_rate": 0.0002784548125192316, + "loss": 0.3145, + "num_input_tokens_seen": 1186201600, + "step": 18100, + "train_runtime": 11816.0633, + "train_tokens_per_second": 100388.9 + }, + { + "epoch": 0.182, + "grad_norm": 0.1318449079990387, + "learning_rate": 0.0002782083744808141, + "loss": 0.3159, + "num_input_tokens_seen": 1192755200, + "step": 18200, + "train_runtime": 11887.7736, + "train_tokens_per_second": 100334.616 + }, + { + "epoch": 0.183, + "grad_norm": 0.3383175730705261, + "learning_rate": 0.000277960645385489, + "loss": 0.3191, + "num_input_tokens_seen": 1199308800, + "step": 18300, + "train_runtime": 11953.3207, + "train_tokens_per_second": 100332.688 + }, + { + "epoch": 0.184, + "grad_norm": 0.13779285550117493, + "learning_rate": 0.00027771162772788544, + "loss": 0.3168, + "num_input_tokens_seen": 1205862400, + "step": 18400, + "train_runtime": 12016.7432, + "train_tokens_per_second": 100348.521 + }, + { + "epoch": 0.185, + "grad_norm": 0.15161630511283875, + "learning_rate": 0.00027746132401560857, + "loss": 0.3146, + "num_input_tokens_seen": 1212416000, + "step": 18500, + "train_runtime": 12081.3443, + "train_tokens_per_second": 100354.395 + }, + { + "epoch": 0.186, + "grad_norm": 0.1523953378200531, + "learning_rate": 0.0002772097367692139, + "loss": 0.3172, + "num_input_tokens_seen": 1218969600, + "step": 18600, + "train_runtime": 12145.9663, + "train_tokens_per_second": 100360.035 + }, + { + "epoch": 0.187, + "grad_norm": 0.12802754342556, + "learning_rate": 0.00027695686852218226, + "loss": 0.3198, + "num_input_tokens_seen": 1225523200, + "step": 18700, + "train_runtime": 12215.5887, + "train_tokens_per_second": 100324.53 + }, + { + "epoch": 0.188, + "grad_norm": 0.13653679192066193, + "learning_rate": 0.00027670272182089416, + "loss": 0.319, + "num_input_tokens_seen": 1232076800, + "step": 18800, + "train_runtime": 12280.146, + "train_tokens_per_second": 100330.794 + }, + { + "epoch": 0.189, + "grad_norm": 0.15152159333229065, + "learning_rate": 0.0002764472992246039, + "loss": 0.3165, + "num_input_tokens_seen": 1238630400, + "step": 18900, + "train_runtime": 12344.6292, + "train_tokens_per_second": 100337.594 + }, + { + "epoch": 0.19, + "grad_norm": 0.13211041688919067, + "learning_rate": 0.0002761906033054143, + "loss": 0.3161, + "num_input_tokens_seen": 1245184000, + "step": 19000, + "train_runtime": 12407.4556, + "train_tokens_per_second": 100357.724 + }, + { + "epoch": 0.191, + "grad_norm": 0.19933822751045227, + "learning_rate": 0.00027593263664825045, + "loss": 0.3173, + "num_input_tokens_seen": 1251737600, + "step": 19100, + "train_runtime": 12472.5241, + "train_tokens_per_second": 100359.606 + }, + { + "epoch": 0.192, + "grad_norm": 0.1472938358783722, + "learning_rate": 0.00027567340185083363, + "loss": 0.3157, + "num_input_tokens_seen": 1258291200, + "step": 19200, + "train_runtime": 12542.0532, + "train_tokens_per_second": 100325.774 + }, + { + "epoch": 0.193, + "grad_norm": 0.1466071903705597, + "learning_rate": 0.00027541290152365537, + "loss": 0.3188, + "num_input_tokens_seen": 1264844800, + "step": 19300, + "train_runtime": 12606.5735, + "train_tokens_per_second": 100332.164 + }, + { + "epoch": 0.194, + "grad_norm": 0.1384386122226715, + "learning_rate": 0.00027515113828995117, + "loss": 0.318, + "num_input_tokens_seen": 1271398400, + "step": 19400, + "train_runtime": 12672.5058, + "train_tokens_per_second": 100327.309 + }, + { + "epoch": 0.195, + "grad_norm": 0.16287657618522644, + "learning_rate": 0.00027488811478567374, + "loss": 0.3153, + "num_input_tokens_seen": 1277952000, + "step": 19500, + "train_runtime": 12735.4985, + "train_tokens_per_second": 100345.66 + }, + { + "epoch": 0.196, + "grad_norm": 0.14955779910087585, + "learning_rate": 0.0002746238336594671, + "loss": 0.3144, + "num_input_tokens_seen": 1284505600, + "step": 19600, + "train_runtime": 12804.8911, + "train_tokens_per_second": 100313.669 + }, + { + "epoch": 0.197, + "grad_norm": 0.15176887810230255, + "learning_rate": 0.00027435829757263894, + "loss": 0.3172, + "num_input_tokens_seen": 1291059200, + "step": 19700, + "train_runtime": 12869.0984, + "train_tokens_per_second": 100322.428 + }, + { + "epoch": 0.198, + "grad_norm": 0.12215608358383179, + "learning_rate": 0.0002740915091991349, + "loss": 0.3182, + "num_input_tokens_seen": 1297612800, + "step": 19800, + "train_runtime": 12932.8746, + "train_tokens_per_second": 100334.446 + }, + { + "epoch": 0.199, + "grad_norm": 0.248954638838768, + "learning_rate": 0.0002738234712255109, + "loss": 0.3171, + "num_input_tokens_seen": 1304166400, + "step": 19900, + "train_runtime": 13003.7739, + "train_tokens_per_second": 100291.378 + }, + { + "epoch": 0.2, + "grad_norm": 0.18855011463165283, + "learning_rate": 0.00027355418635090635, + "loss": 0.3181, + "num_input_tokens_seen": 1310720000, + "step": 20000, + "train_runtime": 13068.3505, + "train_tokens_per_second": 100297.279 + }, + { + "epoch": 0.201, + "grad_norm": 0.17624643445014954, + "learning_rate": 0.000273283657287017, + "loss": 0.3147, + "num_input_tokens_seen": 1317273600, + "step": 20100, + "train_runtime": 13133.7291, + "train_tokens_per_second": 100296.998 + }, + { + "epoch": 0.202, + "grad_norm": 0.12586164474487305, + "learning_rate": 0.00027301188675806745, + "loss": 0.3203, + "num_input_tokens_seen": 1323827200, + "step": 20200, + "train_runtime": 13197.5369, + "train_tokens_per_second": 100308.657 + }, + { + "epoch": 0.203, + "grad_norm": 0.13073797523975372, + "learning_rate": 0.0002727388775007839, + "loss": 0.3149, + "num_input_tokens_seen": 1330380800, + "step": 20300, + "train_runtime": 13261.8266, + "train_tokens_per_second": 100316.558 + }, + { + "epoch": 0.204, + "grad_norm": 0.12983232736587524, + "learning_rate": 0.0002724646322643666, + "loss": 0.3157, + "num_input_tokens_seen": 1336934400, + "step": 20400, + "train_runtime": 13325.295, + "train_tokens_per_second": 100330.567 + }, + { + "epoch": 0.205, + "grad_norm": 0.2400187999010086, + "learning_rate": 0.000272189153810462, + "loss": 0.3178, + "num_input_tokens_seen": 1343488000, + "step": 20500, + "train_runtime": 13395.2424, + "train_tokens_per_second": 100295.908 + }, + { + "epoch": 0.206, + "grad_norm": 0.11757266521453857, + "learning_rate": 0.0002719124449131351, + "loss": 0.3164, + "num_input_tokens_seen": 1350041600, + "step": 20600, + "train_runtime": 13459.4754, + "train_tokens_per_second": 100304.177 + }, + { + "epoch": 0.207, + "grad_norm": 0.1606636494398117, + "learning_rate": 0.00027163450835884144, + "loss": 0.3146, + "num_input_tokens_seen": 1356595200, + "step": 20700, + "train_runtime": 13524.1715, + "train_tokens_per_second": 100308.932 + }, + { + "epoch": 0.208, + "grad_norm": 0.1295078545808792, + "learning_rate": 0.00027135534694639894, + "loss": 0.3175, + "num_input_tokens_seen": 1363148800, + "step": 20800, + "train_runtime": 13588.4538, + "train_tokens_per_second": 100316.697 + }, + { + "epoch": 0.209, + "grad_norm": 0.18409083783626556, + "learning_rate": 0.00027107496348696003, + "loss": 0.3189, + "num_input_tokens_seen": 1369702400, + "step": 20900, + "train_runtime": 13653.2417, + "train_tokens_per_second": 100320.673 + }, + { + "epoch": 0.21, + "grad_norm": 0.12083840370178223, + "learning_rate": 0.00027079336080398296, + "loss": 0.3139, + "num_input_tokens_seen": 1376256000, + "step": 21000, + "train_runtime": 13723.0075, + "train_tokens_per_second": 100288.22 + }, + { + "epoch": 0.211, + "grad_norm": 0.16270384192466736, + "learning_rate": 0.00027051054173320366, + "loss": 0.3147, + "num_input_tokens_seen": 1382809600, + "step": 21100, + "train_runtime": 13787.7693, + "train_tokens_per_second": 100292.482 + }, + { + "epoch": 0.212, + "grad_norm": 0.12299864739179611, + "learning_rate": 0.000270226509122607, + "loss": 0.3137, + "num_input_tokens_seen": 1389363200, + "step": 21200, + "train_runtime": 13851.6298, + "train_tokens_per_second": 100303.229 + }, + { + "epoch": 0.213, + "grad_norm": 0.12248677760362625, + "learning_rate": 0.0002699412658323983, + "loss": 0.3177, + "num_input_tokens_seen": 1395916800, + "step": 21300, + "train_runtime": 13915.8434, + "train_tokens_per_second": 100311.333 + }, + { + "epoch": 0.214, + "grad_norm": 0.13090935349464417, + "learning_rate": 0.00026965481473497423, + "loss": 0.3146, + "num_input_tokens_seen": 1402470400, + "step": 21400, + "train_runtime": 13985.645, + "train_tokens_per_second": 100279.28 + }, + { + "epoch": 0.215, + "grad_norm": 0.1279245913028717, + "learning_rate": 0.0002693671587148942, + "loss": 0.3128, + "num_input_tokens_seen": 1409024000, + "step": 21500, + "train_runtime": 14050.4506, + "train_tokens_per_second": 100283.19 + }, + { + "epoch": 0.216, + "grad_norm": 0.15504342317581177, + "learning_rate": 0.0002690783006688511, + "loss": 0.3145, + "num_input_tokens_seen": 1415577600, + "step": 21600, + "train_runtime": 14115.855, + "train_tokens_per_second": 100282.81 + }, + { + "epoch": 0.217, + "grad_norm": 0.1325046420097351, + "learning_rate": 0.0002687882435056423, + "loss": 0.3138, + "num_input_tokens_seen": 1422131200, + "step": 21700, + "train_runtime": 14179.61, + "train_tokens_per_second": 100294.098 + }, + { + "epoch": 0.218, + "grad_norm": 0.17374184727668762, + "learning_rate": 0.0002684969901461402, + "loss": 0.3179, + "num_input_tokens_seen": 1428684800, + "step": 21800, + "train_runtime": 14245.0199, + "train_tokens_per_second": 100293.633 + }, + { + "epoch": 0.219, + "grad_norm": 0.16908228397369385, + "learning_rate": 0.000268204543523263, + "loss": 0.3182, + "num_input_tokens_seen": 1435238400, + "step": 21900, + "train_runtime": 14310.1147, + "train_tokens_per_second": 100295.381 + }, + { + "epoch": 0.22, + "grad_norm": 0.15052039921283722, + "learning_rate": 0.0002679109065819447, + "loss": 0.3148, + "num_input_tokens_seen": 1441792000, + "step": 22000, + "train_runtime": 14374.221, + "train_tokens_per_second": 100304.01 + }, + { + "epoch": 0.221, + "grad_norm": 0.1661474108695984, + "learning_rate": 0.0002676160822791062, + "loss": 0.3142, + "num_input_tokens_seen": 1448345600, + "step": 22100, + "train_runtime": 14445.9108, + "train_tokens_per_second": 100259.902 + }, + { + "epoch": 0.222, + "grad_norm": 0.16423378884792328, + "learning_rate": 0.00026732007358362496, + "loss": 0.323, + "num_input_tokens_seen": 1454899200, + "step": 22200, + "train_runtime": 14510.5733, + "train_tokens_per_second": 100264.763 + }, + { + "epoch": 0.223, + "grad_norm": 0.14868460595607758, + "learning_rate": 0.0002670228834763052, + "loss": 0.3155, + "num_input_tokens_seen": 1461452800, + "step": 22300, + "train_runtime": 14575.7382, + "train_tokens_per_second": 100266.126 + }, + { + "epoch": 0.224, + "grad_norm": 0.1287386268377304, + "learning_rate": 0.00026672451494984804, + "loss": 0.3152, + "num_input_tokens_seen": 1468006400, + "step": 22400, + "train_runtime": 14639.7379, + "train_tokens_per_second": 100275.456 + }, + { + "epoch": 0.225, + "grad_norm": 0.14276720583438873, + "learning_rate": 0.0002664249710088213, + "loss": 0.3131, + "num_input_tokens_seen": 1474560000, + "step": 22500, + "train_runtime": 14703.588, + "train_tokens_per_second": 100285.726 + }, + { + "epoch": 0.226, + "grad_norm": 0.1419740915298462, + "learning_rate": 0.00026612425466962893, + "loss": 0.3112, + "num_input_tokens_seen": 1481113600, + "step": 22600, + "train_runtime": 14773.1939, + "train_tokens_per_second": 100256.83 + }, + { + "epoch": 0.227, + "grad_norm": 0.12067803740501404, + "learning_rate": 0.00026582236896048134, + "loss": 0.3122, + "num_input_tokens_seen": 1487667200, + "step": 22700, + "train_runtime": 14837.1829, + "train_tokens_per_second": 100266.15 + }, + { + "epoch": 0.228, + "grad_norm": 0.1338939219713211, + "learning_rate": 0.00026551931692136413, + "loss": 0.3128, + "num_input_tokens_seen": 1494220800, + "step": 22800, + "train_runtime": 14900.9562, + "train_tokens_per_second": 100276.84 + }, + { + "epoch": 0.229, + "grad_norm": 0.16754469275474548, + "learning_rate": 0.00026521510160400804, + "loss": 0.3133, + "num_input_tokens_seen": 1500774400, + "step": 22900, + "train_runtime": 14965.1238, + "train_tokens_per_second": 100284.797 + }, + { + "epoch": 0.23, + "grad_norm": 0.12648451328277588, + "learning_rate": 0.00026490972607185793, + "loss": 0.311, + "num_input_tokens_seen": 1507328000, + "step": 23000, + "train_runtime": 15034.861, + "train_tokens_per_second": 100255.533 + }, + { + "epoch": 0.231, + "grad_norm": 0.12040221691131592, + "learning_rate": 0.0002646031934000421, + "loss": 0.3166, + "num_input_tokens_seen": 1513881600, + "step": 23100, + "train_runtime": 15099.2676, + "train_tokens_per_second": 100261.922 + }, + { + "epoch": 0.232, + "grad_norm": 0.12486282736063004, + "learning_rate": 0.00026429550667534095, + "loss": 0.3151, + "num_input_tokens_seen": 1520435200, + "step": 23200, + "train_runtime": 15164.1184, + "train_tokens_per_second": 100265.321 + }, + { + "epoch": 0.233, + "grad_norm": 0.18211719393730164, + "learning_rate": 0.0002639866689961565, + "loss": 0.3117, + "num_input_tokens_seen": 1526988800, + "step": 23300, + "train_runtime": 15229.7058, + "train_tokens_per_second": 100263.841 + }, + { + "epoch": 0.234, + "grad_norm": 0.13128802180290222, + "learning_rate": 0.00026367668347248083, + "loss": 0.3125, + "num_input_tokens_seen": 1533542400, + "step": 23400, + "train_runtime": 15293.6404, + "train_tokens_per_second": 100273.209 + }, + { + "epoch": 0.235, + "grad_norm": 0.11493753641843796, + "learning_rate": 0.0002633655532258646, + "loss": 0.317, + "num_input_tokens_seen": 1540096000, + "step": 23500, + "train_runtime": 15365.113, + "train_tokens_per_second": 100233.301 + }, + { + "epoch": 0.236, + "grad_norm": 0.15309779345989227, + "learning_rate": 0.000263053281389386, + "loss": 0.3136, + "num_input_tokens_seen": 1546649600, + "step": 23600, + "train_runtime": 15428.6523, + "train_tokens_per_second": 100245.282 + }, + { + "epoch": 0.237, + "grad_norm": 0.15829730033874512, + "learning_rate": 0.0002627398711076189, + "loss": 0.3098, + "num_input_tokens_seen": 1553203200, + "step": 23700, + "train_runtime": 15493.1944, + "train_tokens_per_second": 100250.675 + }, + { + "epoch": 0.238, + "grad_norm": 0.13252806663513184, + "learning_rate": 0.0002624253255366014, + "loss": 0.3096, + "num_input_tokens_seen": 1559756800, + "step": 23800, + "train_runtime": 15556.5037, + "train_tokens_per_second": 100263.969 + }, + { + "epoch": 0.239, + "grad_norm": 0.18889528512954712, + "learning_rate": 0.0002621096478438039, + "loss": 0.3146, + "num_input_tokens_seen": 1566310400, + "step": 23900, + "train_runtime": 15621.7412, + "train_tokens_per_second": 100264.777 + }, + { + "epoch": 0.24, + "grad_norm": 0.16285447776317596, + "learning_rate": 0.00026179284120809727, + "loss": 0.3168, + "num_input_tokens_seen": 1572864000, + "step": 24000, + "train_runtime": 15687.4424, + "train_tokens_per_second": 100262.615 + }, + { + "epoch": 0.241, + "grad_norm": 0.14852070808410645, + "learning_rate": 0.0002614749088197208, + "loss": 0.3115, + "num_input_tokens_seen": 1579417600, + "step": 24100, + "train_runtime": 15752.1472, + "train_tokens_per_second": 100266.813 + }, + { + "epoch": 0.242, + "grad_norm": 0.22735795378684998, + "learning_rate": 0.00026115585388025015, + "loss": 0.3099, + "num_input_tokens_seen": 1585971200, + "step": 24200, + "train_runtime": 15823.0117, + "train_tokens_per_second": 100231.943 + }, + { + "epoch": 0.243, + "grad_norm": 0.16086964309215546, + "learning_rate": 0.00026083567960256493, + "loss": 0.3107, + "num_input_tokens_seen": 1592524800, + "step": 24300, + "train_runtime": 15889.3517, + "train_tokens_per_second": 100225.914 + }, + { + "epoch": 0.244, + "grad_norm": 0.15085358917713165, + "learning_rate": 0.00026051438921081667, + "loss": 0.3112, + "num_input_tokens_seen": 1599078400, + "step": 24400, + "train_runtime": 15954.2137, + "train_tokens_per_second": 100229.22 + }, + { + "epoch": 0.245, + "grad_norm": 0.14889656007289886, + "learning_rate": 0.00026019198594039595, + "loss": 0.3147, + "num_input_tokens_seen": 1605632000, + "step": 24500, + "train_runtime": 16020.1883, + "train_tokens_per_second": 100225.539 + }, + { + "epoch": 0.246, + "grad_norm": 0.15055876970291138, + "learning_rate": 0.00025986847303790026, + "loss": 0.3125, + "num_input_tokens_seen": 1612185600, + "step": 24600, + "train_runtime": 16084.1346, + "train_tokens_per_second": 100234.525 + }, + { + "epoch": 0.247, + "grad_norm": 0.14507324993610382, + "learning_rate": 0.00025954385376110076, + "loss": 0.3115, + "num_input_tokens_seen": 1618739200, + "step": 24700, + "train_runtime": 16148.9618, + "train_tokens_per_second": 100237.973 + }, + { + "epoch": 0.248, + "grad_norm": 0.1229107677936554, + "learning_rate": 0.00025921813137891005, + "loss": 0.3147, + "num_input_tokens_seen": 1625292800, + "step": 24800, + "train_runtime": 16214.7466, + "train_tokens_per_second": 100235.473 + }, + { + "epoch": 0.249, + "grad_norm": 0.1423114389181137, + "learning_rate": 0.000258891309171349, + "loss": 0.3127, + "num_input_tokens_seen": 1631846400, + "step": 24900, + "train_runtime": 16278.9968, + "train_tokens_per_second": 100242.442 + }, + { + "epoch": 0.25, + "grad_norm": 0.15807275474071503, + "learning_rate": 0.00025856339042951344, + "loss": 0.3088, + "num_input_tokens_seen": 1638400000, + "step": 25000, + "train_runtime": 16343.5944, + "train_tokens_per_second": 100247.226 + }, + { + "epoch": 0.251, + "grad_norm": 0.15635885298252106, + "learning_rate": 0.0002582343784555415, + "loss": 0.3105, + "num_input_tokens_seen": 1644953600, + "step": 25100, + "train_runtime": 16414.1861, + "train_tokens_per_second": 100215.362 + }, + { + "epoch": 0.252, + "grad_norm": 0.13579483330249786, + "learning_rate": 0.00025790427656258017, + "loss": 0.3159, + "num_input_tokens_seen": 1651507200, + "step": 25200, + "train_runtime": 16478.0373, + "train_tokens_per_second": 100224.752 + }, + { + "epoch": 0.253, + "grad_norm": 0.14977572858333588, + "learning_rate": 0.00025757308807475185, + "loss": 0.3115, + "num_input_tokens_seen": 1658060800, + "step": 25300, + "train_runtime": 16542.7006, + "train_tokens_per_second": 100229.149 + }, + { + "epoch": 0.254, + "grad_norm": 0.1324361115694046, + "learning_rate": 0.00025724081632712086, + "loss": 0.3108, + "num_input_tokens_seen": 1664614400, + "step": 25400, + "train_runtime": 16607.2591, + "train_tokens_per_second": 100234.144 + }, + { + "epoch": 0.255, + "grad_norm": 0.12053392827510834, + "learning_rate": 0.0002569074646656601, + "loss": 0.3081, + "num_input_tokens_seen": 1671168000, + "step": 25500, + "train_runtime": 16676.4765, + "train_tokens_per_second": 100211.096 + }, + { + "epoch": 0.256, + "grad_norm": 0.16214688122272491, + "learning_rate": 0.00025657303644721695, + "loss": 0.3154, + "num_input_tokens_seen": 1677721600, + "step": 25600, + "train_runtime": 16741.4269, + "train_tokens_per_second": 100213.776 + }, + { + "epoch": 0.257, + "grad_norm": 0.13730435073375702, + "learning_rate": 0.00025623753503948004, + "loss": 0.3159, + "num_input_tokens_seen": 1684275200, + "step": 25700, + "train_runtime": 16805.4849, + "train_tokens_per_second": 100221.755 + }, + { + "epoch": 0.258, + "grad_norm": 0.16218283772468567, + "learning_rate": 0.00025590096382094475, + "loss": 0.3111, + "num_input_tokens_seen": 1690828800, + "step": 25800, + "train_runtime": 16869.8548, + "train_tokens_per_second": 100227.821 + }, + { + "epoch": 0.259, + "grad_norm": 0.15016646683216095, + "learning_rate": 0.00025556332618087945, + "loss": 0.3106, + "num_input_tokens_seen": 1697382400, + "step": 25900, + "train_runtime": 16938.0105, + "train_tokens_per_second": 100211.439 + }, + { + "epoch": 0.26, + "grad_norm": 0.1398506760597229, + "learning_rate": 0.00025522462551929155, + "loss": 0.313, + "num_input_tokens_seen": 1703936000, + "step": 26000, + "train_runtime": 17003.6995, + "train_tokens_per_second": 100209.722 + }, + { + "epoch": 0.261, + "grad_norm": 0.12380320578813553, + "learning_rate": 0.00025488486524689283, + "loss": 0.3133, + "num_input_tokens_seen": 1710489600, + "step": 26100, + "train_runtime": 17069.3522, + "train_tokens_per_second": 100208.232 + }, + { + "epoch": 0.262, + "grad_norm": 0.14536257088184357, + "learning_rate": 0.00025454404878506555, + "loss": 0.3115, + "num_input_tokens_seen": 1717043200, + "step": 26200, + "train_runtime": 17132.7395, + "train_tokens_per_second": 100220.003 + }, + { + "epoch": 0.263, + "grad_norm": 0.14442390203475952, + "learning_rate": 0.0002542021795658276, + "loss": 0.311, + "num_input_tokens_seen": 1723596800, + "step": 26300, + "train_runtime": 17196.4745, + "train_tokens_per_second": 100229.66 + }, + { + "epoch": 0.264, + "grad_norm": 0.12595972418785095, + "learning_rate": 0.0002538592610317984, + "loss": 0.3118, + "num_input_tokens_seen": 1730150400, + "step": 26400, + "train_runtime": 17266.9358, + "train_tokens_per_second": 100200.199 + }, + { + "epoch": 0.265, + "grad_norm": 0.1587669402360916, + "learning_rate": 0.00025351529663616355, + "loss": 0.3132, + "num_input_tokens_seen": 1736704000, + "step": 26500, + "train_runtime": 17331.5833, + "train_tokens_per_second": 100204.578 + }, + { + "epoch": 0.266, + "grad_norm": 0.1406719982624054, + "learning_rate": 0.00025317028984264087, + "loss": 0.3099, + "num_input_tokens_seen": 1743257600, + "step": 26600, + "train_runtime": 17395.5945, + "train_tokens_per_second": 100212.591 + }, + { + "epoch": 0.267, + "grad_norm": 0.1677832007408142, + "learning_rate": 0.0002528242441254448, + "loss": 0.309, + "num_input_tokens_seen": 1749811200, + "step": 26700, + "train_runtime": 17459.1185, + "train_tokens_per_second": 100223.342 + }, + { + "epoch": 0.268, + "grad_norm": 0.13640043139457703, + "learning_rate": 0.000252477162969252, + "loss": 0.3112, + "num_input_tokens_seen": 1756364800, + "step": 26800, + "train_runtime": 17523.2088, + "train_tokens_per_second": 100230.775 + }, + { + "epoch": 0.269, + "grad_norm": 0.12981313467025757, + "learning_rate": 0.00025212904986916584, + "loss": 0.3124, + "num_input_tokens_seen": 1762918400, + "step": 26900, + "train_runtime": 17587.6922, + "train_tokens_per_second": 100235.914 + }, + { + "epoch": 0.27, + "grad_norm": 0.14338868856430054, + "learning_rate": 0.00025177990833068133, + "loss": 0.3124, + "num_input_tokens_seen": 1769472000, + "step": 27000, + "train_runtime": 17658.758, + "train_tokens_per_second": 100203.649 + }, + { + "epoch": 0.271, + "grad_norm": 0.17518877983093262, + "learning_rate": 0.0002514297418696499, + "loss": 0.3076, + "num_input_tokens_seen": 1776025600, + "step": 27100, + "train_runtime": 17723.3886, + "train_tokens_per_second": 100208.016 + }, + { + "epoch": 0.272, + "grad_norm": 0.1369880735874176, + "learning_rate": 0.0002510785540122439, + "loss": 0.3114, + "num_input_tokens_seen": 1782579200, + "step": 27200, + "train_runtime": 17786.611, + "train_tokens_per_second": 100220.283 + }, + { + "epoch": 0.273, + "grad_norm": 0.15111377835273743, + "learning_rate": 0.0002507263482949212, + "loss": 0.3144, + "num_input_tokens_seen": 1789132800, + "step": 27300, + "train_runtime": 17852.1418, + "train_tokens_per_second": 100219.504 + }, + { + "epoch": 0.274, + "grad_norm": 0.140447199344635, + "learning_rate": 0.0002503731282643894, + "loss": 0.3103, + "num_input_tokens_seen": 1795686400, + "step": 27400, + "train_runtime": 17917.1236, + "train_tokens_per_second": 100221.801 + }, + { + "epoch": 0.275, + "grad_norm": 0.1373315006494522, + "learning_rate": 0.0002500188974775704, + "loss": 0.3095, + "num_input_tokens_seen": 1802240000, + "step": 27500, + "train_runtime": 17981.4799, + "train_tokens_per_second": 100227.568 + }, + { + "epoch": 0.276, + "grad_norm": 0.1453147530555725, + "learning_rate": 0.00024966365950156416, + "loss": 0.3085, + "num_input_tokens_seen": 1808793600, + "step": 27600, + "train_runtime": 18052.109, + "train_tokens_per_second": 100198.464 + }, + { + "epoch": 0.277, + "grad_norm": 0.19097484648227692, + "learning_rate": 0.00024930741791361326, + "loss": 0.3128, + "num_input_tokens_seen": 1815347200, + "step": 27700, + "train_runtime": 18117.9773, + "train_tokens_per_second": 100195.909 + }, + { + "epoch": 0.278, + "grad_norm": 0.2222718745470047, + "learning_rate": 0.0002489501763010664, + "loss": 0.3107, + "num_input_tokens_seen": 1821900800, + "step": 27800, + "train_runtime": 18178.1946, + "train_tokens_per_second": 100224.519 + }, + { + "epoch": 0.279, + "grad_norm": 0.16960225999355316, + "learning_rate": 0.00024859193826134285, + "loss": 0.3093, + "num_input_tokens_seen": 1828454400, + "step": 27900, + "train_runtime": 18248.1866, + "train_tokens_per_second": 100199.238 + }, + { + "epoch": 0.28, + "grad_norm": 0.15540289878845215, + "learning_rate": 0.00024823270740189556, + "loss": 0.3084, + "num_input_tokens_seen": 1835008000, + "step": 28000, + "train_runtime": 18313.0722, + "train_tokens_per_second": 100202.084 + }, + { + "epoch": 0.281, + "grad_norm": 0.1421203911304474, + "learning_rate": 0.00024787248734017527, + "loss": 0.3119, + "num_input_tokens_seen": 1841561600, + "step": 28100, + "train_runtime": 18377.039, + "train_tokens_per_second": 100209.919 + }, + { + "epoch": 0.282, + "grad_norm": 0.131204292178154, + "learning_rate": 0.0002475112817035941, + "loss": 0.3127, + "num_input_tokens_seen": 1848115200, + "step": 28200, + "train_runtime": 18441.4656, + "train_tokens_per_second": 100215.202 + }, + { + "epoch": 0.283, + "grad_norm": 0.1507508009672165, + "learning_rate": 0.0002471490941294887, + "loss": 0.3118, + "num_input_tokens_seen": 1854668800, + "step": 28300, + "train_runtime": 18511.3095, + "train_tokens_per_second": 100191.118 + }, + { + "epoch": 0.284, + "grad_norm": 0.12522923946380615, + "learning_rate": 0.000246785928265084, + "loss": 0.3104, + "num_input_tokens_seen": 1861222400, + "step": 28400, + "train_runtime": 18574.4697, + "train_tokens_per_second": 100203.259 + }, + { + "epoch": 0.285, + "grad_norm": 0.2087126076221466, + "learning_rate": 0.0002464217877674562, + "loss": 0.3132, + "num_input_tokens_seen": 1867776000, + "step": 28500, + "train_runtime": 18638.8332, + "train_tokens_per_second": 100208.848 + }, + { + "epoch": 0.286, + "grad_norm": 0.1495303064584732, + "learning_rate": 0.0002460566763034961, + "loss": 0.3159, + "num_input_tokens_seen": 1874329600, + "step": 28600, + "train_runtime": 18703.8924, + "train_tokens_per_second": 100210.671 + }, + { + "epoch": 0.287, + "grad_norm": 0.14563380181789398, + "learning_rate": 0.00024569059754987196, + "loss": 0.3116, + "num_input_tokens_seen": 1880883200, + "step": 28700, + "train_runtime": 18774.7813, + "train_tokens_per_second": 100181.364 + }, + { + "epoch": 0.288, + "grad_norm": 0.12803615629673004, + "learning_rate": 0.00024532355519299296, + "loss": 0.3099, + "num_input_tokens_seen": 1887436800, + "step": 28800, + "train_runtime": 18838.435, + "train_tokens_per_second": 100190.743 + }, + { + "epoch": 0.289, + "grad_norm": 0.5618897676467896, + "learning_rate": 0.0002449555529289714, + "loss": 0.3129, + "num_input_tokens_seen": 1893990400, + "step": 28900, + "train_runtime": 18901.8999, + "train_tokens_per_second": 100201.06 + }, + { + "epoch": 0.29, + "grad_norm": 0.15488959848880768, + "learning_rate": 0.0002445865944635861, + "loss": 0.3155, + "num_input_tokens_seen": 1900544000, + "step": 29000, + "train_runtime": 18967.9894, + "train_tokens_per_second": 100197.441 + }, + { + "epoch": 0.291, + "grad_norm": 0.13676992058753967, + "learning_rate": 0.0002442166835122446, + "loss": 0.3101, + "num_input_tokens_seen": 1907097600, + "step": 29100, + "train_runtime": 19031.1664, + "train_tokens_per_second": 100209.181 + }, + { + "epoch": 0.292, + "grad_norm": 0.11402736604213715, + "learning_rate": 0.00024384582379994614, + "loss": 0.3094, + "num_input_tokens_seen": 1913651200, + "step": 29200, + "train_runtime": 19096.1775, + "train_tokens_per_second": 100211.218 + }, + { + "epoch": 0.293, + "grad_norm": 0.1358448714017868, + "learning_rate": 0.00024347401906124388, + "loss": 0.309, + "num_input_tokens_seen": 1920204800, + "step": 29300, + "train_runtime": 19165.3098, + "train_tokens_per_second": 100191.691 + }, + { + "epoch": 0.294, + "grad_norm": 0.14608891308307648, + "learning_rate": 0.0002431012730402075, + "loss": 0.3119, + "num_input_tokens_seen": 1926758400, + "step": 29400, + "train_runtime": 19230.3069, + "train_tokens_per_second": 100193.845 + }, + { + "epoch": 0.295, + "grad_norm": 0.1501711755990982, + "learning_rate": 0.00024272758949038517, + "loss": 0.3091, + "num_input_tokens_seen": 1933312000, + "step": 29500, + "train_runtime": 19294.7627, + "train_tokens_per_second": 100198.796 + }, + { + "epoch": 0.296, + "grad_norm": 0.1614496409893036, + "learning_rate": 0.00024235297217476616, + "loss": 0.3104, + "num_input_tokens_seen": 1939865600, + "step": 29600, + "train_runtime": 19364.7415, + "train_tokens_per_second": 100175.135 + }, + { + "epoch": 0.297, + "grad_norm": 0.11902807652950287, + "learning_rate": 0.00024197742486574268, + "loss": 0.3126, + "num_input_tokens_seen": 1946419200, + "step": 29700, + "train_runtime": 19429.1038, + "train_tokens_per_second": 100180.596 + }, + { + "epoch": 0.298, + "grad_norm": 0.12998123466968536, + "learning_rate": 0.0002416009513450719, + "loss": 0.3102, + "num_input_tokens_seen": 1952972800, + "step": 29800, + "train_runtime": 19494.2244, + "train_tokens_per_second": 100182.124 + }, + { + "epoch": 0.299, + "grad_norm": 0.2079559862613678, + "learning_rate": 0.00024122355540383806, + "loss": 0.311, + "num_input_tokens_seen": 1959526400, + "step": 29900, + "train_runtime": 19559.2072, + "train_tokens_per_second": 100184.347 + }, + { + "epoch": 0.3, + "grad_norm": 0.15128397941589355, + "learning_rate": 0.00024084524084241405, + "loss": 0.3076, + "num_input_tokens_seen": 1966080000, + "step": 30000, + "train_runtime": 19623.3669, + "train_tokens_per_second": 100190.758 + }, + { + "epoch": 0.301, + "grad_norm": 0.13512304425239563, + "learning_rate": 0.00024046601147042332, + "loss": 0.3119, + "num_input_tokens_seen": 1972633600, + "step": 30100, + "train_runtime": 19688.91, + "train_tokens_per_second": 100190.086 + }, + { + "epoch": 0.302, + "grad_norm": 0.12716713547706604, + "learning_rate": 0.0002400858711067015, + "loss": 0.3093, + "num_input_tokens_seen": 1979187200, + "step": 30200, + "train_runtime": 19753.5863, + "train_tokens_per_second": 100193.816 + }, + { + "epoch": 0.303, + "grad_norm": 0.1301889717578888, + "learning_rate": 0.00023970482357925772, + "loss": 0.31, + "num_input_tokens_seen": 1985740800, + "step": 30300, + "train_runtime": 19823.6081, + "train_tokens_per_second": 100170.503 + }, + { + "epoch": 0.304, + "grad_norm": 0.13871292769908905, + "learning_rate": 0.00023932287272523646, + "loss": 0.3084, + "num_input_tokens_seen": 1992294400, + "step": 30400, + "train_runtime": 19887.7656, + "train_tokens_per_second": 100176.885 + }, + { + "epoch": 0.305, + "grad_norm": 0.12449346482753754, + "learning_rate": 0.00023894002239087847, + "loss": 0.3276, + "num_input_tokens_seen": 1998848000, + "step": 30500, + "train_runtime": 19952.5714, + "train_tokens_per_second": 100179.97 + }, + { + "epoch": 0.306, + "grad_norm": 0.1523977369070053, + "learning_rate": 0.0002385562764314825, + "loss": 0.3097, + "num_input_tokens_seen": 2005401600, + "step": 30600, + "train_runtime": 20017.8352, + "train_tokens_per_second": 100180.743 + }, + { + "epoch": 0.307, + "grad_norm": 0.1439458280801773, + "learning_rate": 0.00023817163871136596, + "loss": 0.3048, + "num_input_tokens_seen": 2011955200, + "step": 30700, + "train_runtime": 20081.8889, + "train_tokens_per_second": 100187.548 + }, + { + "epoch": 0.308, + "grad_norm": 0.12756380438804626, + "learning_rate": 0.00023778611310382652, + "loss": 0.3075, + "num_input_tokens_seen": 2018508800, + "step": 30800, + "train_runtime": 20145.6107, + "train_tokens_per_second": 100195.96 + }, + { + "epoch": 0.309, + "grad_norm": 0.14607320725917816, + "learning_rate": 0.0002373997034911027, + "loss": 0.3139, + "num_input_tokens_seen": 2025062400, + "step": 30900, + "train_runtime": 20210.9796, + "train_tokens_per_second": 100196.153 + }, + { + "epoch": 0.31, + "grad_norm": 0.12456675618886948, + "learning_rate": 0.00023701241376433506, + "loss": 0.3089, + "num_input_tokens_seen": 2031616000, + "step": 31000, + "train_runtime": 20281.0675, + "train_tokens_per_second": 100173.031 + }, + { + "epoch": 0.311, + "grad_norm": 0.13834626972675323, + "learning_rate": 0.0002366242478235268, + "loss": 0.3066, + "num_input_tokens_seen": 2038169600, + "step": 31100, + "train_runtime": 20346.0263, + "train_tokens_per_second": 100175.315 + }, + { + "epoch": 0.312, + "grad_norm": 0.1534184068441391, + "learning_rate": 0.00023623520957750471, + "loss": 0.3082, + "num_input_tokens_seen": 2044723200, + "step": 31200, + "train_runtime": 20409.76, + "train_tokens_per_second": 100183.598 + }, + { + "epoch": 0.313, + "grad_norm": 0.12966671586036682, + "learning_rate": 0.00023584530294387953, + "loss": 0.3126, + "num_input_tokens_seen": 2051276800, + "step": 31300, + "train_runtime": 20475.6348, + "train_tokens_per_second": 100181.353 + }, + { + "epoch": 0.314, + "grad_norm": 0.14474999904632568, + "learning_rate": 0.00023545453184900682, + "loss": 0.3091, + "num_input_tokens_seen": 2057830400, + "step": 31400, + "train_runtime": 20539.196, + "train_tokens_per_second": 100190.407 + }, + { + "epoch": 0.315, + "grad_norm": 0.13208946585655212, + "learning_rate": 0.00023506290022794706, + "loss": 0.3095, + "num_input_tokens_seen": 2064384000, + "step": 31500, + "train_runtime": 20604.221, + "train_tokens_per_second": 100192.286 + }, + { + "epoch": 0.316, + "grad_norm": 0.15090374648571014, + "learning_rate": 0.00023467041202442643, + "loss": 0.3073, + "num_input_tokens_seen": 2070937600, + "step": 31600, + "train_runtime": 20674.5759, + "train_tokens_per_second": 100168.323 + }, + { + "epoch": 0.317, + "grad_norm": 0.18638543784618378, + "learning_rate": 0.00023427707119079669, + "loss": 0.312, + "num_input_tokens_seen": 2077491200, + "step": 31700, + "train_runtime": 20738.8671, + "train_tokens_per_second": 100173.804 + }, + { + "epoch": 0.318, + "grad_norm": 0.1385478377342224, + "learning_rate": 0.0002338828816879957, + "loss": 0.3095, + "num_input_tokens_seen": 2084044800, + "step": 31800, + "train_runtime": 20802.7906, + "train_tokens_per_second": 100181.021 + }, + { + "epoch": 0.319, + "grad_norm": 0.15265443921089172, + "learning_rate": 0.00023348784748550744, + "loss": 0.3103, + "num_input_tokens_seen": 2090598400, + "step": 31900, + "train_runtime": 20868.0311, + "train_tokens_per_second": 100181.871 + }, + { + "epoch": 0.32, + "grad_norm": 0.15918248891830444, + "learning_rate": 0.00023309197256132184, + "loss": 0.3102, + "num_input_tokens_seen": 2097152000, + "step": 32000, + "train_runtime": 20937.8931, + "train_tokens_per_second": 100160.603 + }, + { + "epoch": 0.321, + "grad_norm": 0.14801020920276642, + "learning_rate": 0.00023269526090189505, + "loss": 0.3147, + "num_input_tokens_seen": 2103705600, + "step": 32100, + "train_runtime": 21002.9142, + "train_tokens_per_second": 100162.557 + }, + { + "epoch": 0.322, + "grad_norm": 0.18616679310798645, + "learning_rate": 0.00023229771650210907, + "loss": 0.3099, + "num_input_tokens_seen": 2110259200, + "step": 32200, + "train_runtime": 21067.872, + "train_tokens_per_second": 100164.801 + }, + { + "epoch": 0.323, + "grad_norm": 0.13931268453598022, + "learning_rate": 0.00023189934336523163, + "loss": 0.3115, + "num_input_tokens_seen": 2116812800, + "step": 32300, + "train_runtime": 21131.2256, + "train_tokens_per_second": 100174.634 + }, + { + "epoch": 0.324, + "grad_norm": 0.1734631061553955, + "learning_rate": 0.00023150014550287574, + "loss": 0.3112, + "num_input_tokens_seen": 2123366400, + "step": 32400, + "train_runtime": 21201.6285, + "train_tokens_per_second": 100151.099 + }, + { + "epoch": 0.325, + "grad_norm": 0.13876596093177795, + "learning_rate": 0.00023110012693495943, + "loss": 0.31, + "num_input_tokens_seen": 2129920000, + "step": 32500, + "train_runtime": 21265.8205, + "train_tokens_per_second": 100156.963 + }, + { + "epoch": 0.326, + "grad_norm": 0.20441171526908875, + "learning_rate": 0.00023069929168966527, + "loss": 0.3095, + "num_input_tokens_seen": 2136473600, + "step": 32600, + "train_runtime": 21329.6315, + "train_tokens_per_second": 100164.581 + }, + { + "epoch": 0.327, + "grad_norm": 0.12022672593593597, + "learning_rate": 0.0002302976438033997, + "loss": 0.3089, + "num_input_tokens_seen": 2143027200, + "step": 32700, + "train_runtime": 21394.0086, + "train_tokens_per_second": 100169.502 + }, + { + "epoch": 0.328, + "grad_norm": 0.23158074915409088, + "learning_rate": 0.0002298951873207525, + "loss": 0.3121, + "num_input_tokens_seen": 2149580800, + "step": 32800, + "train_runtime": 21459.8938, + "train_tokens_per_second": 100167.355 + }, + { + "epoch": 0.329, + "grad_norm": 0.11978685855865479, + "learning_rate": 0.00022949192629445606, + "loss": 0.308, + "num_input_tokens_seen": 2156134400, + "step": 32900, + "train_runtime": 21524.2825, + "train_tokens_per_second": 100172.185 + }, + { + "epoch": 0.33, + "grad_norm": 0.16882842779159546, + "learning_rate": 0.0002290878647853443, + "loss": 0.3076, + "num_input_tokens_seen": 2162688000, + "step": 33000, + "train_runtime": 21595.0222, + "train_tokens_per_second": 100147.524 + }, + { + "epoch": 0.331, + "grad_norm": 0.1368299126625061, + "learning_rate": 0.00022868300686231224, + "loss": 0.3078, + "num_input_tokens_seen": 2169241600, + "step": 33100, + "train_runtime": 21659.0361, + "train_tokens_per_second": 100154.115 + }, + { + "epoch": 0.332, + "grad_norm": 0.13301041722297668, + "learning_rate": 0.00022827735660227457, + "loss": 0.3103, + "num_input_tokens_seen": 2175795200, + "step": 33200, + "train_runtime": 21723.8934, + "train_tokens_per_second": 100156.779 + }, + { + "epoch": 0.333, + "grad_norm": 0.13545189797878265, + "learning_rate": 0.000227870918090125, + "loss": 0.3068, + "num_input_tokens_seen": 2182348800, + "step": 33300, + "train_runtime": 21788.4359, + "train_tokens_per_second": 100160.875 + }, + { + "epoch": 0.334, + "grad_norm": 0.2138141542673111, + "learning_rate": 0.00022746369541869476, + "loss": 0.3059, + "num_input_tokens_seen": 2188902400, + "step": 33400, + "train_runtime": 21853.4857, + "train_tokens_per_second": 100162.621 + }, + { + "epoch": 0.335, + "grad_norm": 0.1255991905927658, + "learning_rate": 0.00022705569268871163, + "loss": 0.3099, + "num_input_tokens_seen": 2195456000, + "step": 33500, + "train_runtime": 21918.1728, + "train_tokens_per_second": 100166.014 + }, + { + "epoch": 0.336, + "grad_norm": 0.1330287754535675, + "learning_rate": 0.00022664691400875865, + "loss": 0.3093, + "num_input_tokens_seen": 2202009600, + "step": 33600, + "train_runtime": 21987.6743, + "train_tokens_per_second": 100147.454 + }, + { + "epoch": 0.337, + "grad_norm": 0.1321260631084442, + "learning_rate": 0.00022623736349523254, + "loss": 0.3109, + "num_input_tokens_seen": 2208563200, + "step": 33700, + "train_runtime": 22052.5483, + "train_tokens_per_second": 100150.022 + }, + { + "epoch": 0.338, + "grad_norm": 0.13865865767002106, + "learning_rate": 0.00022582704527230238, + "loss": 0.3068, + "num_input_tokens_seen": 2215116800, + "step": 33800, + "train_runtime": 22117.0958, + "train_tokens_per_second": 100154.054 + }, + { + "epoch": 0.339, + "grad_norm": 0.13597998023033142, + "learning_rate": 0.0002254159634718682, + "loss": 0.3061, + "num_input_tokens_seen": 2221670400, + "step": 33900, + "train_runtime": 22180.0605, + "train_tokens_per_second": 100165.209 + }, + { + "epoch": 0.34, + "grad_norm": 0.14176584780216217, + "learning_rate": 0.00022500412223351915, + "loss": 0.3114, + "num_input_tokens_seen": 2228224000, + "step": 34000, + "train_runtime": 22251.2759, + "train_tokens_per_second": 100139.157 + }, + { + "epoch": 0.341, + "grad_norm": 0.13006241619586945, + "learning_rate": 0.0002245915257044919, + "loss": 0.3071, + "num_input_tokens_seen": 2234777600, + "step": 34100, + "train_runtime": 22315.7056, + "train_tokens_per_second": 100143.712 + }, + { + "epoch": 0.342, + "grad_norm": 0.186634823679924, + "learning_rate": 0.00022417817803962892, + "loss": 0.3032, + "num_input_tokens_seen": 2241331200, + "step": 34200, + "train_runtime": 22380.1064, + "train_tokens_per_second": 100148.371 + }, + { + "epoch": 0.343, + "grad_norm": 0.1767393946647644, + "learning_rate": 0.0002237640834013366, + "loss": 0.3085, + "num_input_tokens_seen": 2247884800, + "step": 34300, + "train_runtime": 22444.6012, + "train_tokens_per_second": 100152.584 + }, + { + "epoch": 0.344, + "grad_norm": 0.15075454115867615, + "learning_rate": 0.0002233492459595434, + "loss": 0.3099, + "num_input_tokens_seen": 2254438400, + "step": 34400, + "train_runtime": 22509.6493, + "train_tokens_per_second": 100154.31 + }, + { + "epoch": 0.345, + "grad_norm": 0.15754783153533936, + "learning_rate": 0.00022293366989165772, + "loss": 0.307, + "num_input_tokens_seen": 2260992000, + "step": 34500, + "train_runtime": 22579.4848, + "train_tokens_per_second": 100134.791 + }, + { + "epoch": 0.346, + "grad_norm": 0.13372038304805756, + "learning_rate": 0.00022251735938252587, + "loss": 0.3066, + "num_input_tokens_seen": 2267545600, + "step": 34600, + "train_runtime": 22643.953, + "train_tokens_per_second": 100139.123 + }, + { + "epoch": 0.347, + "grad_norm": 0.17753738164901733, + "learning_rate": 0.0002221003186243902, + "loss": 0.3087, + "num_input_tokens_seen": 2274099200, + "step": 34700, + "train_runtime": 22708.6869, + "train_tokens_per_second": 100142.259 + }, + { + "epoch": 0.348, + "grad_norm": 0.1375788450241089, + "learning_rate": 0.00022168255181684643, + "loss": 0.3064, + "num_input_tokens_seen": 2280652800, + "step": 34800, + "train_runtime": 22774.2018, + "train_tokens_per_second": 100141.942 + }, + { + "epoch": 0.349, + "grad_norm": 0.14929898083209991, + "learning_rate": 0.00022126406316680172, + "loss": 0.3108, + "num_input_tokens_seen": 2287206400, + "step": 34900, + "train_runtime": 22839.776, + "train_tokens_per_second": 100141.367 + }, + { + "epoch": 0.35, + "grad_norm": 0.15789327025413513, + "learning_rate": 0.00022084485688843208, + "loss": 0.3082, + "num_input_tokens_seen": 2293760000, + "step": 35000, + "train_runtime": 22904.3853, + "train_tokens_per_second": 100145.015 + }, + { + "epoch": 0.351, + "grad_norm": 0.1339723765850067, + "learning_rate": 0.00022042493720314003, + "loss": 0.3127, + "num_input_tokens_seen": 2300313600, + "step": 35100, + "train_runtime": 22968.8594, + "train_tokens_per_second": 100149.231 + }, + { + "epoch": 0.352, + "grad_norm": 0.14159700274467468, + "learning_rate": 0.00022000430833951228, + "loss": 0.3096, + "num_input_tokens_seen": 2306867200, + "step": 35200, + "train_runtime": 23033.0283, + "train_tokens_per_second": 100154.751 + }, + { + "epoch": 0.353, + "grad_norm": 0.17289403080940247, + "learning_rate": 0.00021958297453327673, + "loss": 0.3058, + "num_input_tokens_seen": 2313420800, + "step": 35300, + "train_runtime": 23103.5037, + "train_tokens_per_second": 100132.899 + }, + { + "epoch": 0.354, + "grad_norm": 0.1353076845407486, + "learning_rate": 0.00021916094002726012, + "loss": 0.3048, + "num_input_tokens_seen": 2319974400, + "step": 35400, + "train_runtime": 23166.8292, + "train_tokens_per_second": 100142.077 + }, + { + "epoch": 0.355, + "grad_norm": 0.12303294241428375, + "learning_rate": 0.00021873820907134534, + "loss": 0.3102, + "num_input_tokens_seen": 2326528000, + "step": 35500, + "train_runtime": 23232.6655, + "train_tokens_per_second": 100140.382 + }, + { + "epoch": 0.356, + "grad_norm": 0.14765286445617676, + "learning_rate": 0.0002183147859224283, + "loss": 0.3106, + "num_input_tokens_seen": 2333081600, + "step": 35600, + "train_runtime": 23296.4196, + "train_tokens_per_second": 100147.647 + }, + { + "epoch": 0.357, + "grad_norm": 0.13833215832710266, + "learning_rate": 0.00021789067484437544, + "loss": 0.3055, + "num_input_tokens_seen": 2339635200, + "step": 35700, + "train_runtime": 23361.5704, + "train_tokens_per_second": 100148.884 + }, + { + "epoch": 0.358, + "grad_norm": 0.13157132267951965, + "learning_rate": 0.00021746588010798068, + "loss": 0.3081, + "num_input_tokens_seen": 2346188800, + "step": 35800, + "train_runtime": 23430.7927, + "train_tokens_per_second": 100132.711 + }, + { + "epoch": 0.359, + "grad_norm": 0.12913836538791656, + "learning_rate": 0.00021704040599092216, + "loss": 0.3094, + "num_input_tokens_seen": 2352742400, + "step": 35900, + "train_runtime": 23495.4052, + "train_tokens_per_second": 100136.277 + }, + { + "epoch": 0.36, + "grad_norm": 0.13528013229370117, + "learning_rate": 0.00021661425677771965, + "loss": 0.3061, + "num_input_tokens_seen": 2359296000, + "step": 36000, + "train_runtime": 23559.8424, + "train_tokens_per_second": 100140.568 + }, + { + "epoch": 0.361, + "grad_norm": 0.15519119799137115, + "learning_rate": 0.00021618743675969095, + "loss": 0.3065, + "num_input_tokens_seen": 2365849600, + "step": 36100, + "train_runtime": 23624.7603, + "train_tokens_per_second": 100142.798 + }, + { + "epoch": 0.362, + "grad_norm": 0.14744772017002106, + "learning_rate": 0.0002157599502349089, + "loss": 0.3068, + "num_input_tokens_seen": 2372403200, + "step": 36200, + "train_runtime": 23688.8845, + "train_tokens_per_second": 100148.371 + }, + { + "epoch": 0.363, + "grad_norm": 0.13838911056518555, + "learning_rate": 0.00021533180150815802, + "loss": 0.3097, + "num_input_tokens_seen": 2378956800, + "step": 36300, + "train_runtime": 23759.9908, + "train_tokens_per_second": 100124.483 + }, + { + "epoch": 0.364, + "grad_norm": 0.12536117434501648, + "learning_rate": 0.00021490299489089132, + "loss": 0.3067, + "num_input_tokens_seen": 2385510400, + "step": 36400, + "train_runtime": 23823.7123, + "train_tokens_per_second": 100131.767 + }, + { + "epoch": 0.365, + "grad_norm": 0.14205192029476166, + "learning_rate": 0.00021447353470118656, + "loss": 0.3049, + "num_input_tokens_seen": 2392064000, + "step": 36500, + "train_runtime": 23887.5453, + "train_tokens_per_second": 100138.544 + }, + { + "epoch": 0.366, + "grad_norm": 0.11950815469026566, + "learning_rate": 0.00021404342526370326, + "loss": 0.3072, + "num_input_tokens_seen": 2398617600, + "step": 36600, + "train_runtime": 23951.3108, + "train_tokens_per_second": 100145.567 + }, + { + "epoch": 0.367, + "grad_norm": 0.1286599189043045, + "learning_rate": 0.00021361267090963846, + "loss": 0.3096, + "num_input_tokens_seen": 2405171200, + "step": 36700, + "train_runtime": 24016.5354, + "train_tokens_per_second": 100146.468 + }, + { + "epoch": 0.368, + "grad_norm": 0.12663663923740387, + "learning_rate": 0.0002131812759766839, + "loss": 0.3054, + "num_input_tokens_seen": 2411724800, + "step": 36800, + "train_runtime": 24085.8974, + "train_tokens_per_second": 100130.162 + }, + { + "epoch": 0.369, + "grad_norm": 0.16495896875858307, + "learning_rate": 0.00021274924480898169, + "loss": 0.3037, + "num_input_tokens_seen": 2418278400, + "step": 36900, + "train_runtime": 24149.4634, + "train_tokens_per_second": 100137.977 + }, + { + "epoch": 0.37, + "grad_norm": 0.13351881504058838, + "learning_rate": 0.00021231658175708087, + "loss": 0.309, + "num_input_tokens_seen": 2424832000, + "step": 37000, + "train_runtime": 24214.3635, + "train_tokens_per_second": 100140.233 + }, + { + "epoch": 0.371, + "grad_norm": 0.13137440383434296, + "learning_rate": 0.00021188329117789357, + "loss": 0.3061, + "num_input_tokens_seen": 2431385600, + "step": 37100, + "train_runtime": 24284.8537, + "train_tokens_per_second": 100119.426 + }, + { + "epoch": 0.372, + "grad_norm": 0.17069390416145325, + "learning_rate": 0.0002114493774346512, + "loss": 0.3075, + "num_input_tokens_seen": 2437939200, + "step": 37200, + "train_runtime": 24349.7441, + "train_tokens_per_second": 100121.759 + }, + { + "epoch": 0.373, + "grad_norm": 0.13554754853248596, + "learning_rate": 0.00021101484489686025, + "loss": 0.3056, + "num_input_tokens_seen": 2444492800, + "step": 37300, + "train_runtime": 24413.4106, + "train_tokens_per_second": 100129.099 + }, + { + "epoch": 0.374, + "grad_norm": 0.24161159992218018, + "learning_rate": 0.00021057969794025866, + "loss": 0.3084, + "num_input_tokens_seen": 2451046400, + "step": 37400, + "train_runtime": 24479.2787, + "train_tokens_per_second": 100127.395 + }, + { + "epoch": 0.375, + "grad_norm": 0.11480960994958878, + "learning_rate": 0.00021014394094677128, + "loss": 0.3065, + "num_input_tokens_seen": 2457600000, + "step": 37500, + "train_runtime": 24543.1085, + "train_tokens_per_second": 100134.015 + }, + { + "epoch": 0.376, + "grad_norm": 0.1333978921175003, + "learning_rate": 0.00020970757830446633, + "loss": 0.3047, + "num_input_tokens_seen": 2464153600, + "step": 37600, + "train_runtime": 24612.4036, + "train_tokens_per_second": 100118.365 + }, + { + "epoch": 0.377, + "grad_norm": 0.1306515485048294, + "learning_rate": 0.00020927061440751072, + "loss": 0.3039, + "num_input_tokens_seen": 2470707200, + "step": 37700, + "train_runtime": 24676.7406, + "train_tokens_per_second": 100122.915 + }, + { + "epoch": 0.378, + "grad_norm": 0.19177651405334473, + "learning_rate": 0.00020883305365612602, + "loss": 0.3091, + "num_input_tokens_seen": 2477260800, + "step": 37800, + "train_runtime": 24742.4612, + "train_tokens_per_second": 100121.842 + }, + { + "epoch": 0.379, + "grad_norm": 0.14794479310512543, + "learning_rate": 0.00020839490045654425, + "loss": 0.3103, + "num_input_tokens_seen": 2483814400, + "step": 37900, + "train_runtime": 24807.833, + "train_tokens_per_second": 100122.183 + }, + { + "epoch": 0.38, + "grad_norm": 0.1391579508781433, + "learning_rate": 0.00020795615922096313, + "loss": 0.305, + "num_input_tokens_seen": 2490368000, + "step": 38000, + "train_runtime": 24871.0815, + "train_tokens_per_second": 100131.07 + }, + { + "epoch": 0.381, + "grad_norm": 0.14466038346290588, + "learning_rate": 0.00020751683436750207, + "loss": 0.3066, + "num_input_tokens_seen": 2496921600, + "step": 38100, + "train_runtime": 24941.5584, + "train_tokens_per_second": 100110.89 + }, + { + "epoch": 0.382, + "grad_norm": 0.14706650376319885, + "learning_rate": 0.00020707693032015752, + "loss": 0.3131, + "num_input_tokens_seen": 2503475200, + "step": 38200, + "train_runtime": 25006.658, + "train_tokens_per_second": 100112.346 + }, + { + "epoch": 0.383, + "grad_norm": 0.1455349326133728, + "learning_rate": 0.00020663645150875834, + "loss": 0.3058, + "num_input_tokens_seen": 2510028800, + "step": 38300, + "train_runtime": 25070.3473, + "train_tokens_per_second": 100119.427 + }, + { + "epoch": 0.384, + "grad_norm": 0.13858123123645782, + "learning_rate": 0.00020619540236892125, + "loss": 0.3066, + "num_input_tokens_seen": 2516582400, + "step": 38400, + "train_runtime": 25135.6982, + "train_tokens_per_second": 100119.853 + }, + { + "epoch": 0.385, + "grad_norm": 0.17408473789691925, + "learning_rate": 0.00020575378734200616, + "loss": 0.3068, + "num_input_tokens_seen": 2523136000, + "step": 38500, + "train_runtime": 25206.1351, + "train_tokens_per_second": 100100.075 + }, + { + "epoch": 0.386, + "grad_norm": 0.12729153037071228, + "learning_rate": 0.0002053116108750715, + "loss": 0.3062, + "num_input_tokens_seen": 2529689600, + "step": 38600, + "train_runtime": 25270.823, + "train_tokens_per_second": 100103.174 + }, + { + "epoch": 0.387, + "grad_norm": 0.15452224016189575, + "learning_rate": 0.0002048688774208294, + "loss": 0.3029, + "num_input_tokens_seen": 2536243200, + "step": 38700, + "train_runtime": 25334.6018, + "train_tokens_per_second": 100109.851 + }, + { + "epoch": 0.388, + "grad_norm": 0.11749983578920364, + "learning_rate": 0.0002044255914376009, + "loss": 0.3055, + "num_input_tokens_seen": 2542796800, + "step": 38800, + "train_runtime": 25398.9456, + "train_tokens_per_second": 100114.266 + }, + { + "epoch": 0.389, + "grad_norm": 0.12558670341968536, + "learning_rate": 0.00020398175738927082, + "loss": 0.307, + "num_input_tokens_seen": 2549350400, + "step": 38900, + "train_runtime": 25469.3443, + "train_tokens_per_second": 100094.858 + }, + { + "epoch": 0.39, + "grad_norm": 0.11652723699808121, + "learning_rate": 0.00020353737974524312, + "loss": 0.3059, + "num_input_tokens_seen": 2555904000, + "step": 39000, + "train_runtime": 25534.1962, + "train_tokens_per_second": 100097.296 + }, + { + "epoch": 0.391, + "grad_norm": 0.14530417323112488, + "learning_rate": 0.00020309246298039584, + "loss": 0.3043, + "num_input_tokens_seen": 2562457600, + "step": 39100, + "train_runtime": 25597.7668, + "train_tokens_per_second": 100104.733 + }, + { + "epoch": 0.392, + "grad_norm": 0.2145591825246811, + "learning_rate": 0.0002026470115750357, + "loss": 0.3097, + "num_input_tokens_seen": 2569011200, + "step": 39200, + "train_runtime": 25662.2383, + "train_tokens_per_second": 100108.618 + }, + { + "epoch": 0.393, + "grad_norm": 0.13407446444034576, + "learning_rate": 0.0002022010300148535, + "loss": 0.3072, + "num_input_tokens_seen": 2575564800, + "step": 39300, + "train_runtime": 25726.7635, + "train_tokens_per_second": 100112.274 + }, + { + "epoch": 0.394, + "grad_norm": 0.20070548355579376, + "learning_rate": 0.0002017545227908786, + "loss": 0.3042, + "num_input_tokens_seen": 2582118400, + "step": 39400, + "train_runtime": 25798.3829, + "train_tokens_per_second": 100088.382 + }, + { + "epoch": 0.395, + "grad_norm": 0.12969562411308289, + "learning_rate": 0.00020130749439943376, + "loss": 0.3025, + "num_input_tokens_seen": 2588672000, + "step": 39500, + "train_runtime": 25861.9837, + "train_tokens_per_second": 100095.647 + }, + { + "epoch": 0.396, + "grad_norm": 0.22430787980556488, + "learning_rate": 0.00020085994934208998, + "loss": 0.3075, + "num_input_tokens_seen": 2595225600, + "step": 39600, + "train_runtime": 25927.1388, + "train_tokens_per_second": 100096.876 + }, + { + "epoch": 0.397, + "grad_norm": 0.1543964445590973, + "learning_rate": 0.00020041189212562094, + "loss": 0.3061, + "num_input_tokens_seen": 2601779200, + "step": 39700, + "train_runtime": 25990.8084, + "train_tokens_per_second": 100103.82 + }, + { + "epoch": 0.398, + "grad_norm": 0.17474599182605743, + "learning_rate": 0.0001999633272619579, + "loss": 0.3026, + "num_input_tokens_seen": 2608332800, + "step": 39800, + "train_runtime": 26055.1661, + "train_tokens_per_second": 100108.086 + }, + { + "epoch": 0.399, + "grad_norm": 0.12200487405061722, + "learning_rate": 0.00019951425926814404, + "loss": 0.3051, + "num_input_tokens_seen": 2614886400, + "step": 39900, + "train_runtime": 26125.5167, + "train_tokens_per_second": 100089.366 + }, + { + "epoch": 0.4, + "grad_norm": 0.12909364700317383, + "learning_rate": 0.00019906469266628904, + "loss": 0.3083, + "num_input_tokens_seen": 2621440000, + "step": 40000, + "train_runtime": 26189.9855, + "train_tokens_per_second": 100093.221 + }, + { + "epoch": 0.401, + "grad_norm": 0.14507311582565308, + "learning_rate": 0.0001986146319835236, + "loss": 0.3063, + "num_input_tokens_seen": 2627993600, + "step": 40100, + "train_runtime": 26254.1189, + "train_tokens_per_second": 100098.335 + }, + { + "epoch": 0.402, + "grad_norm": 0.15015749633312225, + "learning_rate": 0.00019816408175195383, + "loss": 0.3024, + "num_input_tokens_seen": 2634547200, + "step": 40200, + "train_runtime": 26317.4656, + "train_tokens_per_second": 100106.417 + }, + { + "epoch": 0.403, + "grad_norm": 0.1793050467967987, + "learning_rate": 0.0001977130465086155, + "loss": 0.3058, + "num_input_tokens_seen": 2641100800, + "step": 40300, + "train_runtime": 26387.6285, + "train_tokens_per_second": 100088.6 + }, + { + "epoch": 0.404, + "grad_norm": 0.13494957983493805, + "learning_rate": 0.0001972615307954286, + "loss": 0.3058, + "num_input_tokens_seen": 2647654400, + "step": 40400, + "train_runtime": 26452.3646, + "train_tokens_per_second": 100091.407 + }, + { + "epoch": 0.405, + "grad_norm": 0.15225248038768768, + "learning_rate": 0.00019680953915915124, + "loss": 0.3032, + "num_input_tokens_seen": 2654208000, + "step": 40500, + "train_runtime": 26516.6796, + "train_tokens_per_second": 100095.79 + }, + { + "epoch": 0.406, + "grad_norm": 0.15482735633850098, + "learning_rate": 0.00019635707615133427, + "loss": 0.3061, + "num_input_tokens_seen": 2660761600, + "step": 40600, + "train_runtime": 26585.3848, + "train_tokens_per_second": 100083.622 + }, + { + "epoch": 0.407, + "grad_norm": 0.15725013613700867, + "learning_rate": 0.00019590414632827513, + "loss": 0.3101, + "num_input_tokens_seen": 2667315200, + "step": 40700, + "train_runtime": 26649.9092, + "train_tokens_per_second": 100087.215 + }, + { + "epoch": 0.408, + "grad_norm": 0.16835036873817444, + "learning_rate": 0.00019545075425097204, + "loss": 0.3049, + "num_input_tokens_seen": 2673868800, + "step": 40800, + "train_runtime": 26714.9814, + "train_tokens_per_second": 100088.739 + }, + { + "epoch": 0.409, + "grad_norm": 0.167361319065094, + "learning_rate": 0.00019499690448507827, + "loss": 0.3027, + "num_input_tokens_seen": 2680422400, + "step": 40900, + "train_runtime": 26779.2716, + "train_tokens_per_second": 100093.178 + }, + { + "epoch": 0.41, + "grad_norm": 0.1781291663646698, + "learning_rate": 0.00019454260160085588, + "loss": 0.3005, + "num_input_tokens_seen": 2686976000, + "step": 41000, + "train_runtime": 26843.9197, + "train_tokens_per_second": 100096.261 + }, + { + "epoch": 0.411, + "grad_norm": 0.1289975345134735, + "learning_rate": 0.0001940878501731299, + "loss": 0.3085, + "num_input_tokens_seen": 2693529600, + "step": 41100, + "train_runtime": 26914.2047, + "train_tokens_per_second": 100078.365 + }, + { + "epoch": 0.412, + "grad_norm": 0.12804220616817474, + "learning_rate": 0.00019363265478124214, + "loss": 0.3062, + "num_input_tokens_seen": 2700083200, + "step": 41200, + "train_runtime": 26979.3069, + "train_tokens_per_second": 100079.784 + }, + { + "epoch": 0.413, + "grad_norm": 0.14838483929634094, + "learning_rate": 0.00019317702000900516, + "loss": 0.3065, + "num_input_tokens_seen": 2706636800, + "step": 41300, + "train_runtime": 27043.7101, + "train_tokens_per_second": 100083.783 + }, + { + "epoch": 0.414, + "grad_norm": 0.3049434423446655, + "learning_rate": 0.000192720950444656, + "loss": 0.3075, + "num_input_tokens_seen": 2713190400, + "step": 41400, + "train_runtime": 27108.2869, + "train_tokens_per_second": 100087.121 + }, + { + "epoch": 0.415, + "grad_norm": 0.16474822163581848, + "learning_rate": 0.00019226445068081018, + "loss": 0.3087, + "num_input_tokens_seen": 2719744000, + "step": 41500, + "train_runtime": 27173.4382, + "train_tokens_per_second": 100088.328 + }, + { + "epoch": 0.416, + "grad_norm": 0.18445253372192383, + "learning_rate": 0.00019180752531441523, + "loss": 0.3065, + "num_input_tokens_seen": 2726297600, + "step": 41600, + "train_runtime": 27237.7945, + "train_tokens_per_second": 100092.45 + }, + { + "epoch": 0.417, + "grad_norm": 0.1226682960987091, + "learning_rate": 0.00019135017894670456, + "loss": 0.3062, + "num_input_tokens_seen": 2732851200, + "step": 41700, + "train_runtime": 27307.5255, + "train_tokens_per_second": 100076.852 + }, + { + "epoch": 0.418, + "grad_norm": 0.12846247851848602, + "learning_rate": 0.0001908924161831509, + "loss": 0.3064, + "num_input_tokens_seen": 2739404800, + "step": 41800, + "train_runtime": 27371.4125, + "train_tokens_per_second": 100082.698 + }, + { + "epoch": 0.419, + "grad_norm": 0.14241133630275726, + "learning_rate": 0.0001904342416334203, + "loss": 0.3048, + "num_input_tokens_seen": 2745958400, + "step": 41900, + "train_runtime": 27436.5912, + "train_tokens_per_second": 100083.803 + }, + { + "epoch": 0.42, + "grad_norm": 0.19496770203113556, + "learning_rate": 0.00018997565991132532, + "loss": 0.3046, + "num_input_tokens_seen": 2752512000, + "step": 42000, + "train_runtime": 27500.5131, + "train_tokens_per_second": 100089.478 + }, + { + "epoch": 0.421, + "grad_norm": 0.16859756410121918, + "learning_rate": 0.0001895166756347789, + "loss": 0.3082, + "num_input_tokens_seen": 2759065600, + "step": 42100, + "train_runtime": 27570.8932, + "train_tokens_per_second": 100071.68 + }, + { + "epoch": 0.422, + "grad_norm": 0.13300351798534393, + "learning_rate": 0.0001890572934257475, + "loss": 0.3065, + "num_input_tokens_seen": 2765619200, + "step": 42200, + "train_runtime": 27634.6434, + "train_tokens_per_second": 100077.977 + }, + { + "epoch": 0.423, + "grad_norm": 0.14460822939872742, + "learning_rate": 0.00018859751791020497, + "loss": 0.3055, + "num_input_tokens_seen": 2772172800, + "step": 42300, + "train_runtime": 27700.3395, + "train_tokens_per_second": 100077.214 + }, + { + "epoch": 0.424, + "grad_norm": 0.1369091421365738, + "learning_rate": 0.0001881373537180856, + "loss": 0.3026, + "num_input_tokens_seen": 2778726400, + "step": 42400, + "train_runtime": 27764.0211, + "train_tokens_per_second": 100083.716 + }, + { + "epoch": 0.425, + "grad_norm": 0.15593157708644867, + "learning_rate": 0.00018767680548323766, + "loss": 0.3014, + "num_input_tokens_seen": 2785280000, + "step": 42500, + "train_runtime": 27828.3317, + "train_tokens_per_second": 100087.926 + }, + { + "epoch": 0.426, + "grad_norm": 0.18689674139022827, + "learning_rate": 0.0001872158778433768, + "loss": 0.3041, + "num_input_tokens_seen": 2791833600, + "step": 42600, + "train_runtime": 27897.9539, + "train_tokens_per_second": 100073.059 + }, + { + "epoch": 0.427, + "grad_norm": 0.1532142609357834, + "learning_rate": 0.0001867545754400392, + "loss": 0.3041, + "num_input_tokens_seen": 2798387200, + "step": 42700, + "train_runtime": 27964.2157, + "train_tokens_per_second": 100070.291 + }, + { + "epoch": 0.428, + "grad_norm": 0.12894967198371887, + "learning_rate": 0.000186292902918535, + "loss": 0.3047, + "num_input_tokens_seen": 2804940800, + "step": 42800, + "train_runtime": 28028.1798, + "train_tokens_per_second": 100075.739 + }, + { + "epoch": 0.429, + "grad_norm": 0.14526289701461792, + "learning_rate": 0.00018583086492790136, + "loss": 0.3097, + "num_input_tokens_seen": 2811494400, + "step": 42900, + "train_runtime": 28093.2724, + "train_tokens_per_second": 100077.142 + }, + { + "epoch": 0.43, + "grad_norm": 0.15546266734600067, + "learning_rate": 0.00018536846612085566, + "loss": 0.3066, + "num_input_tokens_seen": 2818048000, + "step": 43000, + "train_runtime": 28157.8145, + "train_tokens_per_second": 100080.495 + }, + { + "epoch": 0.431, + "grad_norm": 0.16307438910007477, + "learning_rate": 0.00018490571115374878, + "loss": 0.3073, + "num_input_tokens_seen": 2824601600, + "step": 43100, + "train_runtime": 28227.9591, + "train_tokens_per_second": 100063.968 + }, + { + "epoch": 0.432, + "grad_norm": 0.1360054761171341, + "learning_rate": 0.00018444260468651816, + "loss": 0.3013, + "num_input_tokens_seen": 2831155200, + "step": 43200, + "train_runtime": 28291.3921, + "train_tokens_per_second": 100071.258 + }, + { + "epoch": 0.433, + "grad_norm": 0.1404498666524887, + "learning_rate": 0.00018397915138264068, + "loss": 0.3066, + "num_input_tokens_seen": 2837708800, + "step": 43300, + "train_runtime": 28355.3195, + "train_tokens_per_second": 100076.771 + }, + { + "epoch": 0.434, + "grad_norm": 0.1926499307155609, + "learning_rate": 0.00018351535590908606, + "loss": 0.3012, + "num_input_tokens_seen": 2844262400, + "step": 43400, + "train_runtime": 28420.6726, + "train_tokens_per_second": 100077.237 + }, + { + "epoch": 0.435, + "grad_norm": 0.13713879883289337, + "learning_rate": 0.00018305122293626948, + "loss": 0.3029, + "num_input_tokens_seen": 2850816000, + "step": 43500, + "train_runtime": 28490.1826, + "train_tokens_per_second": 100063.1 + }, + { + "epoch": 0.436, + "grad_norm": 0.1541578322649002, + "learning_rate": 0.00018258675713800492, + "loss": 0.3061, + "num_input_tokens_seen": 2857369600, + "step": 43600, + "train_runtime": 28555.7903, + "train_tokens_per_second": 100062.704 + }, + { + "epoch": 0.437, + "grad_norm": 0.14117270708084106, + "learning_rate": 0.00018212196319145773, + "loss": 0.3053, + "num_input_tokens_seen": 2863923200, + "step": 43700, + "train_runtime": 28622.1811, + "train_tokens_per_second": 100059.572 + }, + { + "epoch": 0.438, + "grad_norm": 0.14943140745162964, + "learning_rate": 0.00018165684577709778, + "loss": 0.3043, + "num_input_tokens_seen": 2870476800, + "step": 43800, + "train_runtime": 28686.5648, + "train_tokens_per_second": 100063.455 + }, + { + "epoch": 0.439, + "grad_norm": 0.14043770730495453, + "learning_rate": 0.0001811914095786524, + "loss": 0.3048, + "num_input_tokens_seen": 2877030400, + "step": 43900, + "train_runtime": 28751.3532, + "train_tokens_per_second": 100065.913 + }, + { + "epoch": 0.44, + "grad_norm": 0.17811591923236847, + "learning_rate": 0.0001807256592830588, + "loss": 0.3088, + "num_input_tokens_seen": 2883584000, + "step": 44000, + "train_runtime": 28815.5193, + "train_tokens_per_second": 100070.52 + }, + { + "epoch": 0.441, + "grad_norm": 0.14588113129138947, + "learning_rate": 0.00018025959958041732, + "loss": 0.3017, + "num_input_tokens_seen": 2890137600, + "step": 44100, + "train_runtime": 28880.019, + "train_tokens_per_second": 100073.951 + }, + { + "epoch": 0.442, + "grad_norm": 0.22986213862895966, + "learning_rate": 0.00017979323516394407, + "loss": 0.3049, + "num_input_tokens_seen": 2896691200, + "step": 44200, + "train_runtime": 28945.7871, + "train_tokens_per_second": 100072.981 + }, + { + "epoch": 0.443, + "grad_norm": 0.853501558303833, + "learning_rate": 0.00017932657072992344, + "loss": 0.3081, + "num_input_tokens_seen": 2903244800, + "step": 44300, + "train_runtime": 29016.3509, + "train_tokens_per_second": 100055.476 + }, + { + "epoch": 0.444, + "grad_norm": 0.15835335850715637, + "learning_rate": 0.00017885961097766117, + "loss": 0.3035, + "num_input_tokens_seen": 2909798400, + "step": 44400, + "train_runtime": 29079.9877, + "train_tokens_per_second": 100061.886 + }, + { + "epoch": 0.445, + "grad_norm": 0.25418880581855774, + "learning_rate": 0.00017839236060943674, + "loss": 0.3014, + "num_input_tokens_seen": 2916352000, + "step": 44500, + "train_runtime": 29144.3776, + "train_tokens_per_second": 100065.681 + }, + { + "epoch": 0.446, + "grad_norm": 0.14922253787517548, + "learning_rate": 0.0001779248243304562, + "loss": 0.3038, + "num_input_tokens_seen": 2922905600, + "step": 44600, + "train_runtime": 29208.2393, + "train_tokens_per_second": 100071.27 + }, + { + "epoch": 0.447, + "grad_norm": 0.14103923738002777, + "learning_rate": 0.00017745700684880465, + "loss": 0.3064, + "num_input_tokens_seen": 2929459200, + "step": 44700, + "train_runtime": 29273.1105, + "train_tokens_per_second": 100073.383 + }, + { + "epoch": 0.448, + "grad_norm": 0.15813007950782776, + "learning_rate": 0.000176988912875399, + "loss": 0.3049, + "num_input_tokens_seen": 2936012800, + "step": 44800, + "train_runtime": 29342.9224, + "train_tokens_per_second": 100058.636 + }, + { + "epoch": 0.449, + "grad_norm": 0.1471075564622879, + "learning_rate": 0.00017652054712394028, + "loss": 0.3029, + "num_input_tokens_seen": 2942566400, + "step": 44900, + "train_runtime": 29408.1792, + "train_tokens_per_second": 100059.455 + }, + { + "epoch": 0.45, + "grad_norm": 0.16910097002983093, + "learning_rate": 0.0001760519143108665, + "loss": 0.3026, + "num_input_tokens_seen": 2949120000, + "step": 45000, + "train_runtime": 29472.6802, + "train_tokens_per_second": 100062.837 + }, + { + "epoch": 0.451, + "grad_norm": 0.15087512135505676, + "learning_rate": 0.00017558301915530483, + "loss": 0.305, + "num_input_tokens_seen": 2955673600, + "step": 45100, + "train_runtime": 29537.0324, + "train_tokens_per_second": 100066.708 + }, + { + "epoch": 0.452, + "grad_norm": 0.16292531788349152, + "learning_rate": 0.00017511386637902428, + "loss": 0.305, + "num_input_tokens_seen": 2962227200, + "step": 45200, + "train_runtime": 29600.4356, + "train_tokens_per_second": 100073.77 + }, + { + "epoch": 0.453, + "grad_norm": 0.14504611492156982, + "learning_rate": 0.00017464446070638814, + "loss": 0.3061, + "num_input_tokens_seen": 2968780800, + "step": 45300, + "train_runtime": 29670.2849, + "train_tokens_per_second": 100059.06 + }, + { + "epoch": 0.454, + "grad_norm": 0.14068329334259033, + "learning_rate": 0.00017417480686430622, + "loss": 0.3096, + "num_input_tokens_seen": 2975334400, + "step": 45400, + "train_runtime": 29735.31, + "train_tokens_per_second": 100060.648 + }, + { + "epoch": 0.455, + "grad_norm": 0.139748677611351, + "learning_rate": 0.00017370490958218765, + "loss": 0.3027, + "num_input_tokens_seen": 2981888000, + "step": 45500, + "train_runtime": 29800.4491, + "train_tokens_per_second": 100061.848 + }, + { + "epoch": 0.456, + "grad_norm": 0.1487821340560913, + "learning_rate": 0.00017323477359189272, + "loss": 0.3023, + "num_input_tokens_seen": 2988441600, + "step": 45600, + "train_runtime": 29869.053, + "train_tokens_per_second": 100051.434 + }, + { + "epoch": 0.457, + "grad_norm": 0.15015476942062378, + "learning_rate": 0.00017276440362768564, + "loss": 0.3028, + "num_input_tokens_seen": 2994995200, + "step": 45700, + "train_runtime": 29933.644, + "train_tokens_per_second": 100054.481 + }, + { + "epoch": 0.458, + "grad_norm": 0.1298416256904602, + "learning_rate": 0.0001722938044261868, + "loss": 0.3058, + "num_input_tokens_seen": 3001548800, + "step": 45800, + "train_runtime": 29997.6813, + "train_tokens_per_second": 100059.36 + }, + { + "epoch": 0.459, + "grad_norm": 0.1956530213356018, + "learning_rate": 0.0001718229807263249, + "loss": 0.3033, + "num_input_tokens_seen": 3008102400, + "step": 45900, + "train_runtime": 30067.1877, + "train_tokens_per_second": 100046.018 + }, + { + "epoch": 0.46, + "grad_norm": 0.15267929434776306, + "learning_rate": 0.0001713519372692894, + "loss": 0.3028, + "num_input_tokens_seen": 3014656000, + "step": 46000, + "train_runtime": 30131.0143, + "train_tokens_per_second": 100051.594 + }, + { + "epoch": 0.461, + "grad_norm": 0.13846905529499054, + "learning_rate": 0.0001708806787984826, + "loss": 0.3036, + "num_input_tokens_seen": 3021209600, + "step": 46100, + "train_runtime": 30195.5066, + "train_tokens_per_second": 100054.94 + }, + { + "epoch": 0.462, + "grad_norm": 0.13704828917980194, + "learning_rate": 0.00017040921005947212, + "loss": 0.3094, + "num_input_tokens_seen": 3027763200, + "step": 46200, + "train_runtime": 30260.3523, + "train_tokens_per_second": 100057.104 + }, + { + "epoch": 0.463, + "grad_norm": 0.15288543701171875, + "learning_rate": 0.0001699375357999429, + "loss": 0.3014, + "num_input_tokens_seen": 3034316800, + "step": 46300, + "train_runtime": 30325.5675, + "train_tokens_per_second": 100058.039 + }, + { + "epoch": 0.464, + "grad_norm": 0.19963988661766052, + "learning_rate": 0.0001694656607696496, + "loss": 0.3061, + "num_input_tokens_seen": 3040870400, + "step": 46400, + "train_runtime": 30399.8434, + "train_tokens_per_second": 100029.147 + }, + { + "epoch": 0.465, + "grad_norm": 0.14533430337905884, + "learning_rate": 0.0001689935897203684, + "loss": 0.3056, + "num_input_tokens_seen": 3047424000, + "step": 46500, + "train_runtime": 30464.3563, + "train_tokens_per_second": 100032.443 + }, + { + "epoch": 0.466, + "grad_norm": 0.14005503058433533, + "learning_rate": 0.0001685213274058496, + "loss": 0.3016, + "num_input_tokens_seen": 3053977600, + "step": 46600, + "train_runtime": 30528.7292, + "train_tokens_per_second": 100036.185 + }, + { + "epoch": 0.467, + "grad_norm": 0.17612388730049133, + "learning_rate": 0.00016804887858176944, + "loss": 0.3006, + "num_input_tokens_seen": 3060531200, + "step": 46700, + "train_runtime": 30592.7142, + "train_tokens_per_second": 100041.179 + }, + { + "epoch": 0.468, + "grad_norm": 0.13526348769664764, + "learning_rate": 0.00016757624800568238, + "loss": 0.3001, + "num_input_tokens_seen": 3067084800, + "step": 46800, + "train_runtime": 30656.5144, + "train_tokens_per_second": 100046.755 + }, + { + "epoch": 0.469, + "grad_norm": 0.6205772161483765, + "learning_rate": 0.00016710344043697301, + "loss": 0.3016, + "num_input_tokens_seen": 3073638400, + "step": 46900, + "train_runtime": 30727.0215, + "train_tokens_per_second": 100030.47 + }, + { + "epoch": 0.47, + "grad_norm": 0.15328101813793182, + "learning_rate": 0.0001666304606368083, + "loss": 0.3049, + "num_input_tokens_seen": 3080192000, + "step": 47000, + "train_runtime": 30792.0203, + "train_tokens_per_second": 100032.15 + }, + { + "epoch": 0.471, + "grad_norm": 0.1804981380701065, + "learning_rate": 0.00016615731336808962, + "loss": 0.3008, + "num_input_tokens_seen": 3086745600, + "step": 47100, + "train_runtime": 30856.1119, + "train_tokens_per_second": 100036.764 + }, + { + "epoch": 0.472, + "grad_norm": 0.1460595428943634, + "learning_rate": 0.0001656840033954047, + "loss": 0.2996, + "num_input_tokens_seen": 3093299200, + "step": 47200, + "train_runtime": 30922.3293, + "train_tokens_per_second": 100034.482 + }, + { + "epoch": 0.473, + "grad_norm": 0.17493313550949097, + "learning_rate": 0.00016521053548497973, + "loss": 0.3005, + "num_input_tokens_seen": 3099852800, + "step": 47300, + "train_runtime": 30985.6891, + "train_tokens_per_second": 100041.435 + }, + { + "epoch": 0.474, + "grad_norm": 0.11990969628095627, + "learning_rate": 0.0001647369144046313, + "loss": 0.2995, + "num_input_tokens_seen": 3106406400, + "step": 47400, + "train_runtime": 31056.5152, + "train_tokens_per_second": 100024.307 + }, + { + "epoch": 0.475, + "grad_norm": 0.15634778141975403, + "learning_rate": 0.00016426314492371842, + "loss": 0.3054, + "num_input_tokens_seen": 3112960000, + "step": 47500, + "train_runtime": 31121.0302, + "train_tokens_per_second": 100027.537 + }, + { + "epoch": 0.476, + "grad_norm": 0.14218732714653015, + "learning_rate": 0.0001637892318130945, + "loss": 0.3036, + "num_input_tokens_seen": 3119513600, + "step": 47600, + "train_runtime": 31185.6411, + "train_tokens_per_second": 100030.446 + }, + { + "epoch": 0.477, + "grad_norm": 0.147688090801239, + "learning_rate": 0.00016331517984505934, + "loss": 0.3003, + "num_input_tokens_seen": 3126067200, + "step": 47700, + "train_runtime": 31250.7507, + "train_tokens_per_second": 100031.748 + }, + { + "epoch": 0.478, + "grad_norm": 0.1728331595659256, + "learning_rate": 0.00016284099379331092, + "loss": 0.2997, + "num_input_tokens_seen": 3132620800, + "step": 47800, + "train_runtime": 31321.2751, + "train_tokens_per_second": 100015.749 + }, + { + "epoch": 0.479, + "grad_norm": 0.12835726141929626, + "learning_rate": 0.00016236667843289759, + "loss": 0.2989, + "num_input_tokens_seen": 3139174400, + "step": 47900, + "train_runtime": 31386.2974, + "train_tokens_per_second": 100017.353 + }, + { + "epoch": 0.48, + "grad_norm": 0.13368946313858032, + "learning_rate": 0.00016189223854016973, + "loss": 0.3078, + "num_input_tokens_seen": 3145728000, + "step": 48000, + "train_runtime": 31451.659, + "train_tokens_per_second": 100017.872 + }, + { + "epoch": 0.481, + "grad_norm": 0.12727653980255127, + "learning_rate": 0.00016141767889273182, + "loss": 0.3017, + "num_input_tokens_seen": 3152281600, + "step": 48100, + "train_runtime": 31516.5086, + "train_tokens_per_second": 100020.013 + }, + { + "epoch": 0.482, + "grad_norm": 0.16222263872623444, + "learning_rate": 0.00016094300426939417, + "loss": 0.3009, + "num_input_tokens_seen": 3158835200, + "step": 48200, + "train_runtime": 31581.3453, + "train_tokens_per_second": 100022.186 + }, + { + "epoch": 0.483, + "grad_norm": 0.15287387371063232, + "learning_rate": 0.00016046821945012505, + "loss": 0.2975, + "num_input_tokens_seen": 3165388800, + "step": 48300, + "train_runtime": 31645.8484, + "train_tokens_per_second": 100025.405 + }, + { + "epoch": 0.484, + "grad_norm": 0.13035738468170166, + "learning_rate": 0.00015999332921600226, + "loss": 0.3046, + "num_input_tokens_seen": 3171942400, + "step": 48400, + "train_runtime": 31716.5254, + "train_tokens_per_second": 100009.139 + }, + { + "epoch": 0.485, + "grad_norm": 0.16508948802947998, + "learning_rate": 0.00015951833834916532, + "loss": 0.3061, + "num_input_tokens_seen": 3178496000, + "step": 48500, + "train_runtime": 31781.7614, + "train_tokens_per_second": 100010.064 + }, + { + "epoch": 0.486, + "grad_norm": 0.1543286293745041, + "learning_rate": 0.00015904325163276672, + "loss": 0.2995, + "num_input_tokens_seen": 3185049600, + "step": 48600, + "train_runtime": 31847.2029, + "train_tokens_per_second": 100010.34 + }, + { + "epoch": 0.487, + "grad_norm": 0.13470540940761566, + "learning_rate": 0.00015856807385092466, + "loss": 0.3067, + "num_input_tokens_seen": 3191603200, + "step": 48700, + "train_runtime": 31911.0411, + "train_tokens_per_second": 100015.64 + }, + { + "epoch": 0.488, + "grad_norm": 0.15521059930324554, + "learning_rate": 0.00015809280978867405, + "loss": 0.3009, + "num_input_tokens_seen": 3198156800, + "step": 48800, + "train_runtime": 31975.3091, + "train_tokens_per_second": 100019.574 + }, + { + "epoch": 0.489, + "grad_norm": 0.16505663096904755, + "learning_rate": 0.0001576174642319187, + "loss": 0.3019, + "num_input_tokens_seen": 3204710400, + "step": 48900, + "train_runtime": 32039.3359, + "train_tokens_per_second": 100024.246 + }, + { + "epoch": 0.49, + "grad_norm": 0.15701062977313995, + "learning_rate": 0.0001571420419673831, + "loss": 0.3025, + "num_input_tokens_seen": 3211264000, + "step": 49000, + "train_runtime": 32104.9123, + "train_tokens_per_second": 100024.07 + }, + { + "epoch": 0.491, + "grad_norm": 0.22376379370689392, + "learning_rate": 0.0001566665477825642, + "loss": 0.3035, + "num_input_tokens_seen": 3217817600, + "step": 49100, + "train_runtime": 32177.5739, + "train_tokens_per_second": 100001.871 + }, + { + "epoch": 0.492, + "grad_norm": 0.1716614067554474, + "learning_rate": 0.0001561909864656831, + "loss": 0.3046, + "num_input_tokens_seen": 3224371200, + "step": 49200, + "train_runtime": 32241.8903, + "train_tokens_per_second": 100005.65 + }, + { + "epoch": 0.493, + "grad_norm": 0.17557290196418762, + "learning_rate": 0.00015571536280563705, + "loss": 0.2987, + "num_input_tokens_seen": 3230924800, + "step": 49300, + "train_runtime": 32307.4373, + "train_tokens_per_second": 100005.605 + }, + { + "epoch": 0.494, + "grad_norm": 0.16884572803974152, + "learning_rate": 0.000155239681591951, + "loss": 0.2986, + "num_input_tokens_seen": 3237478400, + "step": 49400, + "train_runtime": 32371.4412, + "train_tokens_per_second": 100010.326 + }, + { + "epoch": 0.495, + "grad_norm": 0.15279650688171387, + "learning_rate": 0.00015476394761472953, + "loss": 0.2982, + "num_input_tokens_seen": 3244032000, + "step": 49500, + "train_runtime": 32436.5241, + "train_tokens_per_second": 100011.702 + }, + { + "epoch": 0.496, + "grad_norm": 0.1866491436958313, + "learning_rate": 0.00015428816566460843, + "loss": 0.3038, + "num_input_tokens_seen": 3250585600, + "step": 49600, + "train_runtime": 32508.3167, + "train_tokens_per_second": 99992.43 + }, + { + "epoch": 0.497, + "grad_norm": 0.14084835350513458, + "learning_rate": 0.00015381234053270669, + "loss": 0.3027, + "num_input_tokens_seen": 3257139200, + "step": 49700, + "train_runtime": 32572.1194, + "train_tokens_per_second": 99997.767 + }, + { + "epoch": 0.498, + "grad_norm": 0.16111333668231964, + "learning_rate": 0.0001533364770105781, + "loss": 0.3015, + "num_input_tokens_seen": 3263692800, + "step": 49800, + "train_runtime": 32637.2501, + "train_tokens_per_second": 99999.013 + }, + { + "epoch": 0.499, + "grad_norm": 0.14655210077762604, + "learning_rate": 0.0001528605798901631, + "loss": 0.3012, + "num_input_tokens_seen": 3270246400, + "step": 49900, + "train_runtime": 32707.4201, + "train_tokens_per_second": 99984.847 + }, + { + "epoch": 0.5, + "grad_norm": 0.1385914832353592, + "learning_rate": 0.00015238465396374027, + "loss": 0.3027, + "num_input_tokens_seen": 3276800000, + "step": 50000, + "train_runtime": 32772.7798, + "train_tokens_per_second": 99985.415 + }, + { + "epoch": 0.501, + "grad_norm": 0.1433262825012207, + "learning_rate": 0.00015190870402387858, + "loss": 0.3006, + "num_input_tokens_seen": 3283353600, + "step": 50100, + "train_runtime": 32837.3412, + "train_tokens_per_second": 99988.412 + }, + { + "epoch": 0.502, + "grad_norm": 0.15529057383537292, + "learning_rate": 0.00015143273486338857, + "loss": 0.2995, + "num_input_tokens_seen": 3289907200, + "step": 50200, + "train_runtime": 32902.1033, + "train_tokens_per_second": 99990.787 + }, + { + "epoch": 0.503, + "grad_norm": 0.1301671862602234, + "learning_rate": 0.00015095675127527438, + "loss": 0.3055, + "num_input_tokens_seen": 3296460800, + "step": 50300, + "train_runtime": 32967.0743, + "train_tokens_per_second": 99992.519 + }, + { + "epoch": 0.504, + "grad_norm": 0.1454419493675232, + "learning_rate": 0.00015048075805268547, + "loss": 0.3036, + "num_input_tokens_seen": 3303014400, + "step": 50400, + "train_runtime": 33033.1243, + "train_tokens_per_second": 99990.978 + }, + { + "epoch": 0.505, + "grad_norm": 0.1473357379436493, + "learning_rate": 0.00015000475998886825, + "loss": 0.3018, + "num_input_tokens_seen": 3309568000, + "step": 50500, + "train_runtime": 33105.2406, + "train_tokens_per_second": 99971.121 + }, + { + "epoch": 0.506, + "grad_norm": 0.13996386528015137, + "learning_rate": 0.00014952876187711804, + "loss": 0.2974, + "num_input_tokens_seen": 3316121600, + "step": 50600, + "train_runtime": 33169.1198, + "train_tokens_per_second": 99976.171 + }, + { + "epoch": 0.507, + "grad_norm": 0.14000660181045532, + "learning_rate": 0.00014905276851073053, + "loss": 0.2992, + "num_input_tokens_seen": 3322675200, + "step": 50700, + "train_runtime": 33234.0005, + "train_tokens_per_second": 99978.19 + }, + { + "epoch": 0.508, + "grad_norm": 0.14661286771297455, + "learning_rate": 0.00014857678468295352, + "loss": 0.3045, + "num_input_tokens_seen": 3329228800, + "step": 50800, + "train_runtime": 33299.7758, + "train_tokens_per_second": 99977.514 + }, + { + "epoch": 0.509, + "grad_norm": 0.15111635625362396, + "learning_rate": 0.00014810081518693902, + "loss": 0.3006, + "num_input_tokens_seen": 3335782400, + "step": 50900, + "train_runtime": 33370.9097, + "train_tokens_per_second": 99960.787 + }, + { + "epoch": 0.51, + "grad_norm": 0.12965109944343567, + "learning_rate": 0.0001476248648156945, + "loss": 0.2986, + "num_input_tokens_seen": 3342336000, + "step": 51000, + "train_runtime": 33435.7602, + "train_tokens_per_second": 99962.913 + }, + { + "epoch": 0.511, + "grad_norm": 0.13791891932487488, + "learning_rate": 0.00014714893836203485, + "loss": 0.2994, + "num_input_tokens_seen": 3348889600, + "step": 51100, + "train_runtime": 33500.2878, + "train_tokens_per_second": 99965.995 + }, + { + "epoch": 0.512, + "grad_norm": 0.1420348435640335, + "learning_rate": 0.0001466730406185343, + "loss": 0.2996, + "num_input_tokens_seen": 3355443200, + "step": 51200, + "train_runtime": 33564.5521, + "train_tokens_per_second": 99969.849 + }, + { + "epoch": 0.513, + "grad_norm": 0.1938745528459549, + "learning_rate": 0.0001461971763774778, + "loss": 0.3007, + "num_input_tokens_seen": 3361996800, + "step": 51300, + "train_runtime": 33630.8004, + "train_tokens_per_second": 99967.79 + }, + { + "epoch": 0.514, + "grad_norm": 0.1449531763792038, + "learning_rate": 0.0001457213504308129, + "loss": 0.3011, + "num_input_tokens_seen": 3368550400, + "step": 51400, + "train_runtime": 33696.4447, + "train_tokens_per_second": 99967.532 + }, + { + "epoch": 0.515, + "grad_norm": 0.16473324596881866, + "learning_rate": 0.00014524556757010177, + "loss": 0.3005, + "num_input_tokens_seen": 3375104000, + "step": 51500, + "train_runtime": 33766.6492, + "train_tokens_per_second": 99953.773 + }, + { + "epoch": 0.516, + "grad_norm": 0.1542610377073288, + "learning_rate": 0.00014476983258647234, + "loss": 0.3012, + "num_input_tokens_seen": 3381657600, + "step": 51600, + "train_runtime": 33832.0917, + "train_tokens_per_second": 99954.139 + }, + { + "epoch": 0.517, + "grad_norm": 0.1388223022222519, + "learning_rate": 0.0001442941502705707, + "loss": 0.3031, + "num_input_tokens_seen": 3388211200, + "step": 51700, + "train_runtime": 33896.7212, + "train_tokens_per_second": 99956.901 + }, + { + "epoch": 0.518, + "grad_norm": 0.19452647864818573, + "learning_rate": 0.0001438185254125125, + "loss": 0.3011, + "num_input_tokens_seen": 3394764800, + "step": 51800, + "train_runtime": 33962.0557, + "train_tokens_per_second": 99957.577 + }, + { + "epoch": 0.519, + "grad_norm": 0.16043786704540253, + "learning_rate": 0.00014334296280183473, + "loss": 0.2997, + "num_input_tokens_seen": 3401318400, + "step": 51900, + "train_runtime": 34027.5551, + "train_tokens_per_second": 99957.766 + }, + { + "epoch": 0.52, + "grad_norm": 0.19769923388957977, + "learning_rate": 0.00014286746722744768, + "loss": 0.3007, + "num_input_tokens_seen": 3407872000, + "step": 52000, + "train_runtime": 34098.2307, + "train_tokens_per_second": 99942.781 + }, + { + "epoch": 0.521, + "grad_norm": 0.1524592489004135, + "learning_rate": 0.00014239204347758647, + "loss": 0.299, + "num_input_tokens_seen": 3414425600, + "step": 52100, + "train_runtime": 34164.2522, + "train_tokens_per_second": 99941.47 + }, + { + "epoch": 0.522, + "grad_norm": 0.14221727848052979, + "learning_rate": 0.00014191669633976294, + "loss": 0.3029, + "num_input_tokens_seen": 3420979200, + "step": 52200, + "train_runtime": 34227.7165, + "train_tokens_per_second": 99947.632 + }, + { + "epoch": 0.523, + "grad_norm": 0.15958262979984283, + "learning_rate": 0.00014144143060071756, + "loss": 0.3005, + "num_input_tokens_seen": 3427532800, + "step": 52300, + "train_runtime": 34292.9446, + "train_tokens_per_second": 99948.629 + }, + { + "epoch": 0.524, + "grad_norm": 0.1545192301273346, + "learning_rate": 0.000140966251046371, + "loss": 0.3024, + "num_input_tokens_seen": 3434086400, + "step": 52400, + "train_runtime": 34357.5392, + "train_tokens_per_second": 99951.466 + }, + { + "epoch": 0.525, + "grad_norm": 0.14636173844337463, + "learning_rate": 0.0001404911624617761, + "loss": 0.2967, + "num_input_tokens_seen": 3440640000, + "step": 52500, + "train_runtime": 34423.9361, + "train_tokens_per_second": 99949.058 + }, + { + "epoch": 0.526, + "grad_norm": 0.26764926314353943, + "learning_rate": 0.00014001616963106966, + "loss": 0.2982, + "num_input_tokens_seen": 3447193600, + "step": 52600, + "train_runtime": 34489.4544, + "train_tokens_per_second": 99949.206 + }, + { + "epoch": 0.527, + "grad_norm": 0.20636320114135742, + "learning_rate": 0.00013954127733742416, + "loss": 0.3011, + "num_input_tokens_seen": 3453747200, + "step": 52700, + "train_runtime": 34559.9071, + "train_tokens_per_second": 99935.083 + }, + { + "epoch": 0.528, + "grad_norm": 0.1523534059524536, + "learning_rate": 0.0001390664903629998, + "loss": 0.3042, + "num_input_tokens_seen": 3460300800, + "step": 52800, + "train_runtime": 34624.4507, + "train_tokens_per_second": 99938.071 + }, + { + "epoch": 0.529, + "grad_norm": 0.15213948488235474, + "learning_rate": 0.0001385918134888961, + "loss": 0.3024, + "num_input_tokens_seen": 3466854400, + "step": 52900, + "train_runtime": 34690.2273, + "train_tokens_per_second": 99937.495 + }, + { + "epoch": 0.53, + "grad_norm": 0.14115960896015167, + "learning_rate": 0.00013811725149510387, + "loss": 0.2999, + "num_input_tokens_seen": 3473408000, + "step": 53000, + "train_runtime": 34756.5786, + "train_tokens_per_second": 99935.268 + }, + { + "epoch": 0.531, + "grad_norm": 0.16747893393039703, + "learning_rate": 0.0001376428091604572, + "loss": 0.3011, + "num_input_tokens_seen": 3479961600, + "step": 53100, + "train_runtime": 34823.0381, + "train_tokens_per_second": 99932.74 + }, + { + "epoch": 0.532, + "grad_norm": 0.1266140639781952, + "learning_rate": 0.00013716849126258512, + "loss": 0.2985, + "num_input_tokens_seen": 3486515200, + "step": 53200, + "train_runtime": 34892.7557, + "train_tokens_per_second": 99920.89 + }, + { + "epoch": 0.533, + "grad_norm": 0.14753171801567078, + "learning_rate": 0.00013669430257786354, + "loss": 0.2996, + "num_input_tokens_seen": 3493068800, + "step": 53300, + "train_runtime": 34957.0461, + "train_tokens_per_second": 99924.599 + }, + { + "epoch": 0.534, + "grad_norm": 0.2617182731628418, + "learning_rate": 0.00013622024788136728, + "loss": 0.3027, + "num_input_tokens_seen": 3499622400, + "step": 53400, + "train_runtime": 35022.8837, + "train_tokens_per_second": 99923.879 + }, + { + "epoch": 0.535, + "grad_norm": 0.17150761187076569, + "learning_rate": 0.00013574633194682185, + "loss": 0.3027, + "num_input_tokens_seen": 3506176000, + "step": 53500, + "train_runtime": 35088.2396, + "train_tokens_per_second": 99924.534 + }, + { + "epoch": 0.536, + "grad_norm": 0.16566570103168488, + "learning_rate": 0.0001352725595465555, + "loss": 0.2999, + "num_input_tokens_seen": 3512729600, + "step": 53600, + "train_runtime": 35153.6189, + "train_tokens_per_second": 99925.12 + }, + { + "epoch": 0.537, + "grad_norm": 0.13577675819396973, + "learning_rate": 0.000134798935451451, + "loss": 0.2969, + "num_input_tokens_seen": 3519283200, + "step": 53700, + "train_runtime": 35225.0068, + "train_tokens_per_second": 99908.659 + }, + { + "epoch": 0.538, + "grad_norm": 0.20843537151813507, + "learning_rate": 0.00013432546443089768, + "loss": 0.2967, + "num_input_tokens_seen": 3525836800, + "step": 53800, + "train_runtime": 35288.0858, + "train_tokens_per_second": 99915.785 + }, + { + "epoch": 0.539, + "grad_norm": 0.15664201974868774, + "learning_rate": 0.0001338521512527436, + "loss": 0.3007, + "num_input_tokens_seen": 3532390400, + "step": 53900, + "train_runtime": 35353.7477, + "train_tokens_per_second": 99915.586 + }, + { + "epoch": 0.54, + "grad_norm": 0.14205297827720642, + "learning_rate": 0.00013337900068324712, + "loss": 0.3001, + "num_input_tokens_seen": 3538944000, + "step": 54000, + "train_runtime": 35423.5891, + "train_tokens_per_second": 99903.598 + }, + { + "epoch": 0.541, + "grad_norm": 0.13229498267173767, + "learning_rate": 0.00013290601748702918, + "loss": 0.2931, + "num_input_tokens_seen": 3545497600, + "step": 54100, + "train_runtime": 35489.6646, + "train_tokens_per_second": 99902.257 + }, + { + "epoch": 0.542, + "grad_norm": 0.1380510926246643, + "learning_rate": 0.00013243320642702543, + "loss": 0.3116, + "num_input_tokens_seen": 3552051200, + "step": 54200, + "train_runtime": 35554.9224, + "train_tokens_per_second": 99903.219 + }, + { + "epoch": 0.543, + "grad_norm": 0.16735288500785828, + "learning_rate": 0.0001319605722644379, + "loss": 0.2998, + "num_input_tokens_seen": 3558604800, + "step": 54300, + "train_runtime": 35619.4728, + "train_tokens_per_second": 99906.161 + }, + { + "epoch": 0.544, + "grad_norm": 0.17502574622631073, + "learning_rate": 0.0001314881197586874, + "loss": 0.3004, + "num_input_tokens_seen": 3565158400, + "step": 54400, + "train_runtime": 35685.8161, + "train_tokens_per_second": 99904.074 + }, + { + "epoch": 0.545, + "grad_norm": 0.14805424213409424, + "learning_rate": 0.0001310158536673654, + "loss": 0.2983, + "num_input_tokens_seen": 3571712000, + "step": 54500, + "train_runtime": 35750.1467, + "train_tokens_per_second": 99907.618 + }, + { + "epoch": 0.546, + "grad_norm": 0.1533045917749405, + "learning_rate": 0.0001305437787461862, + "loss": 0.2976, + "num_input_tokens_seen": 3578265600, + "step": 54600, + "train_runtime": 35816.4973, + "train_tokens_per_second": 99905.515 + }, + { + "epoch": 0.547, + "grad_norm": 0.18475773930549622, + "learning_rate": 0.00013007189974893903, + "loss": 0.2951, + "num_input_tokens_seen": 3584819200, + "step": 54700, + "train_runtime": 35886.6478, + "train_tokens_per_second": 99892.841 + }, + { + "epoch": 0.548, + "grad_norm": 0.13913068175315857, + "learning_rate": 0.00012960022142744016, + "loss": 0.297, + "num_input_tokens_seen": 3591372800, + "step": 54800, + "train_runtime": 35950.7798, + "train_tokens_per_second": 99896.937 + }, + { + "epoch": 0.549, + "grad_norm": 0.15448203682899475, + "learning_rate": 0.00012912874853148506, + "loss": 0.303, + "num_input_tokens_seen": 3597926400, + "step": 54900, + "train_runtime": 36015.8762, + "train_tokens_per_second": 99898.344 + }, + { + "epoch": 0.55, + "grad_norm": 0.15416036546230316, + "learning_rate": 0.00012865748580880053, + "loss": 0.2979, + "num_input_tokens_seen": 3604480000, + "step": 55000, + "train_runtime": 36080.201, + "train_tokens_per_second": 99901.883 + }, + { + "epoch": 0.551, + "grad_norm": 0.14506150782108307, + "learning_rate": 0.0001281864380049969, + "loss": 0.2983, + "num_input_tokens_seen": 3611033600, + "step": 55100, + "train_runtime": 36150.2521, + "train_tokens_per_second": 99889.583 + }, + { + "epoch": 0.552, + "grad_norm": 0.17357710003852844, + "learning_rate": 0.00012771560986352042, + "loss": 0.2986, + "num_input_tokens_seen": 3617587200, + "step": 55200, + "train_runtime": 36215.2659, + "train_tokens_per_second": 99891.223 + }, + { + "epoch": 0.553, + "grad_norm": 0.16711916029453278, + "learning_rate": 0.0001272450061256052, + "loss": 0.2979, + "num_input_tokens_seen": 3624140800, + "step": 55300, + "train_runtime": 36279.3222, + "train_tokens_per_second": 99895.494 + }, + { + "epoch": 0.554, + "grad_norm": 0.1502256691455841, + "learning_rate": 0.00012677463153022565, + "loss": 0.3007, + "num_input_tokens_seen": 3630694400, + "step": 55400, + "train_runtime": 36345.9552, + "train_tokens_per_second": 99892.667 + }, + { + "epoch": 0.555, + "grad_norm": 0.15480037033557892, + "learning_rate": 0.0001263044908140488, + "loss": 0.2975, + "num_input_tokens_seen": 3637248000, + "step": 55500, + "train_runtime": 36415.9598, + "train_tokens_per_second": 99880.602 + }, + { + "epoch": 0.556, + "grad_norm": 0.15693609416484833, + "learning_rate": 0.00012583458871138632, + "loss": 0.2978, + "num_input_tokens_seen": 3643801600, + "step": 55600, + "train_runtime": 36480.5541, + "train_tokens_per_second": 99883.395 + }, + { + "epoch": 0.557, + "grad_norm": 0.147445410490036, + "learning_rate": 0.00012536492995414723, + "loss": 0.2991, + "num_input_tokens_seen": 3650355200, + "step": 55700, + "train_runtime": 36545.2319, + "train_tokens_per_second": 99885.95 + }, + { + "epoch": 0.558, + "grad_norm": 0.13640980422496796, + "learning_rate": 0.00012489551927179007, + "loss": 0.2987, + "num_input_tokens_seen": 3656908800, + "step": 55800, + "train_runtime": 36611.0993, + "train_tokens_per_second": 99885.25 + }, + { + "epoch": 0.559, + "grad_norm": 0.14373840391635895, + "learning_rate": 0.00012442636139127508, + "loss": 0.3, + "num_input_tokens_seen": 3663462400, + "step": 55900, + "train_runtime": 36676.4606, + "train_tokens_per_second": 99885.931 + }, + { + "epoch": 0.56, + "grad_norm": 0.14679211378097534, + "learning_rate": 0.00012395746103701695, + "loss": 0.2996, + "num_input_tokens_seen": 3670016000, + "step": 56000, + "train_runtime": 36748.2938, + "train_tokens_per_second": 99869.018 + }, + { + "epoch": 0.561, + "grad_norm": 0.15536077320575714, + "learning_rate": 0.00012348882293083708, + "loss": 0.2953, + "num_input_tokens_seen": 3676569600, + "step": 56100, + "train_runtime": 36813.4246, + "train_tokens_per_second": 99870.35 + }, + { + "epoch": 0.562, + "grad_norm": 0.16678054630756378, + "learning_rate": 0.00012302045179191594, + "loss": 0.2969, + "num_input_tokens_seen": 3683123200, + "step": 56200, + "train_runtime": 36877.8431, + "train_tokens_per_second": 99873.607 + }, + { + "epoch": 0.563, + "grad_norm": 0.15781697630882263, + "learning_rate": 0.00012255235233674572, + "loss": 0.2972, + "num_input_tokens_seen": 3689676800, + "step": 56300, + "train_runtime": 36943.2178, + "train_tokens_per_second": 99874.267 + }, + { + "epoch": 0.564, + "grad_norm": 0.13541863858699799, + "learning_rate": 0.00012208452927908278, + "loss": 0.302, + "num_input_tokens_seen": 3696230400, + "step": 56400, + "train_runtime": 37008.8029, + "train_tokens_per_second": 99874.357 + }, + { + "epoch": 0.565, + "grad_norm": 0.1400034874677658, + "learning_rate": 0.00012161698732990003, + "loss": 0.3, + "num_input_tokens_seen": 3702784000, + "step": 56500, + "train_runtime": 37078.9889, + "train_tokens_per_second": 99862.054 + }, + { + "epoch": 0.566, + "grad_norm": 0.1511828452348709, + "learning_rate": 0.00012114973119733987, + "loss": 0.3017, + "num_input_tokens_seen": 3709337600, + "step": 56600, + "train_runtime": 37144.0507, + "train_tokens_per_second": 99863.573 + }, + { + "epoch": 0.567, + "grad_norm": 0.15576902031898499, + "learning_rate": 0.00012068276558666616, + "loss": 0.2981, + "num_input_tokens_seen": 3715891200, + "step": 56700, + "train_runtime": 37206.97, + "train_tokens_per_second": 99870.836 + }, + { + "epoch": 0.568, + "grad_norm": 0.24084219336509705, + "learning_rate": 0.00012021609520021752, + "loss": 0.3025, + "num_input_tokens_seen": 3722444800, + "step": 56800, + "train_runtime": 37278.1305, + "train_tokens_per_second": 99855.995 + }, + { + "epoch": 0.569, + "grad_norm": 0.16832643747329712, + "learning_rate": 0.00011974972473735957, + "loss": 0.301, + "num_input_tokens_seen": 3728998400, + "step": 56900, + "train_runtime": 37343.2452, + "train_tokens_per_second": 99857.374 + }, + { + "epoch": 0.57, + "grad_norm": 0.18326181173324585, + "learning_rate": 0.00011928365889443764, + "loss": 0.2987, + "num_input_tokens_seen": 3735552000, + "step": 57000, + "train_runtime": 37407.594, + "train_tokens_per_second": 99860.793 + }, + { + "epoch": 0.571, + "grad_norm": 0.15526984632015228, + "learning_rate": 0.00011881790236472966, + "loss": 0.2991, + "num_input_tokens_seen": 3742105600, + "step": 57100, + "train_runtime": 37474.3952, + "train_tokens_per_second": 99857.665 + }, + { + "epoch": 0.572, + "grad_norm": 0.18177416920661926, + "learning_rate": 0.00011835245983839869, + "loss": 0.3002, + "num_input_tokens_seen": 3748659200, + "step": 57200, + "train_runtime": 37538.8922, + "train_tokens_per_second": 99860.677 + }, + { + "epoch": 0.573, + "grad_norm": 0.1915498822927475, + "learning_rate": 0.00011788733600244575, + "loss": 0.2986, + "num_input_tokens_seen": 3755212800, + "step": 57300, + "train_runtime": 37605.3867, + "train_tokens_per_second": 99858.375 + }, + { + "epoch": 0.574, + "grad_norm": 0.15175184607505798, + "learning_rate": 0.00011742253554066278, + "loss": 0.3015, + "num_input_tokens_seen": 3761766400, + "step": 57400, + "train_runtime": 37678.0051, + "train_tokens_per_second": 99839.851 + }, + { + "epoch": 0.575, + "grad_norm": 0.16369026899337769, + "learning_rate": 0.00011695806313358523, + "loss": 0.3003, + "num_input_tokens_seen": 3768320000, + "step": 57500, + "train_runtime": 37742.0245, + "train_tokens_per_second": 99844.141 + }, + { + "epoch": 0.576, + "grad_norm": 0.16646848618984222, + "learning_rate": 0.00011649392345844506, + "loss": 0.2972, + "num_input_tokens_seen": 3774873600, + "step": 57600, + "train_runtime": 37807.5481, + "train_tokens_per_second": 99844.444 + }, + { + "epoch": 0.577, + "grad_norm": 0.14035099744796753, + "learning_rate": 0.00011603012118912372, + "loss": 0.2985, + "num_input_tokens_seen": 3781427200, + "step": 57700, + "train_runtime": 37871.8826, + "train_tokens_per_second": 99847.88 + }, + { + "epoch": 0.578, + "grad_norm": 0.14899714291095734, + "learning_rate": 0.00011556666099610485, + "loss": 0.3008, + "num_input_tokens_seen": 3787980800, + "step": 57800, + "train_runtime": 37943.2827, + "train_tokens_per_second": 99832.712 + }, + { + "epoch": 0.579, + "grad_norm": 0.15600667893886566, + "learning_rate": 0.00011510354754642745, + "loss": 0.303, + "num_input_tokens_seen": 3794534400, + "step": 57900, + "train_runtime": 38008.9332, + "train_tokens_per_second": 99832.699 + }, + { + "epoch": 0.58, + "grad_norm": 0.1631072610616684, + "learning_rate": 0.00011464078550363887, + "loss": 0.2978, + "num_input_tokens_seen": 3801088000, + "step": 58000, + "train_runtime": 38073.7575, + "train_tokens_per_second": 99834.853 + }, + { + "epoch": 0.581, + "grad_norm": 0.1560899019241333, + "learning_rate": 0.0001141783795277477, + "loss": 0.299, + "num_input_tokens_seen": 3807641600, + "step": 58100, + "train_runtime": 38139.694, + "train_tokens_per_second": 99834.089 + }, + { + "epoch": 0.582, + "grad_norm": 0.1506076604127884, + "learning_rate": 0.00011371633427517696, + "loss": 0.2985, + "num_input_tokens_seen": 3814195200, + "step": 58200, + "train_runtime": 38209.9556, + "train_tokens_per_second": 99822.026 + }, + { + "epoch": 0.583, + "grad_norm": 0.16049940884113312, + "learning_rate": 0.00011325465439871731, + "loss": 0.2998, + "num_input_tokens_seen": 3820748800, + "step": 58300, + "train_runtime": 38274.5015, + "train_tokens_per_second": 99824.913 + }, + { + "epoch": 0.584, + "grad_norm": 0.15604519844055176, + "learning_rate": 0.00011279334454747989, + "loss": 0.2969, + "num_input_tokens_seen": 3827302400, + "step": 58400, + "train_runtime": 38341.4547, + "train_tokens_per_second": 99821.523 + }, + { + "epoch": 0.585, + "grad_norm": 0.15963351726531982, + "learning_rate": 0.00011233240936684981, + "loss": 0.2988, + "num_input_tokens_seen": 3833856000, + "step": 58500, + "train_runtime": 38406.0222, + "train_tokens_per_second": 99824.345 + }, + { + "epoch": 0.586, + "grad_norm": 0.15443411469459534, + "learning_rate": 0.00011187185349843916, + "loss": 0.298, + "num_input_tokens_seen": 3840409600, + "step": 58600, + "train_runtime": 38472.0656, + "train_tokens_per_second": 99823.327 + }, + { + "epoch": 0.587, + "grad_norm": 0.15459220111370087, + "learning_rate": 0.00011141168158004053, + "loss": 0.3004, + "num_input_tokens_seen": 3846963200, + "step": 58700, + "train_runtime": 38542.0532, + "train_tokens_per_second": 99812.098 + }, + { + "epoch": 0.588, + "grad_norm": 0.16199928522109985, + "learning_rate": 0.00011095189824557998, + "loss": 0.2985, + "num_input_tokens_seen": 3853516800, + "step": 58800, + "train_runtime": 38609.4411, + "train_tokens_per_second": 99807.63 + }, + { + "epoch": 0.589, + "grad_norm": 0.2209610939025879, + "learning_rate": 0.00011049250812507054, + "loss": 0.3005, + "num_input_tokens_seen": 3860070400, + "step": 58900, + "train_runtime": 38675.4402, + "train_tokens_per_second": 99806.761 + }, + { + "epoch": 0.59, + "grad_norm": 0.22285670042037964, + "learning_rate": 0.00011003351584456571, + "loss": 0.298, + "num_input_tokens_seen": 3866624000, + "step": 59000, + "train_runtime": 38740.3065, + "train_tokens_per_second": 99808.813 + }, + { + "epoch": 0.591, + "grad_norm": 0.2148812711238861, + "learning_rate": 0.0001095749260261126, + "loss": 0.2966, + "num_input_tokens_seen": 3873177600, + "step": 59100, + "train_runtime": 38806.3344, + "train_tokens_per_second": 99807.871 + }, + { + "epoch": 0.592, + "grad_norm": 0.21284043788909912, + "learning_rate": 0.00010911674328770559, + "loss": 0.3009, + "num_input_tokens_seen": 3879731200, + "step": 59200, + "train_runtime": 38871.8466, + "train_tokens_per_second": 99808.256 + }, + { + "epoch": 0.593, + "grad_norm": 0.1655593365430832, + "learning_rate": 0.00010865897224323979, + "loss": 0.2981, + "num_input_tokens_seen": 3886284800, + "step": 59300, + "train_runtime": 38937.7196, + "train_tokens_per_second": 99807.714 + }, + { + "epoch": 0.594, + "grad_norm": 0.17153207957744598, + "learning_rate": 0.00010820161750246453, + "loss": 0.3042, + "num_input_tokens_seen": 3892838400, + "step": 59400, + "train_runtime": 39004.8582, + "train_tokens_per_second": 99803.937 + }, + { + "epoch": 0.595, + "grad_norm": 0.15362666547298431, + "learning_rate": 0.00010774468367093696, + "loss": 0.3001, + "num_input_tokens_seen": 3899392000, + "step": 59500, + "train_runtime": 39068.7475, + "train_tokens_per_second": 99808.472 + }, + { + "epoch": 0.596, + "grad_norm": 0.15481388568878174, + "learning_rate": 0.00010728817534997573, + "loss": 0.2973, + "num_input_tokens_seen": 3905945600, + "step": 59600, + "train_runtime": 39137.2916, + "train_tokens_per_second": 99801.122 + }, + { + "epoch": 0.597, + "grad_norm": 0.1292748749256134, + "learning_rate": 0.00010683209713661453, + "loss": 0.2993, + "num_input_tokens_seen": 3912499200, + "step": 59700, + "train_runtime": 39198.2818, + "train_tokens_per_second": 99813.028 + }, + { + "epoch": 0.598, + "grad_norm": 0.14853951334953308, + "learning_rate": 0.00010637645362355589, + "loss": 0.2967, + "num_input_tokens_seen": 3919052800, + "step": 59800, + "train_runtime": 39262.6162, + "train_tokens_per_second": 99816.395 + }, + { + "epoch": 0.599, + "grad_norm": 0.13745439052581787, + "learning_rate": 0.00010592124939912497, + "loss": 0.3023, + "num_input_tokens_seen": 3925606400, + "step": 59900, + "train_runtime": 39328.4755, + "train_tokens_per_second": 99815.88 + }, + { + "epoch": 0.6, + "grad_norm": 0.14352121949195862, + "learning_rate": 0.00010546648904722326, + "loss": 0.2973, + "num_input_tokens_seen": 3932160000, + "step": 60000, + "train_runtime": 39393.6967, + "train_tokens_per_second": 99816.984 + }, + { + "epoch": 0.601, + "grad_norm": 0.16375063359737396, + "learning_rate": 0.0001050121771472824, + "loss": 0.2934, + "num_input_tokens_seen": 3938713600, + "step": 60100, + "train_runtime": 39465.7876, + "train_tokens_per_second": 99800.709 + }, + { + "epoch": 0.602, + "grad_norm": 0.144679456949234, + "learning_rate": 0.0001045583182742182, + "loss": 0.2983, + "num_input_tokens_seen": 3945267200, + "step": 60200, + "train_runtime": 39531.166, + "train_tokens_per_second": 99801.438 + }, + { + "epoch": 0.603, + "grad_norm": 0.33903974294662476, + "learning_rate": 0.00010410491699838448, + "loss": 0.2981, + "num_input_tokens_seen": 3951820800, + "step": 60300, + "train_runtime": 39596.8662, + "train_tokens_per_second": 99801.352 + }, + { + "epoch": 0.604, + "grad_norm": 0.1823410987854004, + "learning_rate": 0.00010365197788552707, + "loss": 0.2986, + "num_input_tokens_seen": 3958374400, + "step": 60400, + "train_runtime": 39664.1206, + "train_tokens_per_second": 99797.357 + }, + { + "epoch": 0.605, + "grad_norm": 0.18758277595043182, + "learning_rate": 0.00010319950549673778, + "loss": 0.2967, + "num_input_tokens_seen": 3964928000, + "step": 60500, + "train_runtime": 39728.4695, + "train_tokens_per_second": 99800.673 + }, + { + "epoch": 0.606, + "grad_norm": 0.173909991979599, + "learning_rate": 0.00010274750438840855, + "loss": 0.2981, + "num_input_tokens_seen": 3971481600, + "step": 60600, + "train_runtime": 39794.5098, + "train_tokens_per_second": 99799.737 + }, + { + "epoch": 0.607, + "grad_norm": 0.14504651725292206, + "learning_rate": 0.00010229597911218554, + "loss": 0.2967, + "num_input_tokens_seen": 3978035200, + "step": 60700, + "train_runtime": 39864.8024, + "train_tokens_per_second": 99788.158 + }, + { + "epoch": 0.608, + "grad_norm": 0.1418026238679886, + "learning_rate": 0.00010184493421492324, + "loss": 0.2976, + "num_input_tokens_seen": 3984588800, + "step": 60800, + "train_runtime": 39931.2064, + "train_tokens_per_second": 99786.337 + }, + { + "epoch": 0.609, + "grad_norm": 0.18415790796279907, + "learning_rate": 0.0001013943742386388, + "loss": 0.2997, + "num_input_tokens_seen": 3991142400, + "step": 60900, + "train_runtime": 39996.7127, + "train_tokens_per_second": 99786.761 + }, + { + "epoch": 0.61, + "grad_norm": 0.14107364416122437, + "learning_rate": 0.00010094430372046616, + "loss": 0.2979, + "num_input_tokens_seen": 3997696000, + "step": 61000, + "train_runtime": 40068.6157, + "train_tokens_per_second": 99771.253 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 3997696000, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.5963643723776e+16, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +}