{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 15148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026409613099168095, "grad_norm": 344.0, "learning_rate": 0.0, "loss": 13.395, "num_input_tokens_seen": 65536, "step": 1, "train_runtime": 39.9594, "train_tokens_per_second": 1640.065 }, { "epoch": 0.00264096130991681, "grad_norm": 31.5, "learning_rate": 9.89010989010989e-06, "loss": 12.1214, "num_input_tokens_seen": 655360, "step": 10, "train_runtime": 183.0716, "train_tokens_per_second": 3579.802 }, { "epoch": 0.00528192261983362, "grad_norm": 3.703125, "learning_rate": 2.087912087912088e-05, "loss": 10.1795, "num_input_tokens_seen": 1310720, "step": 20, "train_runtime": 442.7935, "train_tokens_per_second": 2960.116 }, { "epoch": 0.00792288392975043, "grad_norm": 4.40625, "learning_rate": 3.1868131868131866e-05, "loss": 9.794, "num_input_tokens_seen": 1966080, "step": 30, "train_runtime": 710.3312, "train_tokens_per_second": 2767.836 }, { "epoch": 0.01056384523966724, "grad_norm": 15.375, "learning_rate": 4.2857142857142856e-05, "loss": 9.2328, "num_input_tokens_seen": 2621440, "step": 40, "train_runtime": 978.4124, "train_tokens_per_second": 2679.279 }, { "epoch": 0.013204806549584048, "grad_norm": 20.0, "learning_rate": 5.3846153846153853e-05, "loss": 7.5562, "num_input_tokens_seen": 3276800, "step": 50, "train_runtime": 1245.024, "train_tokens_per_second": 2631.917 }, { "epoch": 0.01584576785950086, "grad_norm": 15.25, "learning_rate": 6.483516483516484e-05, "loss": 6.6608, "num_input_tokens_seen": 3932160, "step": 60, "train_runtime": 1512.3636, "train_tokens_per_second": 2600.01 }, { "epoch": 0.018486729169417668, "grad_norm": 6.90625, "learning_rate": 7.582417582417581e-05, "loss": 6.3654, "num_input_tokens_seen": 4587520, "step": 70, "train_runtime": 1781.6818, "train_tokens_per_second": 2574.826 }, { "epoch": 0.02112769047933448, "grad_norm": 6.8125, "learning_rate": 8.681318681318681e-05, "loss": 5.9489, "num_input_tokens_seen": 5242880, "step": 80, "train_runtime": 2051.9937, "train_tokens_per_second": 2555.018 }, { "epoch": 0.023768651789251288, "grad_norm": 6.0625, "learning_rate": 9.780219780219781e-05, "loss": 5.5165, "num_input_tokens_seen": 5898240, "step": 90, "train_runtime": 2374.9912, "train_tokens_per_second": 2483.479 }, { "epoch": 0.026409613099168096, "grad_norm": 4.96875, "learning_rate": 0.00010879120879120879, "loss": 5.1193, "num_input_tokens_seen": 6553600, "step": 100, "train_runtime": 2729.9406, "train_tokens_per_second": 2400.638 }, { "epoch": 0.029050574409084907, "grad_norm": 4.6875, "learning_rate": 0.00011978021978021979, "loss": 4.9257, "num_input_tokens_seen": 7208960, "step": 110, "train_runtime": 3085.8271, "train_tokens_per_second": 2336.152 }, { "epoch": 0.03169153571900172, "grad_norm": 5.84375, "learning_rate": 0.00013076923076923077, "loss": 4.6357, "num_input_tokens_seen": 7864320, "step": 120, "train_runtime": 3441.9154, "train_tokens_per_second": 2284.867 }, { "epoch": 0.034332497028918524, "grad_norm": 4.71875, "learning_rate": 0.00014175824175824176, "loss": 4.6891, "num_input_tokens_seen": 8519680, "step": 130, "train_runtime": 3798.0993, "train_tokens_per_second": 2243.143 }, { "epoch": 0.036973458338835336, "grad_norm": 2.75, "learning_rate": 0.00015274725274725277, "loss": 4.5238, "num_input_tokens_seen": 9175040, "step": 140, "train_runtime": 4158.3319, "train_tokens_per_second": 2206.423 }, { "epoch": 0.03961441964875215, "grad_norm": 3.375, "learning_rate": 0.00016373626373626375, "loss": 4.1358, "num_input_tokens_seen": 9830400, "step": 150, "train_runtime": 4518.0275, "train_tokens_per_second": 2175.817 }, { "epoch": 0.04225538095866896, "grad_norm": 2.921875, "learning_rate": 0.0001747252747252747, "loss": 4.2246, "num_input_tokens_seen": 10485760, "step": 160, "train_runtime": 4877.0489, "train_tokens_per_second": 2150.022 }, { "epoch": 0.044896342268585764, "grad_norm": 3.140625, "learning_rate": 0.00018571428571428572, "loss": 3.9947, "num_input_tokens_seen": 11141120, "step": 170, "train_runtime": 5236.3642, "train_tokens_per_second": 2127.644 }, { "epoch": 0.047537303578502575, "grad_norm": 2.359375, "learning_rate": 0.0001967032967032967, "loss": 3.9596, "num_input_tokens_seen": 11796480, "step": 180, "train_runtime": 5592.6933, "train_tokens_per_second": 2109.266 }, { "epoch": 0.05017826488841939, "grad_norm": 5.03125, "learning_rate": 0.0002076923076923077, "loss": 3.9512, "num_input_tokens_seen": 12451840, "step": 190, "train_runtime": 5949.0933, "train_tokens_per_second": 2093.065 }, { "epoch": 0.05281922619833619, "grad_norm": 2.125, "learning_rate": 0.0002186813186813187, "loss": 3.8762, "num_input_tokens_seen": 13107200, "step": 200, "train_runtime": 6307.5561, "train_tokens_per_second": 2078.016 }, { "epoch": 0.055460187508253, "grad_norm": 2.109375, "learning_rate": 0.00022967032967032965, "loss": 3.7407, "num_input_tokens_seen": 13762560, "step": 210, "train_runtime": 6669.6459, "train_tokens_per_second": 2063.462 }, { "epoch": 0.058101148818169815, "grad_norm": 1.9375, "learning_rate": 0.00024065934065934066, "loss": 3.8132, "num_input_tokens_seen": 14417920, "step": 220, "train_runtime": 7028.6859, "train_tokens_per_second": 2051.297 }, { "epoch": 0.06074211012808663, "grad_norm": 2.15625, "learning_rate": 0.00025164835164835165, "loss": 3.7891, "num_input_tokens_seen": 15073280, "step": 230, "train_runtime": 7387.9861, "train_tokens_per_second": 2040.242 }, { "epoch": 0.06338307143800344, "grad_norm": 1.5078125, "learning_rate": 0.00026263736263736266, "loss": 3.6183, "num_input_tokens_seen": 15728640, "step": 240, "train_runtime": 7747.3492, "train_tokens_per_second": 2030.196 }, { "epoch": 0.06602403274792025, "grad_norm": 1.9453125, "learning_rate": 0.00027362637362637367, "loss": 3.7294, "num_input_tokens_seen": 16384000, "step": 250, "train_runtime": 8107.0182, "train_tokens_per_second": 2020.965 }, { "epoch": 0.06866499405783705, "grad_norm": 1.6640625, "learning_rate": 0.0002846153846153846, "loss": 3.5328, "num_input_tokens_seen": 17039360, "step": 260, "train_runtime": 8467.8519, "train_tokens_per_second": 2012.241 }, { "epoch": 0.07130595536775386, "grad_norm": 1.8515625, "learning_rate": 0.00029560439560439563, "loss": 3.6193, "num_input_tokens_seen": 17694720, "step": 270, "train_runtime": 8827.9374, "train_tokens_per_second": 2004.4 }, { "epoch": 0.07394691667767067, "grad_norm": 1.4921875, "learning_rate": 0.00030659340659340665, "loss": 3.5792, "num_input_tokens_seen": 18350080, "step": 280, "train_runtime": 9190.5559, "train_tokens_per_second": 1996.624 }, { "epoch": 0.07658787798758748, "grad_norm": 1.453125, "learning_rate": 0.00031758241758241755, "loss": 3.5012, "num_input_tokens_seen": 19005440, "step": 290, "train_runtime": 9551.7396, "train_tokens_per_second": 1989.736 }, { "epoch": 0.0792288392975043, "grad_norm": 1.1171875, "learning_rate": 0.00032857142857142856, "loss": 3.5406, "num_input_tokens_seen": 19660800, "step": 300, "train_runtime": 9913.3256, "train_tokens_per_second": 1983.27 }, { "epoch": 0.0818698006074211, "grad_norm": 1.390625, "learning_rate": 0.00033956043956043957, "loss": 3.5479, "num_input_tokens_seen": 20316160, "step": 310, "train_runtime": 10272.1454, "train_tokens_per_second": 1977.791 }, { "epoch": 0.08451076191733792, "grad_norm": 1.546875, "learning_rate": 0.0003505494505494505, "loss": 3.4769, "num_input_tokens_seen": 20971520, "step": 320, "train_runtime": 10630.524, "train_tokens_per_second": 1972.764 }, { "epoch": 0.08715172322725472, "grad_norm": 1.5390625, "learning_rate": 0.00036153846153846154, "loss": 3.5208, "num_input_tokens_seen": 21626880, "step": 330, "train_runtime": 10988.8532, "train_tokens_per_second": 1968.074 }, { "epoch": 0.08979268453717153, "grad_norm": 1.390625, "learning_rate": 0.00037252747252747255, "loss": 3.5306, "num_input_tokens_seen": 22282240, "step": 340, "train_runtime": 11348.4705, "train_tokens_per_second": 1963.458 }, { "epoch": 0.09243364584708834, "grad_norm": 1.3359375, "learning_rate": 0.00038351648351648356, "loss": 3.5036, "num_input_tokens_seen": 22937600, "step": 350, "train_runtime": 11707.3514, "train_tokens_per_second": 1959.248 }, { "epoch": 0.09507460715700515, "grad_norm": 1.125, "learning_rate": 0.0003945054945054945, "loss": 3.4787, "num_input_tokens_seen": 23592960, "step": 360, "train_runtime": 12066.3216, "train_tokens_per_second": 1955.274 }, { "epoch": 0.09771556846692196, "grad_norm": 1.3203125, "learning_rate": 0.0004054945054945055, "loss": 3.3933, "num_input_tokens_seen": 24248320, "step": 370, "train_runtime": 12425.7846, "train_tokens_per_second": 1951.452 }, { "epoch": 0.10035652977683877, "grad_norm": 1.2734375, "learning_rate": 0.00041648351648351654, "loss": 3.4365, "num_input_tokens_seen": 24903680, "step": 380, "train_runtime": 12785.724, "train_tokens_per_second": 1947.772 }, { "epoch": 0.10299749108675559, "grad_norm": 1.25, "learning_rate": 0.00042747252747252744, "loss": 3.3713, "num_input_tokens_seen": 25559040, "step": 390, "train_runtime": 13145.3532, "train_tokens_per_second": 1944.34 }, { "epoch": 0.10563845239667238, "grad_norm": 1.2265625, "learning_rate": 0.00043846153846153845, "loss": 3.4548, "num_input_tokens_seen": 26214400, "step": 400, "train_runtime": 13504.0465, "train_tokens_per_second": 1941.226 }, { "epoch": 0.1082794137065892, "grad_norm": 0.9453125, "learning_rate": 0.00044945054945054946, "loss": 3.5148, "num_input_tokens_seen": 26869760, "step": 410, "train_runtime": 13863.849, "train_tokens_per_second": 1938.117 }, { "epoch": 0.110920375016506, "grad_norm": 0.859375, "learning_rate": 0.0004604395604395604, "loss": 3.4389, "num_input_tokens_seen": 27525120, "step": 420, "train_runtime": 14223.2273, "train_tokens_per_second": 1935.223 }, { "epoch": 0.11356133632642282, "grad_norm": 0.89453125, "learning_rate": 0.0004714285714285714, "loss": 3.4073, "num_input_tokens_seen": 28180480, "step": 430, "train_runtime": 14584.6221, "train_tokens_per_second": 1932.205 }, { "epoch": 0.11620229763633963, "grad_norm": 1.0, "learning_rate": 0.00048241758241758244, "loss": 3.3578, "num_input_tokens_seen": 28835840, "step": 440, "train_runtime": 14947.6491, "train_tokens_per_second": 1929.122 }, { "epoch": 0.11884325894625644, "grad_norm": 0.95703125, "learning_rate": 0.0004934065934065934, "loss": 3.3008, "num_input_tokens_seen": 29491200, "step": 450, "train_runtime": 15308.7424, "train_tokens_per_second": 1926.429 }, { "epoch": 0.12148422025617325, "grad_norm": 1.0390625, "learning_rate": 0.0004999999085657815, "loss": 3.3657, "num_input_tokens_seen": 30146560, "step": 460, "train_runtime": 15670.4393, "train_tokens_per_second": 1923.785 }, { "epoch": 0.12412518156609005, "grad_norm": 0.69140625, "learning_rate": 0.0004999988799315912, "loss": 3.3419, "num_input_tokens_seen": 30801920, "step": 470, "train_runtime": 16031.0698, "train_tokens_per_second": 1921.389 }, { "epoch": 0.12676614287600688, "grad_norm": 0.734375, "learning_rate": 0.0004999967083751558, "loss": 3.2552, "num_input_tokens_seen": 31457280, "step": 480, "train_runtime": 16392.9295, "train_tokens_per_second": 1918.954 }, { "epoch": 0.1294071041859237, "grad_norm": 0.74609375, "learning_rate": 0.0004999933939064028, "loss": 3.2604, "num_input_tokens_seen": 32112640, "step": 490, "train_runtime": 16754.6432, "train_tokens_per_second": 1916.641 }, { "epoch": 0.1320480654958405, "grad_norm": 0.7734375, "learning_rate": 0.0004999889365404853, "loss": 3.3463, "num_input_tokens_seen": 32768000, "step": 500, "train_runtime": 17119.8942, "train_tokens_per_second": 1914.031 }, { "epoch": 0.13468902680575728, "grad_norm": 0.875, "learning_rate": 0.0004999833362977808, "loss": 3.2953, "num_input_tokens_seen": 33423360, "step": 510, "train_runtime": 17482.3757, "train_tokens_per_second": 1911.832 }, { "epoch": 0.1373299881156741, "grad_norm": 0.61328125, "learning_rate": 0.0004999765932038922, "loss": 3.2767, "num_input_tokens_seen": 34078720, "step": 520, "train_runtime": 17845.7721, "train_tokens_per_second": 1909.624 }, { "epoch": 0.1399709494255909, "grad_norm": 0.5859375, "learning_rate": 0.0004999687072896469, "loss": 3.2203, "num_input_tokens_seen": 34734080, "step": 530, "train_runtime": 18209.6586, "train_tokens_per_second": 1907.454 }, { "epoch": 0.14261191073550772, "grad_norm": 0.734375, "learning_rate": 0.0004999596785910971, "loss": 3.2896, "num_input_tokens_seen": 35389440, "step": 540, "train_runtime": 18573.9267, "train_tokens_per_second": 1905.329 }, { "epoch": 0.14525287204542453, "grad_norm": 0.63671875, "learning_rate": 0.0004999495071495191, "loss": 3.2258, "num_input_tokens_seen": 36044800, "step": 550, "train_runtime": 18927.2647, "train_tokens_per_second": 1904.385 }, { "epoch": 0.14789383335534134, "grad_norm": 0.65625, "learning_rate": 0.0004999381930114139, "loss": 3.1621, "num_input_tokens_seen": 36700160, "step": 560, "train_runtime": 19241.9441, "train_tokens_per_second": 1907.3 }, { "epoch": 0.15053479466525815, "grad_norm": 0.625, "learning_rate": 0.0004999257362285067, "loss": 3.1957, "num_input_tokens_seen": 37355520, "step": 570, "train_runtime": 19556.4805, "train_tokens_per_second": 1910.135 }, { "epoch": 0.15317575597517497, "grad_norm": 0.56640625, "learning_rate": 0.000499912136857746, "loss": 3.245, "num_input_tokens_seen": 38010880, "step": 580, "train_runtime": 19869.9929, "train_tokens_per_second": 1912.979 }, { "epoch": 0.15581671728509178, "grad_norm": 0.59765625, "learning_rate": 0.0004998973949613041, "loss": 3.2056, "num_input_tokens_seen": 38666240, "step": 590, "train_runtime": 20184.3191, "train_tokens_per_second": 1915.657 }, { "epoch": 0.1584576785950086, "grad_norm": 0.5625, "learning_rate": 0.0004998815106065771, "loss": 3.0587, "num_input_tokens_seen": 39321600, "step": 600, "train_runtime": 20498.9508, "train_tokens_per_second": 1918.225 }, { "epoch": 0.1610986399049254, "grad_norm": 0.53125, "learning_rate": 0.0004998644838661833, "loss": 3.1037, "num_input_tokens_seen": 39976960, "step": 610, "train_runtime": 20812.9691, "train_tokens_per_second": 1920.772 }, { "epoch": 0.1637396012148422, "grad_norm": 0.56640625, "learning_rate": 0.0004998463148179641, "loss": 3.1465, "num_input_tokens_seen": 40632320, "step": 620, "train_runtime": 21126.4959, "train_tokens_per_second": 1923.287 }, { "epoch": 0.16638056252475902, "grad_norm": 0.546875, "learning_rate": 0.0004998270035449831, "loss": 3.1053, "num_input_tokens_seen": 41287680, "step": 630, "train_runtime": 21440.6055, "train_tokens_per_second": 1925.677 }, { "epoch": 0.16902152383467584, "grad_norm": 0.59375, "learning_rate": 0.0004998065501355258, "loss": 3.1986, "num_input_tokens_seen": 41943040, "step": 640, "train_runtime": 21754.8128, "train_tokens_per_second": 1927.989 }, { "epoch": 0.17166248514459262, "grad_norm": 0.58984375, "learning_rate": 0.0004997849546830994, "loss": 3.1209, "num_input_tokens_seen": 42598400, "step": 650, "train_runtime": 22068.5213, "train_tokens_per_second": 1930.279 }, { "epoch": 0.17430344645450943, "grad_norm": 0.53125, "learning_rate": 0.0004997622172864317, "loss": 3.136, "num_input_tokens_seen": 43253760, "step": 660, "train_runtime": 22382.6292, "train_tokens_per_second": 1932.47 }, { "epoch": 0.17694440776442624, "grad_norm": 0.53515625, "learning_rate": 0.000499738338049472, "loss": 3.0647, "num_input_tokens_seen": 43909120, "step": 670, "train_runtime": 22696.8961, "train_tokens_per_second": 1934.587 }, { "epoch": 0.17958536907434305, "grad_norm": 0.5390625, "learning_rate": 0.0004997133170813886, "loss": 3.1302, "num_input_tokens_seen": 44564480, "step": 680, "train_runtime": 23012.1109, "train_tokens_per_second": 1936.566 }, { "epoch": 0.18222633038425987, "grad_norm": 0.484375, "learning_rate": 0.0004996871544965707, "loss": 3.0429, "num_input_tokens_seen": 45219840, "step": 690, "train_runtime": 23327.9941, "train_tokens_per_second": 1938.437 }, { "epoch": 0.18486729169417668, "grad_norm": 0.50390625, "learning_rate": 0.0004996598504146256, "loss": 3.0483, "num_input_tokens_seen": 45875200, "step": 700, "train_runtime": 23645.2585, "train_tokens_per_second": 1940.144 }, { "epoch": 0.1875082530040935, "grad_norm": 0.51171875, "learning_rate": 0.0004996314049603798, "loss": 3.0258, "num_input_tokens_seen": 46530560, "step": 710, "train_runtime": 23966.8706, "train_tokens_per_second": 1941.453 }, { "epoch": 0.1901492143140103, "grad_norm": 0.490234375, "learning_rate": 0.0004996018182638778, "loss": 3.0474, "num_input_tokens_seen": 47185920, "step": 720, "train_runtime": 24284.225, "train_tokens_per_second": 1943.069 }, { "epoch": 0.1927901756239271, "grad_norm": 0.466796875, "learning_rate": 0.000499571090460381, "loss": 3.0208, "num_input_tokens_seen": 47841280, "step": 730, "train_runtime": 24599.8854, "train_tokens_per_second": 1944.777 }, { "epoch": 0.19543113693384392, "grad_norm": 0.4765625, "learning_rate": 0.0004995392216903683, "loss": 3.0995, "num_input_tokens_seen": 48496640, "step": 740, "train_runtime": 24916.0996, "train_tokens_per_second": 1946.398 }, { "epoch": 0.19807209824376074, "grad_norm": 0.59375, "learning_rate": 0.0004995062120995344, "loss": 2.9743, "num_input_tokens_seen": 49152000, "step": 750, "train_runtime": 25232.3577, "train_tokens_per_second": 1947.975 }, { "epoch": 0.20071305955367755, "grad_norm": 0.490234375, "learning_rate": 0.0004994720618387896, "loss": 3.04, "num_input_tokens_seen": 49807360, "step": 760, "train_runtime": 25549.3423, "train_tokens_per_second": 1949.458 }, { "epoch": 0.20335402086359436, "grad_norm": 0.57421875, "learning_rate": 0.0004994367710642587, "loss": 2.9516, "num_input_tokens_seen": 50462720, "step": 770, "train_runtime": 25865.1856, "train_tokens_per_second": 1950.99 }, { "epoch": 0.20599498217351117, "grad_norm": 0.51171875, "learning_rate": 0.0004994003399372812, "loss": 3.0103, "num_input_tokens_seen": 51118080, "step": 780, "train_runtime": 26180.2975, "train_tokens_per_second": 1952.54 }, { "epoch": 0.20863594348342795, "grad_norm": 0.490234375, "learning_rate": 0.0004993627686244094, "loss": 3.0131, "num_input_tokens_seen": 51773440, "step": 790, "train_runtime": 26496.843, "train_tokens_per_second": 1953.947 }, { "epoch": 0.21127690479334477, "grad_norm": 0.462890625, "learning_rate": 0.0004993240572974086, "loss": 3.031, "num_input_tokens_seen": 52428800, "step": 800, "train_runtime": 26813.6464, "train_tokens_per_second": 1955.303 }, { "epoch": 0.21391786610326158, "grad_norm": 0.470703125, "learning_rate": 0.0004992842061332557, "loss": 2.9113, "num_input_tokens_seen": 53084160, "step": 810, "train_runtime": 27130.06, "train_tokens_per_second": 1956.655 }, { "epoch": 0.2165588274131784, "grad_norm": 0.484375, "learning_rate": 0.0004992432153141385, "loss": 2.9847, "num_input_tokens_seen": 53739520, "step": 820, "train_runtime": 27448.1176, "train_tokens_per_second": 1957.858 }, { "epoch": 0.2191997887230952, "grad_norm": 0.455078125, "learning_rate": 0.0004992010850274552, "loss": 2.9724, "num_input_tokens_seen": 54394880, "step": 830, "train_runtime": 27762.5366, "train_tokens_per_second": 1959.291 }, { "epoch": 0.221840750033012, "grad_norm": 0.46484375, "learning_rate": 0.0004991578154658133, "loss": 3.0043, "num_input_tokens_seen": 55050240, "step": 840, "train_runtime": 28069.7859, "train_tokens_per_second": 1961.192 }, { "epoch": 0.22448171134292882, "grad_norm": 0.443359375, "learning_rate": 0.0004991134068270287, "loss": 2.9317, "num_input_tokens_seen": 55705600, "step": 850, "train_runtime": 28319.1428, "train_tokens_per_second": 1967.065 }, { "epoch": 0.22712267265284564, "grad_norm": 0.49609375, "learning_rate": 0.0004990678593141246, "loss": 2.9107, "num_input_tokens_seen": 56360960, "step": 860, "train_runtime": 28526.2213, "train_tokens_per_second": 1975.76 }, { "epoch": 0.22976363396276245, "grad_norm": 0.55859375, "learning_rate": 0.0004990211731353312, "loss": 2.9912, "num_input_tokens_seen": 57016320, "step": 870, "train_runtime": 28732.8072, "train_tokens_per_second": 1984.363 }, { "epoch": 0.23240459527267926, "grad_norm": 0.48046875, "learning_rate": 0.000498973348504084, "loss": 2.965, "num_input_tokens_seen": 57671680, "step": 880, "train_runtime": 28942.9681, "train_tokens_per_second": 1992.597 }, { "epoch": 0.23504555658259607, "grad_norm": 0.4921875, "learning_rate": 0.0004989243856390233, "loss": 3.0581, "num_input_tokens_seen": 58327040, "step": 890, "train_runtime": 29154.1158, "train_tokens_per_second": 2000.645 }, { "epoch": 0.23768651789251288, "grad_norm": 0.447265625, "learning_rate": 0.0004988742847639932, "loss": 2.9547, "num_input_tokens_seen": 58982400, "step": 900, "train_runtime": 29364.6284, "train_tokens_per_second": 2008.621 }, { "epoch": 0.2403274792024297, "grad_norm": 0.49609375, "learning_rate": 0.0004988230461080403, "loss": 2.9386, "num_input_tokens_seen": 59637760, "step": 910, "train_runtime": 29574.7867, "train_tokens_per_second": 2016.507 }, { "epoch": 0.2429684405123465, "grad_norm": 0.51171875, "learning_rate": 0.0004987706699054129, "loss": 2.9689, "num_input_tokens_seen": 60293120, "step": 920, "train_runtime": 29786.007, "train_tokens_per_second": 2024.21 }, { "epoch": 0.2456094018222633, "grad_norm": 0.50390625, "learning_rate": 0.0004987171563955597, "loss": 2.9408, "num_input_tokens_seen": 60948480, "step": 930, "train_runtime": 29996.8903, "train_tokens_per_second": 2031.827 }, { "epoch": 0.2482503631321801, "grad_norm": 0.5078125, "learning_rate": 0.0004986625058231289, "loss": 2.9308, "num_input_tokens_seen": 61603840, "step": 940, "train_runtime": 30207.0952, "train_tokens_per_second": 2039.383 }, { "epoch": 0.25089132444209694, "grad_norm": 0.455078125, "learning_rate": 0.0004986067184379673, "loss": 2.9415, "num_input_tokens_seen": 62259200, "step": 950, "train_runtime": 30418.1436, "train_tokens_per_second": 2046.778 }, { "epoch": 0.25353228575201375, "grad_norm": 0.447265625, "learning_rate": 0.0004985497944951182, "loss": 2.8731, "num_input_tokens_seen": 62914560, "step": 960, "train_runtime": 30627.5422, "train_tokens_per_second": 2054.182 }, { "epoch": 0.25617324706193056, "grad_norm": 0.431640625, "learning_rate": 0.0004984917342548217, "loss": 2.9454, "num_input_tokens_seen": 63569920, "step": 970, "train_runtime": 30837.6788, "train_tokens_per_second": 2061.437 }, { "epoch": 0.2588142083718474, "grad_norm": 0.431640625, "learning_rate": 0.000498432537982512, "loss": 2.9318, "num_input_tokens_seen": 64225280, "step": 980, "train_runtime": 31048.345, "train_tokens_per_second": 2068.557 }, { "epoch": 0.2614551696817642, "grad_norm": 0.4375, "learning_rate": 0.0004983722059488176, "loss": 2.885, "num_input_tokens_seen": 64880640, "step": 990, "train_runtime": 31259.0861, "train_tokens_per_second": 2075.577 }, { "epoch": 0.264096130991681, "grad_norm": 0.46484375, "learning_rate": 0.0004983107384295588, "loss": 2.9069, "num_input_tokens_seen": 65536000, "step": 1000, "train_runtime": 31469.6335, "train_tokens_per_second": 2082.516 }, { "epoch": 0.26673709230159776, "grad_norm": 0.435546875, "learning_rate": 0.0004982481357057474, "loss": 2.942, "num_input_tokens_seen": 66191360, "step": 1010, "train_runtime": 31685.0895, "train_tokens_per_second": 2089.038 }, { "epoch": 0.26937805361151457, "grad_norm": 0.416015625, "learning_rate": 0.0004981843980635846, "loss": 2.7781, "num_input_tokens_seen": 66846720, "step": 1020, "train_runtime": 31889.6356, "train_tokens_per_second": 2096.19 }, { "epoch": 0.2720190149214314, "grad_norm": 0.5390625, "learning_rate": 0.0004981195257944607, "loss": 2.9807, "num_input_tokens_seen": 67502080, "step": 1030, "train_runtime": 32099.7982, "train_tokens_per_second": 2102.882 }, { "epoch": 0.2746599762313482, "grad_norm": 0.458984375, "learning_rate": 0.0004980535191949528, "loss": 2.9739, "num_input_tokens_seen": 68157440, "step": 1040, "train_runtime": 32310.6244, "train_tokens_per_second": 2109.444 }, { "epoch": 0.277300937541265, "grad_norm": 0.5859375, "learning_rate": 0.0004979863785668237, "loss": 2.8292, "num_input_tokens_seen": 68812800, "step": 1050, "train_runtime": 32520.9509, "train_tokens_per_second": 2115.953 }, { "epoch": 0.2799418988511818, "grad_norm": 0.4765625, "learning_rate": 0.000497918104217021, "loss": 2.9008, "num_input_tokens_seen": 69468160, "step": 1060, "train_runtime": 32730.4591, "train_tokens_per_second": 2122.432 }, { "epoch": 0.2825828601610986, "grad_norm": 0.42578125, "learning_rate": 0.0004978486964576752, "loss": 2.8412, "num_input_tokens_seen": 70123520, "step": 1070, "train_runtime": 32939.7476, "train_tokens_per_second": 2128.842 }, { "epoch": 0.28522382147101544, "grad_norm": 0.41796875, "learning_rate": 0.0004977781556060984, "loss": 2.9267, "num_input_tokens_seen": 70778880, "step": 1080, "train_runtime": 33149.2502, "train_tokens_per_second": 2135.158 }, { "epoch": 0.28786478278093225, "grad_norm": 0.416015625, "learning_rate": 0.0004977064819847828, "loss": 2.8814, "num_input_tokens_seen": 71434240, "step": 1090, "train_runtime": 33359.5416, "train_tokens_per_second": 2141.344 }, { "epoch": 0.29050574409084906, "grad_norm": 0.42578125, "learning_rate": 0.0004976336759213994, "loss": 2.8877, "num_input_tokens_seen": 72089600, "step": 1100, "train_runtime": 33569.673, "train_tokens_per_second": 2147.462 }, { "epoch": 0.2931467054007659, "grad_norm": 0.470703125, "learning_rate": 0.0004975597377487965, "loss": 2.8939, "num_input_tokens_seen": 72744960, "step": 1110, "train_runtime": 33779.1378, "train_tokens_per_second": 2153.547 }, { "epoch": 0.2957876667106827, "grad_norm": 0.41015625, "learning_rate": 0.0004974846678049977, "loss": 2.9035, "num_input_tokens_seen": 73400320, "step": 1120, "train_runtime": 33988.6643, "train_tokens_per_second": 2159.553 }, { "epoch": 0.2984286280205995, "grad_norm": 0.423828125, "learning_rate": 0.0004974084664332012, "loss": 2.8358, "num_input_tokens_seen": 74055680, "step": 1130, "train_runtime": 34197.6484, "train_tokens_per_second": 2165.52 }, { "epoch": 0.3010695893305163, "grad_norm": 0.4140625, "learning_rate": 0.0004973311339817774, "loss": 2.8287, "num_input_tokens_seen": 74711040, "step": 1140, "train_runtime": 34407.5712, "train_tokens_per_second": 2171.355 }, { "epoch": 0.3037105506404331, "grad_norm": 0.494140625, "learning_rate": 0.0004972526708042678, "loss": 2.8682, "num_input_tokens_seen": 75366400, "step": 1150, "train_runtime": 34617.1799, "train_tokens_per_second": 2177.139 }, { "epoch": 0.30635151195034993, "grad_norm": 0.455078125, "learning_rate": 0.0004971730772593834, "loss": 2.9179, "num_input_tokens_seen": 76021760, "step": 1160, "train_runtime": 34826.9259, "train_tokens_per_second": 2182.844 }, { "epoch": 0.30899247326026674, "grad_norm": 0.45703125, "learning_rate": 0.0004970923537110026, "loss": 2.8583, "num_input_tokens_seen": 76677120, "step": 1170, "train_runtime": 35036.9229, "train_tokens_per_second": 2188.466 }, { "epoch": 0.31163343457018355, "grad_norm": 0.451171875, "learning_rate": 0.00049701050052817, "loss": 2.8784, "num_input_tokens_seen": 77332480, "step": 1180, "train_runtime": 35246.7906, "train_tokens_per_second": 2194.029 }, { "epoch": 0.31427439588010037, "grad_norm": 0.396484375, "learning_rate": 0.0004969275180850948, "loss": 2.8666, "num_input_tokens_seen": 77987840, "step": 1190, "train_runtime": 35455.8558, "train_tokens_per_second": 2199.576 }, { "epoch": 0.3169153571900172, "grad_norm": 0.390625, "learning_rate": 0.0004968434067611483, "loss": 2.7926, "num_input_tokens_seen": 78643200, "step": 1200, "train_runtime": 35665.5735, "train_tokens_per_second": 2205.017 }, { "epoch": 0.319556318499934, "grad_norm": 0.482421875, "learning_rate": 0.0004967581669408632, "loss": 2.8923, "num_input_tokens_seen": 79298560, "step": 1210, "train_runtime": 35874.4887, "train_tokens_per_second": 2210.444 }, { "epoch": 0.3221972798098508, "grad_norm": 0.62890625, "learning_rate": 0.0004966717990139313, "loss": 2.8482, "num_input_tokens_seen": 79953920, "step": 1220, "train_runtime": 36083.0476, "train_tokens_per_second": 2215.831 }, { "epoch": 0.3248382411197676, "grad_norm": 0.412109375, "learning_rate": 0.0004965843033752015, "loss": 2.8508, "num_input_tokens_seen": 80609280, "step": 1230, "train_runtime": 36292.5082, "train_tokens_per_second": 2221.1 }, { "epoch": 0.3274792024296844, "grad_norm": 0.41015625, "learning_rate": 0.0004964956804246784, "loss": 2.9238, "num_input_tokens_seen": 81264640, "step": 1240, "train_runtime": 36502.8026, "train_tokens_per_second": 2226.258 }, { "epoch": 0.33012016373960124, "grad_norm": 0.421875, "learning_rate": 0.0004964059305675205, "loss": 2.8232, "num_input_tokens_seen": 81920000, "step": 1250, "train_runtime": 36711.7136, "train_tokens_per_second": 2231.44 }, { "epoch": 0.33276112504951805, "grad_norm": 0.392578125, "learning_rate": 0.0004963150542140381, "loss": 2.8759, "num_input_tokens_seen": 82575360, "step": 1260, "train_runtime": 36921.2047, "train_tokens_per_second": 2236.529 }, { "epoch": 0.33540208635943486, "grad_norm": 0.39453125, "learning_rate": 0.0004962230517796915, "loss": 2.8602, "num_input_tokens_seen": 83230720, "step": 1270, "train_runtime": 37131.0352, "train_tokens_per_second": 2241.541 }, { "epoch": 0.33804304766935167, "grad_norm": 0.380859375, "learning_rate": 0.0004961299236850889, "loss": 2.7801, "num_input_tokens_seen": 83886080, "step": 1280, "train_runtime": 37340.6942, "train_tokens_per_second": 2246.506 }, { "epoch": 0.3406840089792684, "grad_norm": 0.40234375, "learning_rate": 0.000496035670355985, "loss": 2.8335, "num_input_tokens_seen": 84541440, "step": 1290, "train_runtime": 37550.6167, "train_tokens_per_second": 2251.4 }, { "epoch": 0.34332497028918524, "grad_norm": 0.41796875, "learning_rate": 0.0004959402922232788, "loss": 2.7464, "num_input_tokens_seen": 85196800, "step": 1300, "train_runtime": 37760.9191, "train_tokens_per_second": 2256.216 }, { "epoch": 0.34596593159910205, "grad_norm": 0.392578125, "learning_rate": 0.0004958437897230112, "loss": 2.7879, "num_input_tokens_seen": 85852160, "step": 1310, "train_runtime": 37970.4444, "train_tokens_per_second": 2261.026 }, { "epoch": 0.34860689290901886, "grad_norm": 0.416015625, "learning_rate": 0.0004957461632963637, "loss": 2.8249, "num_input_tokens_seen": 86507520, "step": 1320, "train_runtime": 38180.3322, "train_tokens_per_second": 2265.761 }, { "epoch": 0.3512478542189357, "grad_norm": 0.37890625, "learning_rate": 0.0004956474133896558, "loss": 2.8435, "num_input_tokens_seen": 87162880, "step": 1330, "train_runtime": 38390.9946, "train_tokens_per_second": 2270.399 }, { "epoch": 0.3538888155288525, "grad_norm": 0.365234375, "learning_rate": 0.0004955475404543436, "loss": 2.858, "num_input_tokens_seen": 87818240, "step": 1340, "train_runtime": 38600.5942, "train_tokens_per_second": 2275.049 }, { "epoch": 0.3565297768387693, "grad_norm": 0.39453125, "learning_rate": 0.0004954465449470172, "loss": 2.7555, "num_input_tokens_seen": 88473600, "step": 1350, "train_runtime": 38810.0092, "train_tokens_per_second": 2279.659 }, { "epoch": 0.3591707381486861, "grad_norm": 0.380859375, "learning_rate": 0.0004953444273293983, "loss": 2.785, "num_input_tokens_seen": 89128960, "step": 1360, "train_runtime": 39020.2344, "train_tokens_per_second": 2284.173 }, { "epoch": 0.3618116994586029, "grad_norm": 0.390625, "learning_rate": 0.0004952411880683394, "loss": 2.8254, "num_input_tokens_seen": 89784320, "step": 1370, "train_runtime": 39228.1232, "train_tokens_per_second": 2288.774 }, { "epoch": 0.36445266076851973, "grad_norm": 0.380859375, "learning_rate": 0.0004951368276358201, "loss": 2.7833, "num_input_tokens_seen": 90439680, "step": 1380, "train_runtime": 39436.5908, "train_tokens_per_second": 2293.294 }, { "epoch": 0.36709362207843654, "grad_norm": 0.416015625, "learning_rate": 0.0004950313465089464, "loss": 2.7848, "num_input_tokens_seen": 91095040, "step": 1390, "train_runtime": 39645.6075, "train_tokens_per_second": 2297.733 }, { "epoch": 0.36973458338835336, "grad_norm": 0.3828125, "learning_rate": 0.0004949247451699468, "loss": 2.7894, "num_input_tokens_seen": 91750400, "step": 1400, "train_runtime": 39855.598, "train_tokens_per_second": 2302.071 }, { "epoch": 0.37237554469827017, "grad_norm": 0.412109375, "learning_rate": 0.0004948170241061721, "loss": 2.8428, "num_input_tokens_seen": 92405760, "step": 1410, "train_runtime": 40065.6311, "train_tokens_per_second": 2306.36 }, { "epoch": 0.375016506008187, "grad_norm": 0.361328125, "learning_rate": 0.0004947081838100916, "loss": 2.7368, "num_input_tokens_seen": 93061120, "step": 1420, "train_runtime": 40275.3038, "train_tokens_per_second": 2310.625 }, { "epoch": 0.3776574673181038, "grad_norm": 0.40625, "learning_rate": 0.0004945982247792913, "loss": 2.8151, "num_input_tokens_seen": 93716480, "step": 1430, "train_runtime": 40484.8888, "train_tokens_per_second": 2314.851 }, { "epoch": 0.3802984286280206, "grad_norm": 0.373046875, "learning_rate": 0.0004944871475164722, "loss": 2.7474, "num_input_tokens_seen": 94371840, "step": 1440, "train_runtime": 40694.6037, "train_tokens_per_second": 2319.026 }, { "epoch": 0.3829393899379374, "grad_norm": 0.412109375, "learning_rate": 0.0004943749525294471, "loss": 2.8309, "num_input_tokens_seen": 95027200, "step": 1450, "train_runtime": 40904.0292, "train_tokens_per_second": 2323.175 }, { "epoch": 0.3855803512478542, "grad_norm": 0.37109375, "learning_rate": 0.0004942616403311391, "loss": 2.7217, "num_input_tokens_seen": 95682560, "step": 1460, "train_runtime": 41113.147, "train_tokens_per_second": 2327.298 }, { "epoch": 0.38822131255777104, "grad_norm": 0.369140625, "learning_rate": 0.0004941472114395784, "loss": 2.7462, "num_input_tokens_seen": 96337920, "step": 1470, "train_runtime": 41322.714, "train_tokens_per_second": 2331.355 }, { "epoch": 0.39086227386768785, "grad_norm": 0.41796875, "learning_rate": 0.0004940316663779008, "loss": 2.8393, "num_input_tokens_seen": 96993280, "step": 1480, "train_runtime": 41531.818, "train_tokens_per_second": 2335.397 }, { "epoch": 0.39350323517760466, "grad_norm": 0.404296875, "learning_rate": 0.0004939150056743446, "loss": 2.8199, "num_input_tokens_seen": 97648640, "step": 1490, "train_runtime": 41740.5354, "train_tokens_per_second": 2339.42 }, { "epoch": 0.39614419648752147, "grad_norm": 0.466796875, "learning_rate": 0.000493797229862249, "loss": 2.8282, "num_input_tokens_seen": 98304000, "step": 1500, "train_runtime": 41949.9329, "train_tokens_per_second": 2343.365 }, { "epoch": 0.3987851577974383, "grad_norm": 0.384765625, "learning_rate": 0.0004936783394800504, "loss": 2.8253, "num_input_tokens_seen": 98959360, "step": 1510, "train_runtime": 42165.4456, "train_tokens_per_second": 2346.93 }, { "epoch": 0.4014261191073551, "grad_norm": 0.369140625, "learning_rate": 0.0004935583350712812, "loss": 2.7516, "num_input_tokens_seen": 99614720, "step": 1520, "train_runtime": 42365.9689, "train_tokens_per_second": 2351.291 }, { "epoch": 0.4040670804172719, "grad_norm": 0.365234375, "learning_rate": 0.0004934372171845667, "loss": 2.7645, "num_input_tokens_seen": 100270080, "step": 1530, "train_runtime": 42572.3174, "train_tokens_per_second": 2355.288 }, { "epoch": 0.4067080417271887, "grad_norm": 0.373046875, "learning_rate": 0.0004933149863736228, "loss": 2.7305, "num_input_tokens_seen": 100925440, "step": 1540, "train_runtime": 42778.6279, "train_tokens_per_second": 2359.249 }, { "epoch": 0.40934900303710553, "grad_norm": 0.361328125, "learning_rate": 0.0004931916431972531, "loss": 2.7933, "num_input_tokens_seen": 101580800, "step": 1550, "train_runtime": 42985.0685, "train_tokens_per_second": 2363.165 }, { "epoch": 0.41198996434702234, "grad_norm": 0.38671875, "learning_rate": 0.0004930671882193468, "loss": 2.7548, "num_input_tokens_seen": 102236160, "step": 1560, "train_runtime": 43191.7226, "train_tokens_per_second": 2367.031 }, { "epoch": 0.41463092565693915, "grad_norm": 0.43359375, "learning_rate": 0.000492941622008876, "loss": 2.7387, "num_input_tokens_seen": 102891520, "step": 1570, "train_runtime": 43397.9618, "train_tokens_per_second": 2370.884 }, { "epoch": 0.4172718869668559, "grad_norm": 0.37890625, "learning_rate": 0.000492814945139893, "loss": 2.7517, "num_input_tokens_seen": 103546880, "step": 1580, "train_runtime": 43603.6755, "train_tokens_per_second": 2374.728 }, { "epoch": 0.4199128482767727, "grad_norm": 0.369140625, "learning_rate": 0.0004926871581915273, "loss": 2.6632, "num_input_tokens_seen": 104202240, "step": 1590, "train_runtime": 43810.0188, "train_tokens_per_second": 2378.503 }, { "epoch": 0.42255380958668953, "grad_norm": 0.3984375, "learning_rate": 0.000492558261747984, "loss": 2.7857, "num_input_tokens_seen": 104857600, "step": 1600, "train_runtime": 44016.6706, "train_tokens_per_second": 2382.225 }, { "epoch": 0.42519477089660634, "grad_norm": 0.361328125, "learning_rate": 0.00049242825639854, "loss": 2.715, "num_input_tokens_seen": 105512960, "step": 1610, "train_runtime": 44223.1041, "train_tokens_per_second": 2385.924 }, { "epoch": 0.42783573220652316, "grad_norm": 0.37890625, "learning_rate": 0.0004922971427375422, "loss": 2.7376, "num_input_tokens_seen": 106168320, "step": 1620, "train_runtime": 44429.7495, "train_tokens_per_second": 2389.577 }, { "epoch": 0.43047669351643997, "grad_norm": 0.37890625, "learning_rate": 0.0004921649213644041, "loss": 2.7056, "num_input_tokens_seen": 106823680, "step": 1630, "train_runtime": 44636.392, "train_tokens_per_second": 2393.197 }, { "epoch": 0.4331176548263568, "grad_norm": 0.35546875, "learning_rate": 0.0004920315928836032, "loss": 2.749, "num_input_tokens_seen": 107479040, "step": 1640, "train_runtime": 44843.2607, "train_tokens_per_second": 2396.771 }, { "epoch": 0.4357586161362736, "grad_norm": 0.34765625, "learning_rate": 0.0004918971579046788, "loss": 2.7726, "num_input_tokens_seen": 108134400, "step": 1650, "train_runtime": 45049.4992, "train_tokens_per_second": 2400.346 }, { "epoch": 0.4383995774461904, "grad_norm": 0.345703125, "learning_rate": 0.0004917616170422286, "loss": 2.7144, "num_input_tokens_seen": 108789760, "step": 1660, "train_runtime": 45256.0425, "train_tokens_per_second": 2403.873 }, { "epoch": 0.4410405387561072, "grad_norm": 0.39453125, "learning_rate": 0.0004916249709159057, "loss": 2.7613, "num_input_tokens_seen": 109445120, "step": 1670, "train_runtime": 45463.2858, "train_tokens_per_second": 2407.33 }, { "epoch": 0.443681500066024, "grad_norm": 0.44140625, "learning_rate": 0.0004914872201504169, "loss": 2.7181, "num_input_tokens_seen": 110100480, "step": 1680, "train_runtime": 45669.7593, "train_tokens_per_second": 2410.796 }, { "epoch": 0.44632246137594084, "grad_norm": 0.35546875, "learning_rate": 0.0004913483653755184, "loss": 2.7729, "num_input_tokens_seen": 110755840, "step": 1690, "train_runtime": 45876.0618, "train_tokens_per_second": 2414.24 }, { "epoch": 0.44896342268585765, "grad_norm": 0.353515625, "learning_rate": 0.0004912084072260141, "loss": 2.6527, "num_input_tokens_seen": 111411200, "step": 1700, "train_runtime": 46082.1107, "train_tokens_per_second": 2417.667 }, { "epoch": 0.45160438399577446, "grad_norm": 0.33984375, "learning_rate": 0.0004910673463417519, "loss": 2.6773, "num_input_tokens_seen": 112066560, "step": 1710, "train_runtime": 46288.4468, "train_tokens_per_second": 2421.048 }, { "epoch": 0.4542453453056913, "grad_norm": 0.412109375, "learning_rate": 0.0004909251833676211, "loss": 2.6628, "num_input_tokens_seen": 112721920, "step": 1720, "train_runtime": 46494.9697, "train_tokens_per_second": 2424.39 }, { "epoch": 0.4568863066156081, "grad_norm": 0.353515625, "learning_rate": 0.0004907819189535496, "loss": 2.7297, "num_input_tokens_seen": 113377280, "step": 1730, "train_runtime": 46700.8195, "train_tokens_per_second": 2427.736 }, { "epoch": 0.4595272679255249, "grad_norm": 0.359375, "learning_rate": 0.0004906375537545006, "loss": 2.7085, "num_input_tokens_seen": 114032640, "step": 1740, "train_runtime": 46907.3536, "train_tokens_per_second": 2431.018 }, { "epoch": 0.4621682292354417, "grad_norm": 0.349609375, "learning_rate": 0.00049049208843047, "loss": 2.6987, "num_input_tokens_seen": 114688000, "step": 1750, "train_runtime": 47113.9027, "train_tokens_per_second": 2434.271 }, { "epoch": 0.4648091905453585, "grad_norm": 0.3515625, "learning_rate": 0.0004903455236464828, "loss": 2.7408, "num_input_tokens_seen": 115343360, "step": 1760, "train_runtime": 47320.3422, "train_tokens_per_second": 2437.501 }, { "epoch": 0.46745015185527533, "grad_norm": 0.40625, "learning_rate": 0.0004901978600725909, "loss": 2.6906, "num_input_tokens_seen": 115998720, "step": 1770, "train_runtime": 47526.7886, "train_tokens_per_second": 2440.702 }, { "epoch": 0.47009111316519214, "grad_norm": 0.345703125, "learning_rate": 0.000490049098383869, "loss": 2.7082, "num_input_tokens_seen": 116654080, "step": 1780, "train_runtime": 47733.0043, "train_tokens_per_second": 2443.887 }, { "epoch": 0.47273207447510895, "grad_norm": 0.40234375, "learning_rate": 0.0004898992392604124, "loss": 2.751, "num_input_tokens_seen": 117309440, "step": 1790, "train_runtime": 47939.0501, "train_tokens_per_second": 2447.054 }, { "epoch": 0.47537303578502577, "grad_norm": 0.35546875, "learning_rate": 0.0004897482833873334, "loss": 2.687, "num_input_tokens_seen": 117964800, "step": 1800, "train_runtime": 48145.2663, "train_tokens_per_second": 2450.185 }, { "epoch": 0.4780139970949426, "grad_norm": 0.34375, "learning_rate": 0.0004895962314547585, "loss": 2.7141, "num_input_tokens_seen": 118620160, "step": 1810, "train_runtime": 48351.9308, "train_tokens_per_second": 2453.266 }, { "epoch": 0.4806549584048594, "grad_norm": 0.44921875, "learning_rate": 0.0004894430841578249, "loss": 2.7107, "num_input_tokens_seen": 119275520, "step": 1820, "train_runtime": 48558.7752, "train_tokens_per_second": 2456.312 }, { "epoch": 0.4832959197147762, "grad_norm": 0.390625, "learning_rate": 0.0004892888421966776, "loss": 2.5919, "num_input_tokens_seen": 119930880, "step": 1830, "train_runtime": 48765.4151, "train_tokens_per_second": 2459.343 }, { "epoch": 0.485936881024693, "grad_norm": 0.392578125, "learning_rate": 0.000489133506276466, "loss": 2.6917, "num_input_tokens_seen": 120586240, "step": 1840, "train_runtime": 48972.0677, "train_tokens_per_second": 2462.347 }, { "epoch": 0.4885778423346098, "grad_norm": 0.384765625, "learning_rate": 0.0004889770771073407, "loss": 2.7011, "num_input_tokens_seen": 121241600, "step": 1850, "train_runtime": 49178.7023, "train_tokens_per_second": 2465.327 }, { "epoch": 0.4912188036445266, "grad_norm": 0.365234375, "learning_rate": 0.0004888195554044507, "loss": 2.6365, "num_input_tokens_seen": 121896960, "step": 1860, "train_runtime": 49385.3685, "train_tokens_per_second": 2468.281 }, { "epoch": 0.4938597649544434, "grad_norm": 0.36328125, "learning_rate": 0.0004886609418879391, "loss": 2.7245, "num_input_tokens_seen": 122552320, "step": 1870, "train_runtime": 49591.7934, "train_tokens_per_second": 2471.222 }, { "epoch": 0.4965007262643602, "grad_norm": 0.39453125, "learning_rate": 0.000488501237282941, "loss": 2.7149, "num_input_tokens_seen": 123207680, "step": 1880, "train_runtime": 49798.3338, "train_tokens_per_second": 2474.133 }, { "epoch": 0.499141687574277, "grad_norm": 0.3359375, "learning_rate": 0.0004883404423195795, "loss": 2.6651, "num_input_tokens_seen": 123863040, "step": 1890, "train_runtime": 50004.4658, "train_tokens_per_second": 2477.04 }, { "epoch": 0.5017826488841939, "grad_norm": 0.34765625, "learning_rate": 0.0004881785577329624, "loss": 2.7421, "num_input_tokens_seen": 124518400, "step": 1900, "train_runtime": 50211.1109, "train_tokens_per_second": 2479.897 }, { "epoch": 0.5044236101941106, "grad_norm": 0.349609375, "learning_rate": 0.0004880155842631789, "loss": 2.6493, "num_input_tokens_seen": 125173760, "step": 1910, "train_runtime": 50417.5524, "train_tokens_per_second": 2482.742 }, { "epoch": 0.5070645715040275, "grad_norm": 0.34765625, "learning_rate": 0.00048785152265529657, "loss": 2.702, "num_input_tokens_seen": 125829120, "step": 1920, "train_runtime": 50624.4207, "train_tokens_per_second": 2485.542 }, { "epoch": 0.5097055328139443, "grad_norm": 0.328125, "learning_rate": 0.0004876863736593572, "loss": 2.658, "num_input_tokens_seen": 126484480, "step": 1930, "train_runtime": 50831.0628, "train_tokens_per_second": 2488.33 }, { "epoch": 0.5123464941238611, "grad_norm": 0.333984375, "learning_rate": 0.0004875201380303742, "loss": 2.6901, "num_input_tokens_seen": 127139840, "step": 1940, "train_runtime": 51037.2011, "train_tokens_per_second": 2491.121 }, { "epoch": 0.5149874554337779, "grad_norm": 0.3671875, "learning_rate": 0.00048735281652832844, "loss": 2.6371, "num_input_tokens_seen": 127795200, "step": 1950, "train_runtime": 51243.725, "train_tokens_per_second": 2493.87 }, { "epoch": 0.5176284167436948, "grad_norm": 0.376953125, "learning_rate": 0.00048718440991816516, "loss": 2.7281, "num_input_tokens_seen": 128450560, "step": 1960, "train_runtime": 51450.1688, "train_tokens_per_second": 2496.601 }, { "epoch": 0.5202693780536115, "grad_norm": 0.33984375, "learning_rate": 0.0004870149189697906, "loss": 2.6286, "num_input_tokens_seen": 129105920, "step": 1970, "train_runtime": 51656.6086, "train_tokens_per_second": 2499.311 }, { "epoch": 0.5229103393635284, "grad_norm": 0.361328125, "learning_rate": 0.0004868443444580681, "loss": 2.6763, "num_input_tokens_seen": 129761280, "step": 1980, "train_runtime": 51863.4736, "train_tokens_per_second": 2501.978 }, { "epoch": 0.5255513006734451, "grad_norm": 0.3515625, "learning_rate": 0.0004866726871628147, "loss": 2.6689, "num_input_tokens_seen": 130416640, "step": 1990, "train_runtime": 52068.9916, "train_tokens_per_second": 2504.689 }, { "epoch": 0.528192261983362, "grad_norm": 0.314453125, "learning_rate": 0.00048649994786879777, "loss": 2.7113, "num_input_tokens_seen": 131072000, "step": 2000, "train_runtime": 52276.5627, "train_tokens_per_second": 2507.28 }, { "epoch": 0.5308332232932788, "grad_norm": 0.359375, "learning_rate": 0.0004863261273657311, "loss": 2.7491, "num_input_tokens_seen": 131727360, "step": 2010, "train_runtime": 52495.0703, "train_tokens_per_second": 2509.328 }, { "epoch": 0.5334741846031955, "grad_norm": 0.337890625, "learning_rate": 0.0004861512264482716, "loss": 2.6846, "num_input_tokens_seen": 132382720, "step": 2020, "train_runtime": 52696.2556, "train_tokens_per_second": 2512.185 }, { "epoch": 0.5361151459131124, "grad_norm": 0.3515625, "learning_rate": 0.0004859752459160154, "loss": 2.7133, "num_input_tokens_seen": 133038080, "step": 2030, "train_runtime": 52897.1759, "train_tokens_per_second": 2515.032 }, { "epoch": 0.5387561072230291, "grad_norm": 0.326171875, "learning_rate": 0.0004857981865734943, "loss": 2.7025, "num_input_tokens_seen": 133693440, "step": 2040, "train_runtime": 53098.6928, "train_tokens_per_second": 2517.829 }, { "epoch": 0.541397068532946, "grad_norm": 0.32421875, "learning_rate": 0.0004856200492301723, "loss": 2.7283, "num_input_tokens_seen": 134348800, "step": 2050, "train_runtime": 53300.3491, "train_tokens_per_second": 2520.599 }, { "epoch": 0.5440380298428628, "grad_norm": 0.34375, "learning_rate": 0.00048544083470044154, "loss": 2.6277, "num_input_tokens_seen": 135004160, "step": 2060, "train_runtime": 53506.3955, "train_tokens_per_second": 2523.141 }, { "epoch": 0.5466789911527796, "grad_norm": 0.34375, "learning_rate": 0.0004852605438036187, "loss": 2.6809, "num_input_tokens_seen": 135659520, "step": 2070, "train_runtime": 53713.2347, "train_tokens_per_second": 2525.626 }, { "epoch": 0.5493199524626964, "grad_norm": 0.3828125, "learning_rate": 0.00048507917736394154, "loss": 2.7295, "num_input_tokens_seen": 136314880, "step": 2080, "train_runtime": 53919.8743, "train_tokens_per_second": 2528.101 }, { "epoch": 0.5519609137726132, "grad_norm": 0.3671875, "learning_rate": 0.0004848967362105646, "loss": 2.7353, "num_input_tokens_seen": 136970240, "step": 2090, "train_runtime": 54126.737, "train_tokens_per_second": 2530.547 }, { "epoch": 0.55460187508253, "grad_norm": 0.33203125, "learning_rate": 0.00048471322117755577, "loss": 2.5811, "num_input_tokens_seen": 137625600, "step": 2100, "train_runtime": 54333.5849, "train_tokens_per_second": 2532.975 }, { "epoch": 0.5572428363924469, "grad_norm": 0.318359375, "learning_rate": 0.0004845286331038927, "loss": 2.6682, "num_input_tokens_seen": 138280960, "step": 2110, "train_runtime": 54600.3269, "train_tokens_per_second": 2532.603 }, { "epoch": 0.5598837977023636, "grad_norm": 0.341796875, "learning_rate": 0.0004843429728334582, "loss": 2.6917, "num_input_tokens_seen": 138936320, "step": 2120, "train_runtime": 54898.5911, "train_tokens_per_second": 2530.781 }, { "epoch": 0.5625247590122805, "grad_norm": 0.333984375, "learning_rate": 0.0004841562412150372, "loss": 2.6355, "num_input_tokens_seen": 139591680, "step": 2130, "train_runtime": 55206.542, "train_tokens_per_second": 2528.535 }, { "epoch": 0.5651657203221973, "grad_norm": 0.32421875, "learning_rate": 0.0004839684391023124, "loss": 2.6901, "num_input_tokens_seen": 140247040, "step": 2140, "train_runtime": 55513.4285, "train_tokens_per_second": 2526.362 }, { "epoch": 0.5678066816321141, "grad_norm": 0.34375, "learning_rate": 0.00048377956735386044, "loss": 2.7062, "num_input_tokens_seen": 140902400, "step": 2150, "train_runtime": 55825.6226, "train_tokens_per_second": 2523.974 }, { "epoch": 0.5704476429420309, "grad_norm": 0.357421875, "learning_rate": 0.00048358962683314803, "loss": 2.666, "num_input_tokens_seen": 141557760, "step": 2160, "train_runtime": 56107.6676, "train_tokens_per_second": 2522.966 }, { "epoch": 0.5730886042519477, "grad_norm": 0.33984375, "learning_rate": 0.0004833986184085283, "loss": 2.6364, "num_input_tokens_seen": 142213120, "step": 2170, "train_runtime": 56384.4245, "train_tokens_per_second": 2522.206 }, { "epoch": 0.5757295655618645, "grad_norm": 0.326171875, "learning_rate": 0.00048320654295323594, "loss": 2.6361, "num_input_tokens_seen": 142868480, "step": 2180, "train_runtime": 56627.8315, "train_tokens_per_second": 2522.938 }, { "epoch": 0.5783705268717814, "grad_norm": 0.35546875, "learning_rate": 0.0004830134013453844, "loss": 2.6673, "num_input_tokens_seen": 143523840, "step": 2190, "train_runtime": 56936.1007, "train_tokens_per_second": 2520.788 }, { "epoch": 0.5810114881816981, "grad_norm": 0.337890625, "learning_rate": 0.00048281919446796083, "loss": 2.6553, "num_input_tokens_seen": 144179200, "step": 2200, "train_runtime": 57250.9944, "train_tokens_per_second": 2518.37 }, { "epoch": 0.583652449491615, "grad_norm": 0.3203125, "learning_rate": 0.00048262392320882276, "loss": 2.6588, "num_input_tokens_seen": 144834560, "step": 2210, "train_runtime": 57548.402, "train_tokens_per_second": 2516.743 }, { "epoch": 0.5862934108015317, "grad_norm": 0.357421875, "learning_rate": 0.0004824275884606936, "loss": 2.635, "num_input_tokens_seen": 145489920, "step": 2220, "train_runtime": 57792.8878, "train_tokens_per_second": 2517.436 }, { "epoch": 0.5889343721114486, "grad_norm": 0.396484375, "learning_rate": 0.0004822301911211587, "loss": 2.6666, "num_input_tokens_seen": 146145280, "step": 2230, "train_runtime": 57950.0287, "train_tokens_per_second": 2521.919 }, { "epoch": 0.5915753334213654, "grad_norm": 0.357421875, "learning_rate": 0.0004820317320926615, "loss": 2.7133, "num_input_tokens_seen": 146800640, "step": 2240, "train_runtime": 58070.5268, "train_tokens_per_second": 2527.972 }, { "epoch": 0.5942162947312822, "grad_norm": 0.33984375, "learning_rate": 0.00048183221228249883, "loss": 2.5816, "num_input_tokens_seen": 147456000, "step": 2250, "train_runtime": 58181.0426, "train_tokens_per_second": 2534.434 }, { "epoch": 0.596857256041199, "grad_norm": 0.3984375, "learning_rate": 0.0004816316326028175, "loss": 2.6768, "num_input_tokens_seen": 148111360, "step": 2260, "train_runtime": 58304.3706, "train_tokens_per_second": 2540.313 }, { "epoch": 0.5994982173511159, "grad_norm": 0.41796875, "learning_rate": 0.0004814299939706094, "loss": 2.6189, "num_input_tokens_seen": 148766720, "step": 2270, "train_runtime": 58471.2938, "train_tokens_per_second": 2544.269 }, { "epoch": 0.6021391786610326, "grad_norm": 0.328125, "learning_rate": 0.0004812272973077079, "loss": 2.6704, "num_input_tokens_seen": 149422080, "step": 2280, "train_runtime": 58622.6079, "train_tokens_per_second": 2548.881 }, { "epoch": 0.6047801399709494, "grad_norm": 0.3359375, "learning_rate": 0.00048102354354078304, "loss": 2.6815, "num_input_tokens_seen": 150077440, "step": 2290, "train_runtime": 58738.1173, "train_tokens_per_second": 2555.026 }, { "epoch": 0.6074211012808662, "grad_norm": 0.341796875, "learning_rate": 0.000480818733601338, "loss": 2.6316, "num_input_tokens_seen": 150732800, "step": 2300, "train_runtime": 58855.0518, "train_tokens_per_second": 2561.085 }, { "epoch": 0.610062062590783, "grad_norm": 0.31640625, "learning_rate": 0.00048061286842570423, "loss": 2.7123, "num_input_tokens_seen": 151388160, "step": 2310, "train_runtime": 58974.3368, "train_tokens_per_second": 2567.018 }, { "epoch": 0.6127030239006999, "grad_norm": 0.36328125, "learning_rate": 0.0004804059489550376, "loss": 2.6427, "num_input_tokens_seen": 152043520, "step": 2320, "train_runtime": 59090.3855, "train_tokens_per_second": 2573.067 }, { "epoch": 0.6153439852106166, "grad_norm": 0.330078125, "learning_rate": 0.0004801979761353137, "loss": 2.6757, "num_input_tokens_seen": 152698880, "step": 2330, "train_runtime": 59206.2037, "train_tokens_per_second": 2579.103 }, { "epoch": 0.6179849465205335, "grad_norm": 0.322265625, "learning_rate": 0.000479988950917324, "loss": 2.5902, "num_input_tokens_seen": 153354240, "step": 2340, "train_runtime": 59321.7353, "train_tokens_per_second": 2585.127 }, { "epoch": 0.6206259078304502, "grad_norm": 0.427734375, "learning_rate": 0.0004797788742566709, "loss": 2.704, "num_input_tokens_seen": 154009600, "step": 2350, "train_runtime": 59436.8491, "train_tokens_per_second": 2591.147 }, { "epoch": 0.6232668691403671, "grad_norm": 0.31640625, "learning_rate": 0.00047956774711376395, "loss": 2.5874, "num_input_tokens_seen": 154664960, "step": 2360, "train_runtime": 59552.7183, "train_tokens_per_second": 2597.11 }, { "epoch": 0.6259078304502839, "grad_norm": 0.31640625, "learning_rate": 0.00047935557045381504, "loss": 2.516, "num_input_tokens_seen": 155320320, "step": 2370, "train_runtime": 59668.5711, "train_tokens_per_second": 2603.051 }, { "epoch": 0.6285487917602007, "grad_norm": 0.37890625, "learning_rate": 0.0004791423452468344, "loss": 2.6803, "num_input_tokens_seen": 155975680, "step": 2380, "train_runtime": 59784.5325, "train_tokens_per_second": 2608.964 }, { "epoch": 0.6311897530701175, "grad_norm": 0.337890625, "learning_rate": 0.0004789280724676255, "loss": 2.5819, "num_input_tokens_seen": 156631040, "step": 2390, "train_runtime": 59900.5071, "train_tokens_per_second": 2614.853 }, { "epoch": 0.6338307143800344, "grad_norm": 0.330078125, "learning_rate": 0.0004787127530957812, "loss": 2.7263, "num_input_tokens_seen": 157286400, "step": 2400, "train_runtime": 60023.3878, "train_tokens_per_second": 2620.419 }, { "epoch": 0.6364716756899511, "grad_norm": 0.33203125, "learning_rate": 0.00047849638811567943, "loss": 2.6114, "num_input_tokens_seen": 157941760, "step": 2410, "train_runtime": 60142.469, "train_tokens_per_second": 2626.127 }, { "epoch": 0.639112636999868, "grad_norm": 0.32421875, "learning_rate": 0.0004782789785164776, "loss": 2.6163, "num_input_tokens_seen": 158597120, "step": 2420, "train_runtime": 60260.8341, "train_tokens_per_second": 2631.844 }, { "epoch": 0.6417535983097847, "grad_norm": 0.3515625, "learning_rate": 0.00047806052529210966, "loss": 2.5355, "num_input_tokens_seen": 159252480, "step": 2430, "train_runtime": 60382.3435, "train_tokens_per_second": 2637.401 }, { "epoch": 0.6443945596197016, "grad_norm": 0.326171875, "learning_rate": 0.00047784102944127993, "loss": 2.6617, "num_input_tokens_seen": 159907840, "step": 2440, "train_runtime": 60503.0008, "train_tokens_per_second": 2642.974 }, { "epoch": 0.6470355209296184, "grad_norm": 0.328125, "learning_rate": 0.0004776204919674598, "loss": 2.5774, "num_input_tokens_seen": 160563200, "step": 2450, "train_runtime": 60626.6115, "train_tokens_per_second": 2648.395 }, { "epoch": 0.6496764822395352, "grad_norm": 0.33203125, "learning_rate": 0.0004773989138788826, "loss": 2.6197, "num_input_tokens_seen": 161218560, "step": 2460, "train_runtime": 60749.4747, "train_tokens_per_second": 2653.826 }, { "epoch": 0.652317443549452, "grad_norm": 0.33203125, "learning_rate": 0.00047717629618853886, "loss": 2.5963, "num_input_tokens_seen": 161873920, "step": 2470, "train_runtime": 60874.9324, "train_tokens_per_second": 2659.123 }, { "epoch": 0.6549584048593688, "grad_norm": 0.34375, "learning_rate": 0.0004769526399141721, "loss": 2.6724, "num_input_tokens_seen": 162529280, "step": 2480, "train_runtime": 60999.0162, "train_tokens_per_second": 2664.457 }, { "epoch": 0.6575993661692856, "grad_norm": 0.34375, "learning_rate": 0.0004767279460782737, "loss": 2.5581, "num_input_tokens_seen": 163184640, "step": 2490, "train_runtime": 61120.3538, "train_tokens_per_second": 2669.89 }, { "epoch": 0.6602403274792025, "grad_norm": 0.314453125, "learning_rate": 0.00047650221570807864, "loss": 2.6499, "num_input_tokens_seen": 163840000, "step": 2500, "train_runtime": 61245.0835, "train_tokens_per_second": 2675.154 }, { "epoch": 0.6628812887891192, "grad_norm": 0.357421875, "learning_rate": 0.0004762754498355606, "loss": 2.6674, "num_input_tokens_seen": 164495360, "step": 2510, "train_runtime": 61387.9006, "train_tokens_per_second": 2679.606 }, { "epoch": 0.6655222500990361, "grad_norm": 0.32421875, "learning_rate": 0.0004760476494974273, "loss": 2.6745, "num_input_tokens_seen": 165150720, "step": 2520, "train_runtime": 61511.164, "train_tokens_per_second": 2684.89 }, { "epoch": 0.6681632114089529, "grad_norm": 0.30859375, "learning_rate": 0.00047581881573511566, "loss": 2.6319, "num_input_tokens_seen": 165806080, "step": 2530, "train_runtime": 61635.1824, "train_tokens_per_second": 2690.121 }, { "epoch": 0.6708041727188697, "grad_norm": 0.349609375, "learning_rate": 0.0004755889495947872, "loss": 2.6683, "num_input_tokens_seen": 166461440, "step": 2540, "train_runtime": 61759.2578, "train_tokens_per_second": 2695.328 }, { "epoch": 0.6734451340287865, "grad_norm": 0.318359375, "learning_rate": 0.00047535805212732296, "loss": 2.5637, "num_input_tokens_seen": 167116800, "step": 2550, "train_runtime": 61882.2643, "train_tokens_per_second": 2700.561 }, { "epoch": 0.6760860953387033, "grad_norm": 0.330078125, "learning_rate": 0.00047512612438831934, "loss": 2.6466, "num_input_tokens_seen": 167772160, "step": 2560, "train_runtime": 62004.6058, "train_tokens_per_second": 2705.802 }, { "epoch": 0.6787270566486201, "grad_norm": 0.326171875, "learning_rate": 0.00047489316743808244, "loss": 2.5368, "num_input_tokens_seen": 168427520, "step": 2570, "train_runtime": 62126.926, "train_tokens_per_second": 2711.023 }, { "epoch": 0.6813680179585369, "grad_norm": 0.33984375, "learning_rate": 0.0004746591823416236, "loss": 2.6437, "num_input_tokens_seen": 169082880, "step": 2580, "train_runtime": 62250.0881, "train_tokens_per_second": 2716.187 }, { "epoch": 0.6840089792684537, "grad_norm": 0.3359375, "learning_rate": 0.0004744241701686551, "loss": 2.5841, "num_input_tokens_seen": 169738240, "step": 2590, "train_runtime": 62371.1278, "train_tokens_per_second": 2721.423 }, { "epoch": 0.6866499405783705, "grad_norm": 0.34765625, "learning_rate": 0.00047418813199358393, "loss": 2.5932, "num_input_tokens_seen": 170393600, "step": 2600, "train_runtime": 62492.6048, "train_tokens_per_second": 2726.62 }, { "epoch": 0.6892909018882873, "grad_norm": 0.349609375, "learning_rate": 0.0004739510688955082, "loss": 2.6544, "num_input_tokens_seen": 171048960, "step": 2610, "train_runtime": 62613.8962, "train_tokens_per_second": 2731.805 }, { "epoch": 0.6919318631982041, "grad_norm": 0.314453125, "learning_rate": 0.0004737129819582116, "loss": 2.6383, "num_input_tokens_seen": 171704320, "step": 2620, "train_runtime": 62734.3623, "train_tokens_per_second": 2737.006 }, { "epoch": 0.694572824508121, "grad_norm": 0.314453125, "learning_rate": 0.0004734738722701583, "loss": 2.6641, "num_input_tokens_seen": 172359680, "step": 2630, "train_runtime": 62851.6057, "train_tokens_per_second": 2742.327 }, { "epoch": 0.6972137858180377, "grad_norm": 0.326171875, "learning_rate": 0.00047323374092448836, "loss": 2.6349, "num_input_tokens_seen": 173015040, "step": 2640, "train_runtime": 62971.5987, "train_tokens_per_second": 2747.509 }, { "epoch": 0.6998547471279546, "grad_norm": 0.330078125, "learning_rate": 0.00047299258901901253, "loss": 2.6536, "num_input_tokens_seen": 173670400, "step": 2650, "train_runtime": 63090.7199, "train_tokens_per_second": 2752.709 }, { "epoch": 0.7024957084378713, "grad_norm": 0.30859375, "learning_rate": 0.0004727504176562073, "loss": 2.5464, "num_input_tokens_seen": 174325760, "step": 2660, "train_runtime": 63209.2983, "train_tokens_per_second": 2757.913 }, { "epoch": 0.7051366697477882, "grad_norm": 0.32421875, "learning_rate": 0.00047250722794320977, "loss": 2.5616, "num_input_tokens_seen": 174981120, "step": 2670, "train_runtime": 63328.2606, "train_tokens_per_second": 2763.081 }, { "epoch": 0.707777631057705, "grad_norm": 0.310546875, "learning_rate": 0.0004722630209918126, "loss": 2.6033, "num_input_tokens_seen": 175636480, "step": 2680, "train_runtime": 63447.7327, "train_tokens_per_second": 2768.207 }, { "epoch": 0.7104185923676218, "grad_norm": 0.3203125, "learning_rate": 0.000472017797918459, "loss": 2.4887, "num_input_tokens_seen": 176291840, "step": 2690, "train_runtime": 63567.9084, "train_tokens_per_second": 2773.284 }, { "epoch": 0.7130595536775386, "grad_norm": 0.318359375, "learning_rate": 0.00047177155984423776, "loss": 2.6174, "num_input_tokens_seen": 176947200, "step": 2700, "train_runtime": 63686.5563, "train_tokens_per_second": 2778.407 }, { "epoch": 0.7157005149874555, "grad_norm": 0.353515625, "learning_rate": 0.00047152430789487764, "loss": 2.6112, "num_input_tokens_seen": 177602560, "step": 2710, "train_runtime": 63807.847, "train_tokens_per_second": 2783.397 }, { "epoch": 0.7183414762973722, "grad_norm": 0.314453125, "learning_rate": 0.00047127604320074286, "loss": 2.5791, "num_input_tokens_seen": 178257920, "step": 2720, "train_runtime": 63924.7415, "train_tokens_per_second": 2788.559 }, { "epoch": 0.7209824376072891, "grad_norm": 0.3125, "learning_rate": 0.00047102676689682733, "loss": 2.5559, "num_input_tokens_seen": 178913280, "step": 2730, "train_runtime": 64045.3328, "train_tokens_per_second": 2793.541 }, { "epoch": 0.7236233989172058, "grad_norm": 0.30859375, "learning_rate": 0.00047077648012275005, "loss": 2.5332, "num_input_tokens_seen": 179568640, "step": 2740, "train_runtime": 64163.5594, "train_tokens_per_second": 2798.608 }, { "epoch": 0.7262643602271227, "grad_norm": 0.3046875, "learning_rate": 0.00047052518402274936, "loss": 2.582, "num_input_tokens_seen": 180224000, "step": 2750, "train_runtime": 64280.2933, "train_tokens_per_second": 2803.721 }, { "epoch": 0.7289053215370395, "grad_norm": 0.328125, "learning_rate": 0.0004702728797456779, "loss": 2.6733, "num_input_tokens_seen": 180879360, "step": 2760, "train_runtime": 64397.0611, "train_tokens_per_second": 2808.814 }, { "epoch": 0.7315462828469563, "grad_norm": 0.318359375, "learning_rate": 0.00047001956844499774, "loss": 2.566, "num_input_tokens_seen": 181534720, "step": 2770, "train_runtime": 64514.3091, "train_tokens_per_second": 2813.868 }, { "epoch": 0.7341872441568731, "grad_norm": 0.333984375, "learning_rate": 0.00046976525127877434, "loss": 2.5983, "num_input_tokens_seen": 182190080, "step": 2780, "train_runtime": 64637.4081, "train_tokens_per_second": 2818.648 }, { "epoch": 0.73682820546679, "grad_norm": 0.318359375, "learning_rate": 0.00046950992940967206, "loss": 2.6204, "num_input_tokens_seen": 182845440, "step": 2790, "train_runtime": 64760.9482, "train_tokens_per_second": 2823.39 }, { "epoch": 0.7394691667767067, "grad_norm": 0.35546875, "learning_rate": 0.0004692536040049482, "loss": 2.5219, "num_input_tokens_seen": 183500800, "step": 2800, "train_runtime": 64885.4696, "train_tokens_per_second": 2828.072 }, { "epoch": 0.7421101280866236, "grad_norm": 0.337890625, "learning_rate": 0.00046899627623644817, "loss": 2.4908, "num_input_tokens_seen": 184156160, "step": 2810, "train_runtime": 65004.5941, "train_tokens_per_second": 2832.971 }, { "epoch": 0.7447510893965403, "grad_norm": 0.357421875, "learning_rate": 0.0004687379472805996, "loss": 2.6228, "num_input_tokens_seen": 184811520, "step": 2820, "train_runtime": 65122.5778, "train_tokens_per_second": 2837.902 }, { "epoch": 0.7473920507064572, "grad_norm": 0.3125, "learning_rate": 0.0004684786183184074, "loss": 2.6018, "num_input_tokens_seen": 185466880, "step": 2830, "train_runtime": 65239.4763, "train_tokens_per_second": 2842.863 }, { "epoch": 0.750033012016374, "grad_norm": 0.326171875, "learning_rate": 0.0004682182905354485, "loss": 2.5533, "num_input_tokens_seen": 186122240, "step": 2840, "train_runtime": 65361.0354, "train_tokens_per_second": 2847.602 }, { "epoch": 0.7526739733262908, "grad_norm": 0.314453125, "learning_rate": 0.0004679569651218657, "loss": 2.5546, "num_input_tokens_seen": 186777600, "step": 2850, "train_runtime": 65479.2513, "train_tokens_per_second": 2852.47 }, { "epoch": 0.7553149346362076, "grad_norm": 0.306640625, "learning_rate": 0.0004676946432723628, "loss": 2.5145, "num_input_tokens_seen": 187432960, "step": 2860, "train_runtime": 65597.8854, "train_tokens_per_second": 2857.302 }, { "epoch": 0.7579558959461243, "grad_norm": 0.314453125, "learning_rate": 0.00046743132618619923, "loss": 2.5676, "num_input_tokens_seen": 188088320, "step": 2870, "train_runtime": 65716.9951, "train_tokens_per_second": 2862.096 }, { "epoch": 0.7605968572560412, "grad_norm": 0.361328125, "learning_rate": 0.00046716701506718415, "loss": 2.5907, "num_input_tokens_seen": 188743680, "step": 2880, "train_runtime": 65840.4604, "train_tokens_per_second": 2866.682 }, { "epoch": 0.763237818565958, "grad_norm": 0.31640625, "learning_rate": 0.0004669017111236712, "loss": 2.5965, "num_input_tokens_seen": 189399040, "step": 2890, "train_runtime": 65957.414, "train_tokens_per_second": 2871.535 }, { "epoch": 0.7658787798758748, "grad_norm": 0.3125, "learning_rate": 0.00046663541556855295, "loss": 2.5296, "num_input_tokens_seen": 190054400, "step": 2900, "train_runtime": 66075.1886, "train_tokens_per_second": 2876.335 }, { "epoch": 0.7685197411857916, "grad_norm": 0.30859375, "learning_rate": 0.0004663681296192552, "loss": 2.5497, "num_input_tokens_seen": 190709760, "step": 2910, "train_runtime": 66192.3246, "train_tokens_per_second": 2881.146 }, { "epoch": 0.7711607024957084, "grad_norm": 0.306640625, "learning_rate": 0.00046609985449773183, "loss": 2.5649, "num_input_tokens_seen": 191365120, "step": 2920, "train_runtime": 66311.303, "train_tokens_per_second": 2885.86 }, { "epoch": 0.7738016638056252, "grad_norm": 0.3046875, "learning_rate": 0.00046583059143045857, "loss": 2.5338, "num_input_tokens_seen": 192020480, "step": 2930, "train_runtime": 66429.1291, "train_tokens_per_second": 2890.607 }, { "epoch": 0.7764426251155421, "grad_norm": 0.302734375, "learning_rate": 0.00046556034164842814, "loss": 2.5703, "num_input_tokens_seen": 192675840, "step": 2940, "train_runtime": 66546.2716, "train_tokens_per_second": 2895.366 }, { "epoch": 0.7790835864254588, "grad_norm": 0.333984375, "learning_rate": 0.00046528910638714386, "loss": 2.6228, "num_input_tokens_seen": 193331200, "step": 2950, "train_runtime": 66663.6787, "train_tokens_per_second": 2900.098 }, { "epoch": 0.7817245477353757, "grad_norm": 0.314453125, "learning_rate": 0.0004650168868866146, "loss": 2.4879, "num_input_tokens_seen": 193986560, "step": 2960, "train_runtime": 66780.8493, "train_tokens_per_second": 2904.823 }, { "epoch": 0.7843655090452925, "grad_norm": 0.33203125, "learning_rate": 0.0004647436843913488, "loss": 2.5265, "num_input_tokens_seen": 194641920, "step": 2970, "train_runtime": 66899.303, "train_tokens_per_second": 2909.476 }, { "epoch": 0.7870064703552093, "grad_norm": 0.3515625, "learning_rate": 0.00046446950015034894, "loss": 2.5625, "num_input_tokens_seen": 195297280, "step": 2980, "train_runtime": 67018.546, "train_tokens_per_second": 2914.078 }, { "epoch": 0.7896474316651261, "grad_norm": 0.35546875, "learning_rate": 0.00046419433541710573, "loss": 2.5502, "num_input_tokens_seen": 195952640, "step": 2990, "train_runtime": 67137.0732, "train_tokens_per_second": 2918.695 }, { "epoch": 0.7922883929750429, "grad_norm": 0.318359375, "learning_rate": 0.00046391819144959225, "loss": 2.5597, "num_input_tokens_seen": 196608000, "step": 3000, "train_runtime": 67258.6768, "train_tokens_per_second": 2923.162 }, { "epoch": 0.7949293542849597, "grad_norm": 0.31640625, "learning_rate": 0.00046364106951025865, "loss": 2.501, "num_input_tokens_seen": 197263360, "step": 3010, "train_runtime": 67390.2208, "train_tokens_per_second": 2927.181 }, { "epoch": 0.7975703155948766, "grad_norm": 0.3125, "learning_rate": 0.0004633629708660258, "loss": 2.6074, "num_input_tokens_seen": 197918720, "step": 3020, "train_runtime": 67511.9884, "train_tokens_per_second": 2931.609 }, { "epoch": 0.8002112769047933, "grad_norm": 0.30859375, "learning_rate": 0.00046308389678828, "loss": 2.5246, "num_input_tokens_seen": 198574080, "step": 3030, "train_runtime": 67633.7618, "train_tokens_per_second": 2936.02 }, { "epoch": 0.8028522382147102, "grad_norm": 0.333984375, "learning_rate": 0.0004628038485528667, "loss": 2.5615, "num_input_tokens_seen": 199229440, "step": 3040, "train_runtime": 67751.2595, "train_tokens_per_second": 2940.601 }, { "epoch": 0.805493199524627, "grad_norm": 0.296875, "learning_rate": 0.0004625228274400853, "loss": 2.5714, "num_input_tokens_seen": 199884800, "step": 3050, "train_runtime": 67868.5083, "train_tokens_per_second": 2945.177 }, { "epoch": 0.8081341608345438, "grad_norm": 0.369140625, "learning_rate": 0.00046224083473468246, "loss": 2.5129, "num_input_tokens_seen": 200540160, "step": 3060, "train_runtime": 67989.2765, "train_tokens_per_second": 2949.585 }, { "epoch": 0.8107751221444606, "grad_norm": 0.3046875, "learning_rate": 0.0004619578717258471, "loss": 2.5505, "num_input_tokens_seen": 201195520, "step": 3070, "train_runtime": 68109.2186, "train_tokens_per_second": 2954.013 }, { "epoch": 0.8134160834543774, "grad_norm": 0.306640625, "learning_rate": 0.0004616739397072037, "loss": 2.5338, "num_input_tokens_seen": 201850880, "step": 3080, "train_runtime": 68229.9922, "train_tokens_per_second": 2958.389 }, { "epoch": 0.8160570447642942, "grad_norm": 0.30078125, "learning_rate": 0.00046138903997680706, "loss": 2.5715, "num_input_tokens_seen": 202506240, "step": 3090, "train_runtime": 68347.1664, "train_tokens_per_second": 2962.906 }, { "epoch": 0.8186980060742111, "grad_norm": 0.3203125, "learning_rate": 0.00046110317383713596, "loss": 2.5556, "num_input_tokens_seen": 203161600, "step": 3100, "train_runtime": 68463.2499, "train_tokens_per_second": 2967.455 }, { "epoch": 0.8213389673841278, "grad_norm": 0.330078125, "learning_rate": 0.0004608163425950873, "loss": 2.5133, "num_input_tokens_seen": 203816960, "step": 3110, "train_runtime": 68582.6904, "train_tokens_per_second": 2971.843 }, { "epoch": 0.8239799286940447, "grad_norm": 0.3046875, "learning_rate": 0.00046052854756197014, "loss": 2.5604, "num_input_tokens_seen": 204472320, "step": 3120, "train_runtime": 68702.7138, "train_tokens_per_second": 2976.19 }, { "epoch": 0.8266208900039614, "grad_norm": 0.298828125, "learning_rate": 0.0004602397900534999, "loss": 2.4993, "num_input_tokens_seen": 205127680, "step": 3130, "train_runtime": 68820.0311, "train_tokens_per_second": 2980.639 }, { "epoch": 0.8292618513138783, "grad_norm": 0.322265625, "learning_rate": 0.00045995007138979196, "loss": 2.495, "num_input_tokens_seen": 205783040, "step": 3140, "train_runtime": 68936.7593, "train_tokens_per_second": 2985.099 }, { "epoch": 0.8319028126237951, "grad_norm": 0.3359375, "learning_rate": 0.00045965939289535586, "loss": 2.558, "num_input_tokens_seen": 206438400, "step": 3150, "train_runtime": 69054.1993, "train_tokens_per_second": 2989.513 }, { "epoch": 0.8345437739337118, "grad_norm": 0.36328125, "learning_rate": 0.00045936775589908917, "loss": 2.6297, "num_input_tokens_seen": 207093760, "step": 3160, "train_runtime": 69172.7498, "train_tokens_per_second": 2993.863 }, { "epoch": 0.8371847352436287, "grad_norm": 0.3359375, "learning_rate": 0.0004590751617342716, "loss": 2.5844, "num_input_tokens_seen": 207749120, "step": 3170, "train_runtime": 69292.1584, "train_tokens_per_second": 2998.162 }, { "epoch": 0.8398256965535454, "grad_norm": 0.306640625, "learning_rate": 0.0004587816117385586, "loss": 2.5686, "num_input_tokens_seen": 208404480, "step": 3180, "train_runtime": 69410.4443, "train_tokens_per_second": 3002.495 }, { "epoch": 0.8424666578634623, "grad_norm": 0.32421875, "learning_rate": 0.0004584871072539755, "loss": 2.5584, "num_input_tokens_seen": 209059840, "step": 3190, "train_runtime": 69530.3644, "train_tokens_per_second": 3006.742 }, { "epoch": 0.8451076191733791, "grad_norm": 0.3046875, "learning_rate": 0.0004581916496269112, "loss": 2.5357, "num_input_tokens_seen": 209715200, "step": 3200, "train_runtime": 69649.7436, "train_tokens_per_second": 3010.997 }, { "epoch": 0.8477485804832959, "grad_norm": 0.322265625, "learning_rate": 0.00045789524020811213, "loss": 2.6667, "num_input_tokens_seen": 210370560, "step": 3210, "train_runtime": 69767.2675, "train_tokens_per_second": 3015.319 }, { "epoch": 0.8503895417932127, "grad_norm": 0.310546875, "learning_rate": 0.0004575978803526761, "loss": 2.5453, "num_input_tokens_seen": 211025920, "step": 3220, "train_runtime": 69887.2044, "train_tokens_per_second": 3019.522 }, { "epoch": 0.8530305031031296, "grad_norm": 0.30859375, "learning_rate": 0.00045729957142004587, "loss": 2.5007, "num_input_tokens_seen": 211681280, "step": 3230, "train_runtime": 70005.0651, "train_tokens_per_second": 3023.799 }, { "epoch": 0.8556714644130463, "grad_norm": 0.294921875, "learning_rate": 0.00045700031477400335, "loss": 2.5585, "num_input_tokens_seen": 212336640, "step": 3240, "train_runtime": 70121.88, "train_tokens_per_second": 3028.108 }, { "epoch": 0.8583124257229632, "grad_norm": 0.318359375, "learning_rate": 0.0004567001117826628, "loss": 2.5284, "num_input_tokens_seen": 212992000, "step": 3250, "train_runtime": 70238.6702, "train_tokens_per_second": 3032.404 }, { "epoch": 0.8609533870328799, "grad_norm": 0.314453125, "learning_rate": 0.00045639896381846525, "loss": 2.564, "num_input_tokens_seen": 213647360, "step": 3260, "train_runtime": 70356.4627, "train_tokens_per_second": 3036.642 }, { "epoch": 0.8635943483427968, "grad_norm": 0.3046875, "learning_rate": 0.0004560968722581716, "loss": 2.5174, "num_input_tokens_seen": 214302720, "step": 3270, "train_runtime": 70473.0866, "train_tokens_per_second": 3040.916 }, { "epoch": 0.8662353096527136, "grad_norm": 0.33203125, "learning_rate": 0.00045579383848285673, "loss": 2.5079, "num_input_tokens_seen": 214958080, "step": 3280, "train_runtime": 70590.8319, "train_tokens_per_second": 3045.127 }, { "epoch": 0.8688762709626304, "grad_norm": 0.3046875, "learning_rate": 0.000455489863877903, "loss": 2.5546, "num_input_tokens_seen": 215613440, "step": 3290, "train_runtime": 70707.7468, "train_tokens_per_second": 3049.361 }, { "epoch": 0.8715172322725472, "grad_norm": 0.33203125, "learning_rate": 0.00045518494983299397, "loss": 2.5635, "num_input_tokens_seen": 216268800, "step": 3300, "train_runtime": 70825.237, "train_tokens_per_second": 3053.556 }, { "epoch": 0.874158193582464, "grad_norm": 0.30859375, "learning_rate": 0.00045487909774210813, "loss": 2.5723, "num_input_tokens_seen": 216924160, "step": 3310, "train_runtime": 70944.418, "train_tokens_per_second": 3057.664 }, { "epoch": 0.8767991548923808, "grad_norm": 0.3203125, "learning_rate": 0.0004545723090035123, "loss": 2.5792, "num_input_tokens_seen": 217579520, "step": 3320, "train_runtime": 71062.7059, "train_tokens_per_second": 3061.796 }, { "epoch": 0.8794401162022977, "grad_norm": 0.3125, "learning_rate": 0.0004542645850197555, "loss": 2.4991, "num_input_tokens_seen": 218234880, "step": 3330, "train_runtime": 71179.9107, "train_tokens_per_second": 3065.962 }, { "epoch": 0.8820810775122144, "grad_norm": 0.30859375, "learning_rate": 0.0004539559271976624, "loss": 2.4421, "num_input_tokens_seen": 218890240, "step": 3340, "train_runtime": 71299.1948, "train_tokens_per_second": 3070.024 }, { "epoch": 0.8847220388221313, "grad_norm": 0.341796875, "learning_rate": 0.0004536463369483269, "loss": 2.526, "num_input_tokens_seen": 219545600, "step": 3350, "train_runtime": 71416.6115, "train_tokens_per_second": 3074.153 }, { "epoch": 0.887363000132048, "grad_norm": 0.31640625, "learning_rate": 0.00045333581568710556, "loss": 2.5114, "num_input_tokens_seen": 220200960, "step": 3360, "train_runtime": 71537.1781, "train_tokens_per_second": 3078.133 }, { "epoch": 0.8900039614419649, "grad_norm": 0.30859375, "learning_rate": 0.0004530243648336115, "loss": 2.4122, "num_input_tokens_seen": 220856320, "step": 3370, "train_runtime": 71654.2281, "train_tokens_per_second": 3082.251 }, { "epoch": 0.8926449227518817, "grad_norm": 0.2890625, "learning_rate": 0.00045271198581170745, "loss": 2.5761, "num_input_tokens_seen": 221511680, "step": 3380, "train_runtime": 71771.9695, "train_tokens_per_second": 3086.326 }, { "epoch": 0.8952858840617985, "grad_norm": 0.302734375, "learning_rate": 0.00045239868004949964, "loss": 2.4979, "num_input_tokens_seen": 222167040, "step": 3390, "train_runtime": 71888.7516, "train_tokens_per_second": 3090.428 }, { "epoch": 0.8979268453717153, "grad_norm": 0.322265625, "learning_rate": 0.0004520844489793309, "loss": 2.5003, "num_input_tokens_seen": 222822400, "step": 3400, "train_runtime": 72007.6371, "train_tokens_per_second": 3094.427 }, { "epoch": 0.9005678066816322, "grad_norm": 0.30078125, "learning_rate": 0.0004517692940377743, "loss": 2.5321, "num_input_tokens_seen": 223477760, "step": 3410, "train_runtime": 72126.9538, "train_tokens_per_second": 3098.395 }, { "epoch": 0.9032087679915489, "grad_norm": 0.330078125, "learning_rate": 0.00045145321666562683, "loss": 2.5793, "num_input_tokens_seen": 224133120, "step": 3420, "train_runtime": 72248.5725, "train_tokens_per_second": 3102.25 }, { "epoch": 0.9058497293014657, "grad_norm": 0.30078125, "learning_rate": 0.0004511362183079023, "loss": 2.5039, "num_input_tokens_seen": 224788480, "step": 3430, "train_runtime": 72367.5334, "train_tokens_per_second": 3106.206 }, { "epoch": 0.9084906906113825, "grad_norm": 0.29296875, "learning_rate": 0.00045081830041382524, "loss": 2.5551, "num_input_tokens_seen": 225443840, "step": 3440, "train_runtime": 72486.8809, "train_tokens_per_second": 3110.133 }, { "epoch": 0.9111316519212993, "grad_norm": 0.294921875, "learning_rate": 0.0004504994644368238, "loss": 2.5077, "num_input_tokens_seen": 226099200, "step": 3450, "train_runtime": 72604.9764, "train_tokens_per_second": 3114.101 }, { "epoch": 0.9137726132312162, "grad_norm": 0.34765625, "learning_rate": 0.00045017971183452333, "loss": 2.5018, "num_input_tokens_seen": 226754560, "step": 3460, "train_runtime": 72722.5169, "train_tokens_per_second": 3118.079 }, { "epoch": 0.9164135745411329, "grad_norm": 0.296875, "learning_rate": 0.00044985904406874, "loss": 2.4891, "num_input_tokens_seen": 227409920, "step": 3470, "train_runtime": 72842.3214, "train_tokens_per_second": 3121.948 }, { "epoch": 0.9190545358510498, "grad_norm": 0.318359375, "learning_rate": 0.0004495374626054736, "loss": 2.4945, "num_input_tokens_seen": 228065280, "step": 3480, "train_runtime": 72959.9692, "train_tokens_per_second": 3125.896 }, { "epoch": 0.9216954971609665, "grad_norm": 0.31640625, "learning_rate": 0.0004492149689149011, "loss": 2.5043, "num_input_tokens_seen": 228720640, "step": 3490, "train_runtime": 73079.5466, "train_tokens_per_second": 3129.749 }, { "epoch": 0.9243364584708834, "grad_norm": 0.3046875, "learning_rate": 0.00044889156447137007, "loss": 2.5296, "num_input_tokens_seen": 229376000, "step": 3500, "train_runtime": 73199.3203, "train_tokens_per_second": 3133.581 }, { "epoch": 0.9269774197808002, "grad_norm": 0.296875, "learning_rate": 0.0004485672507533916, "loss": 2.5304, "num_input_tokens_seen": 230031360, "step": 3510, "train_runtime": 73330.1712, "train_tokens_per_second": 3136.927 }, { "epoch": 0.929618381090717, "grad_norm": 0.30078125, "learning_rate": 0.0004482420292436338, "loss": 2.4987, "num_input_tokens_seen": 230686720, "step": 3520, "train_runtime": 73453.3714, "train_tokens_per_second": 3140.587 }, { "epoch": 0.9322593424006338, "grad_norm": 0.341796875, "learning_rate": 0.00044791590142891487, "loss": 2.523, "num_input_tokens_seen": 231342080, "step": 3530, "train_runtime": 73578.5986, "train_tokens_per_second": 3144.149 }, { "epoch": 0.9349003037105507, "grad_norm": 0.3125, "learning_rate": 0.00044758886880019646, "loss": 2.5653, "num_input_tokens_seen": 231997440, "step": 3540, "train_runtime": 73698.2351, "train_tokens_per_second": 3147.938 }, { "epoch": 0.9375412650204674, "grad_norm": 0.29296875, "learning_rate": 0.00044726093285257676, "loss": 2.5118, "num_input_tokens_seen": 232652800, "step": 3550, "train_runtime": 73816.1535, "train_tokens_per_second": 3151.787 }, { "epoch": 0.9401822263303843, "grad_norm": 0.30859375, "learning_rate": 0.00044693209508528365, "loss": 2.5051, "num_input_tokens_seen": 233308160, "step": 3560, "train_runtime": 73934.7982, "train_tokens_per_second": 3155.593 }, { "epoch": 0.942823187640301, "grad_norm": 0.30078125, "learning_rate": 0.00044660235700166786, "loss": 2.5131, "num_input_tokens_seen": 233963520, "step": 3570, "train_runtime": 74054.392, "train_tokens_per_second": 3159.347 }, { "epoch": 0.9454641489502179, "grad_norm": 0.337890625, "learning_rate": 0.000446271720109196, "loss": 2.5192, "num_input_tokens_seen": 234618880, "step": 3580, "train_runtime": 74172.2787, "train_tokens_per_second": 3163.161 }, { "epoch": 0.9481051102601347, "grad_norm": 0.37109375, "learning_rate": 0.000445940185919444, "loss": 2.5561, "num_input_tokens_seen": 235274240, "step": 3590, "train_runtime": 74292.5192, "train_tokens_per_second": 3166.863 }, { "epoch": 0.9507460715700515, "grad_norm": 0.345703125, "learning_rate": 0.0004456077559480898, "loss": 2.5362, "num_input_tokens_seen": 235929600, "step": 3600, "train_runtime": 74410.8817, "train_tokens_per_second": 3170.633 }, { "epoch": 0.9533870328799683, "grad_norm": 0.29296875, "learning_rate": 0.0004452744317149067, "loss": 2.4484, "num_input_tokens_seen": 236584960, "step": 3610, "train_runtime": 74528.8241, "train_tokens_per_second": 3174.409 }, { "epoch": 0.9560279941898852, "grad_norm": 0.298828125, "learning_rate": 0.00044494021474375626, "loss": 2.4905, "num_input_tokens_seen": 237240320, "step": 3620, "train_runtime": 74646.6587, "train_tokens_per_second": 3178.177 }, { "epoch": 0.9586689554998019, "grad_norm": 0.3046875, "learning_rate": 0.00044460510656258123, "loss": 2.5725, "num_input_tokens_seen": 237895680, "step": 3630, "train_runtime": 74767.2065, "train_tokens_per_second": 3181.818 }, { "epoch": 0.9613099168097188, "grad_norm": 0.291015625, "learning_rate": 0.000444269108703399, "loss": 2.4727, "num_input_tokens_seen": 238551040, "step": 3640, "train_runtime": 74887.4523, "train_tokens_per_second": 3185.461 }, { "epoch": 0.9639508781196355, "grad_norm": 0.31640625, "learning_rate": 0.0004439322227022941, "loss": 2.6083, "num_input_tokens_seen": 239206400, "step": 3650, "train_runtime": 75006.6107, "train_tokens_per_second": 3189.138 }, { "epoch": 0.9665918394295524, "grad_norm": 0.298828125, "learning_rate": 0.00044359445009941147, "loss": 2.4778, "num_input_tokens_seen": 239861760, "step": 3660, "train_runtime": 75125.5365, "train_tokens_per_second": 3192.813 }, { "epoch": 0.9692328007394692, "grad_norm": 0.314453125, "learning_rate": 0.0004432557924389493, "loss": 2.4564, "num_input_tokens_seen": 240517120, "step": 3670, "train_runtime": 75245.5579, "train_tokens_per_second": 3196.429 }, { "epoch": 0.971873762049386, "grad_norm": 0.30078125, "learning_rate": 0.00044291625126915204, "loss": 2.4725, "num_input_tokens_seen": 241172480, "step": 3680, "train_runtime": 75365.6311, "train_tokens_per_second": 3200.033 }, { "epoch": 0.9745147233593028, "grad_norm": 0.296875, "learning_rate": 0.0004425758281423032, "loss": 2.4829, "num_input_tokens_seen": 241827840, "step": 3690, "train_runtime": 75488.5475, "train_tokens_per_second": 3203.504 }, { "epoch": 0.9771556846692196, "grad_norm": 0.29296875, "learning_rate": 0.0004422345246147185, "loss": 2.5562, "num_input_tokens_seen": 242483200, "step": 3700, "train_runtime": 75606.4806, "train_tokens_per_second": 3207.175 }, { "epoch": 0.9797966459791364, "grad_norm": 0.3359375, "learning_rate": 0.0004418923422467385, "loss": 2.5714, "num_input_tokens_seen": 243138560, "step": 3710, "train_runtime": 75724.8015, "train_tokens_per_second": 3210.818 }, { "epoch": 0.9824376072890532, "grad_norm": 0.298828125, "learning_rate": 0.00044154928260272145, "loss": 2.4656, "num_input_tokens_seen": 243793920, "step": 3720, "train_runtime": 75843.5826, "train_tokens_per_second": 3214.43 }, { "epoch": 0.98507856859897, "grad_norm": 0.294921875, "learning_rate": 0.00044120534725103653, "loss": 2.4655, "num_input_tokens_seen": 244449280, "step": 3730, "train_runtime": 75964.9389, "train_tokens_per_second": 3217.922 }, { "epoch": 0.9877195299088868, "grad_norm": 0.296875, "learning_rate": 0.0004408605377640562, "loss": 2.5045, "num_input_tokens_seen": 245104640, "step": 3740, "train_runtime": 76084.5912, "train_tokens_per_second": 3221.475 }, { "epoch": 0.9903604912188037, "grad_norm": 0.302734375, "learning_rate": 0.0004405148557181492, "loss": 2.5, "num_input_tokens_seen": 245760000, "step": 3750, "train_runtime": 76204.0097, "train_tokens_per_second": 3225.027 }, { "epoch": 0.9930014525287204, "grad_norm": 0.333984375, "learning_rate": 0.00044016830269367346, "loss": 2.5142, "num_input_tokens_seen": 246415360, "step": 3760, "train_runtime": 76322.0106, "train_tokens_per_second": 3228.628 }, { "epoch": 0.9956424138386373, "grad_norm": 0.30859375, "learning_rate": 0.0004398208802749687, "loss": 2.5142, "num_input_tokens_seen": 247070720, "step": 3770, "train_runtime": 76439.2146, "train_tokens_per_second": 3232.251 }, { "epoch": 0.998283375148554, "grad_norm": 0.345703125, "learning_rate": 0.0004394725900503493, "loss": 2.5362, "num_input_tokens_seen": 247726080, "step": 3780, "train_runtime": 76559.5869, "train_tokens_per_second": 3235.729 }, { "epoch": 1.000792288392975, "grad_norm": 0.3125, "learning_rate": 0.0004391234336120969, "loss": 2.4494, "num_input_tokens_seen": 248340480, "step": 3790, "train_runtime": 76675.4597, "train_tokens_per_second": 3238.852 }, { "epoch": 1.003433249702892, "grad_norm": 0.349609375, "learning_rate": 0.00043877341255645335, "loss": 2.2429, "num_input_tokens_seen": 248995840, "step": 3800, "train_runtime": 76794.0486, "train_tokens_per_second": 3242.385 }, { "epoch": 1.0060742110128087, "grad_norm": 0.3125, "learning_rate": 0.0004384225284836133, "loss": 2.3023, "num_input_tokens_seen": 249651200, "step": 3810, "train_runtime": 76911.1715, "train_tokens_per_second": 3245.968 }, { "epoch": 1.0087151723227255, "grad_norm": 0.33203125, "learning_rate": 0.0004380707829977166, "loss": 2.2617, "num_input_tokens_seen": 250306560, "step": 3820, "train_runtime": 77029.4457, "train_tokens_per_second": 3249.492 }, { "epoch": 1.0113561336326422, "grad_norm": 0.328125, "learning_rate": 0.0004377181777068416, "loss": 2.251, "num_input_tokens_seen": 250961920, "step": 3830, "train_runtime": 77147.4148, "train_tokens_per_second": 3253.018 }, { "epoch": 1.0139970949425592, "grad_norm": 0.37109375, "learning_rate": 0.0004373647142229972, "loss": 2.3025, "num_input_tokens_seen": 251617280, "step": 3840, "train_runtime": 77264.2759, "train_tokens_per_second": 3256.58 }, { "epoch": 1.016638056252476, "grad_norm": 0.349609375, "learning_rate": 0.00043701039416211564, "loss": 2.2215, "num_input_tokens_seen": 252272640, "step": 3850, "train_runtime": 77382.3779, "train_tokens_per_second": 3260.079 }, { "epoch": 1.0192790175623927, "grad_norm": 0.32421875, "learning_rate": 0.00043665521914404545, "loss": 2.2709, "num_input_tokens_seen": 252928000, "step": 3860, "train_runtime": 77505.1764, "train_tokens_per_second": 3263.369 }, { "epoch": 1.0219199788723095, "grad_norm": 0.337890625, "learning_rate": 0.0004362991907925435, "loss": 2.1754, "num_input_tokens_seen": 253583360, "step": 3870, "train_runtime": 77624.7271, "train_tokens_per_second": 3266.786 }, { "epoch": 1.0245609401822264, "grad_norm": 0.3125, "learning_rate": 0.000435942310735268, "loss": 2.3135, "num_input_tokens_seen": 254238720, "step": 3880, "train_runtime": 77750.1998, "train_tokens_per_second": 3269.943 }, { "epoch": 1.0272019014921432, "grad_norm": 0.30078125, "learning_rate": 0.000435584580603771, "loss": 2.2239, "num_input_tokens_seen": 254894080, "step": 3890, "train_runtime": 77869.7265, "train_tokens_per_second": 3273.34 }, { "epoch": 1.02984286280206, "grad_norm": 0.314453125, "learning_rate": 0.00043522600203349055, "loss": 2.2953, "num_input_tokens_seen": 255549440, "step": 3900, "train_runtime": 77989.4549, "train_tokens_per_second": 3276.718 }, { "epoch": 1.0324838241119767, "grad_norm": 0.314453125, "learning_rate": 0.00043486657666374384, "loss": 2.318, "num_input_tokens_seen": 256204800, "step": 3910, "train_runtime": 78108.7097, "train_tokens_per_second": 3280.105 }, { "epoch": 1.0351247854218935, "grad_norm": 0.318359375, "learning_rate": 0.00043450630613771916, "loss": 2.2676, "num_input_tokens_seen": 256860160, "step": 3920, "train_runtime": 78226.6613, "train_tokens_per_second": 3283.537 }, { "epoch": 1.0377657467318104, "grad_norm": 0.353515625, "learning_rate": 0.0004341451921024687, "loss": 2.2883, "num_input_tokens_seen": 257515520, "step": 3930, "train_runtime": 78344.3741, "train_tokens_per_second": 3286.969 }, { "epoch": 1.0404067080417272, "grad_norm": 0.31640625, "learning_rate": 0.0004337832362089009, "loss": 2.2167, "num_input_tokens_seen": 258170880, "step": 3940, "train_runtime": 78463.8699, "train_tokens_per_second": 3290.315 }, { "epoch": 1.043047669351644, "grad_norm": 0.314453125, "learning_rate": 0.00043342044011177293, "loss": 2.2106, "num_input_tokens_seen": 258826240, "step": 3950, "train_runtime": 78585.3237, "train_tokens_per_second": 3293.57 }, { "epoch": 1.0456886306615607, "grad_norm": 0.333984375, "learning_rate": 0.00043305680546968316, "loss": 2.3089, "num_input_tokens_seen": 259481600, "step": 3960, "train_runtime": 78702.6123, "train_tokens_per_second": 3296.988 }, { "epoch": 1.0483295919714777, "grad_norm": 0.3203125, "learning_rate": 0.0004326923339450636, "loss": 2.3229, "num_input_tokens_seen": 260136960, "step": 3970, "train_runtime": 78820.7623, "train_tokens_per_second": 3300.361 }, { "epoch": 1.0509705532813944, "grad_norm": 0.33203125, "learning_rate": 0.00043232702720417206, "loss": 2.2907, "num_input_tokens_seen": 260792320, "step": 3980, "train_runtime": 78940.7675, "train_tokens_per_second": 3303.646 }, { "epoch": 1.0536115145913112, "grad_norm": 0.314453125, "learning_rate": 0.0004319608869170848, "loss": 2.2815, "num_input_tokens_seen": 261447680, "step": 3990, "train_runtime": 79060.2279, "train_tokens_per_second": 3306.943 }, { "epoch": 1.056252475901228, "grad_norm": 0.328125, "learning_rate": 0.00043159391475768895, "loss": 2.1302, "num_input_tokens_seen": 262103040, "step": 4000, "train_runtime": 79180.0707, "train_tokens_per_second": 3310.215 }, { "epoch": 1.058893437211145, "grad_norm": 0.318359375, "learning_rate": 0.0004312261124036746, "loss": 2.3044, "num_input_tokens_seen": 262758400, "step": 4010, "train_runtime": 79313.4214, "train_tokens_per_second": 3312.912 }, { "epoch": 1.0615343985210617, "grad_norm": 0.33203125, "learning_rate": 0.0004308574815365271, "loss": 2.342, "num_input_tokens_seen": 263413760, "step": 4020, "train_runtime": 79435.9458, "train_tokens_per_second": 3316.052 }, { "epoch": 1.0641753598309784, "grad_norm": 0.34765625, "learning_rate": 0.0004304880238415199, "loss": 2.3127, "num_input_tokens_seen": 264069120, "step": 4030, "train_runtime": 79561.0668, "train_tokens_per_second": 3319.075 }, { "epoch": 1.0668163211408952, "grad_norm": 0.33203125, "learning_rate": 0.000430117741007706, "loss": 2.2446, "num_input_tokens_seen": 264724480, "step": 4040, "train_runtime": 79685.8835, "train_tokens_per_second": 3322.1 }, { "epoch": 1.0694572824508122, "grad_norm": 0.3046875, "learning_rate": 0.0004297466347279111, "loss": 2.2921, "num_input_tokens_seen": 265379840, "step": 4050, "train_runtime": 79812.4232, "train_tokens_per_second": 3325.044 }, { "epoch": 1.072098243760729, "grad_norm": 0.322265625, "learning_rate": 0.0004293747066987252, "loss": 2.3192, "num_input_tokens_seen": 266035200, "step": 4060, "train_runtime": 79935.0637, "train_tokens_per_second": 3328.141 }, { "epoch": 1.0747392050706457, "grad_norm": 0.333984375, "learning_rate": 0.0004290019586204953, "loss": 2.2752, "num_input_tokens_seen": 266690560, "step": 4070, "train_runtime": 80056.5141, "train_tokens_per_second": 3331.279 }, { "epoch": 1.0773801663805624, "grad_norm": 0.322265625, "learning_rate": 0.0004286283921973172, "loss": 2.2939, "num_input_tokens_seen": 267345920, "step": 4080, "train_runtime": 80180.0022, "train_tokens_per_second": 3334.322 }, { "epoch": 1.0800211276904794, "grad_norm": 0.3125, "learning_rate": 0.0004282540091370281, "loss": 2.2694, "num_input_tokens_seen": 268001280, "step": 4090, "train_runtime": 80304.7403, "train_tokens_per_second": 3337.303 }, { "epoch": 1.0826620890003962, "grad_norm": 0.306640625, "learning_rate": 0.00042787881115119864, "loss": 2.2958, "num_input_tokens_seen": 268656640, "step": 4100, "train_runtime": 80428.3833, "train_tokens_per_second": 3340.321 }, { "epoch": 1.085303050310313, "grad_norm": 0.345703125, "learning_rate": 0.0004275027999551249, "loss": 2.3079, "num_input_tokens_seen": 269312000, "step": 4110, "train_runtime": 80554.4198, "train_tokens_per_second": 3343.231 }, { "epoch": 1.0879440116202297, "grad_norm": 0.31640625, "learning_rate": 0.00042712597726782085, "loss": 2.2273, "num_input_tokens_seen": 269967360, "step": 4120, "train_runtime": 80679.6787, "train_tokens_per_second": 3346.163 }, { "epoch": 1.0905849729301467, "grad_norm": 0.341796875, "learning_rate": 0.0004267483448120104, "loss": 2.3295, "num_input_tokens_seen": 270622720, "step": 4130, "train_runtime": 80804.7272, "train_tokens_per_second": 3349.095 }, { "epoch": 1.0932259342400634, "grad_norm": 0.34765625, "learning_rate": 0.00042636990431411937, "loss": 2.2563, "num_input_tokens_seen": 271278080, "step": 4140, "train_runtime": 80929.6656, "train_tokens_per_second": 3352.023 }, { "epoch": 1.0958668955499802, "grad_norm": 0.31640625, "learning_rate": 0.00042599065750426767, "loss": 2.2812, "num_input_tokens_seen": 271933440, "step": 4150, "train_runtime": 81054.998, "train_tokens_per_second": 3354.925 }, { "epoch": 1.098507856859897, "grad_norm": 0.3046875, "learning_rate": 0.00042561060611626177, "loss": 2.3289, "num_input_tokens_seen": 272588800, "step": 4160, "train_runtime": 81178.5522, "train_tokens_per_second": 3357.892 }, { "epoch": 1.101148818169814, "grad_norm": 0.3046875, "learning_rate": 0.000425229751887586, "loss": 2.1831, "num_input_tokens_seen": 273244160, "step": 4170, "train_runtime": 81300.0149, "train_tokens_per_second": 3360.936 }, { "epoch": 1.1037897794797307, "grad_norm": 0.296875, "learning_rate": 0.00042484809655939527, "loss": 2.2654, "num_input_tokens_seen": 273899520, "step": 4180, "train_runtime": 81418.7735, "train_tokens_per_second": 3364.083 }, { "epoch": 1.1064307407896474, "grad_norm": 0.298828125, "learning_rate": 0.0004244656418765069, "loss": 2.2485, "num_input_tokens_seen": 274554880, "step": 4190, "train_runtime": 81540.3091, "train_tokens_per_second": 3367.106 }, { "epoch": 1.1090717020995642, "grad_norm": 0.328125, "learning_rate": 0.00042408238958739267, "loss": 2.2779, "num_input_tokens_seen": 275210240, "step": 4200, "train_runtime": 81662.2475, "train_tokens_per_second": 3370.104 }, { "epoch": 1.111712663409481, "grad_norm": 0.3125, "learning_rate": 0.0004236983414441705, "loss": 2.2195, "num_input_tokens_seen": 275865600, "step": 4210, "train_runtime": 81786.6331, "train_tokens_per_second": 3372.991 }, { "epoch": 1.114353624719398, "grad_norm": 0.34375, "learning_rate": 0.000423313499202597, "loss": 2.2841, "num_input_tokens_seen": 276520960, "step": 4220, "train_runtime": 81906.7637, "train_tokens_per_second": 3376.045 }, { "epoch": 1.1169945860293147, "grad_norm": 0.314453125, "learning_rate": 0.00042292786462205914, "loss": 2.2479, "num_input_tokens_seen": 277176320, "step": 4230, "train_runtime": 82028.3019, "train_tokens_per_second": 3379.033 }, { "epoch": 1.1196355473392314, "grad_norm": 0.310546875, "learning_rate": 0.00042254143946556606, "loss": 2.2651, "num_input_tokens_seen": 277831680, "step": 4240, "train_runtime": 82151.0352, "train_tokens_per_second": 3381.962 }, { "epoch": 1.1222765086491484, "grad_norm": 0.330078125, "learning_rate": 0.00042215422549974144, "loss": 2.2704, "num_input_tokens_seen": 278487040, "step": 4250, "train_runtime": 82270.3576, "train_tokens_per_second": 3385.023 }, { "epoch": 1.1249174699590652, "grad_norm": 0.318359375, "learning_rate": 0.000421766224494815, "loss": 2.3028, "num_input_tokens_seen": 279142400, "step": 4260, "train_runtime": 82390.9274, "train_tokens_per_second": 3388.024 }, { "epoch": 1.127558431268982, "grad_norm": 0.314453125, "learning_rate": 0.0004213774382246146, "loss": 2.3083, "num_input_tokens_seen": 279797760, "step": 4270, "train_runtime": 82510.8106, "train_tokens_per_second": 3391.044 }, { "epoch": 1.1301993925788987, "grad_norm": 0.30078125, "learning_rate": 0.000420987868466558, "loss": 2.3101, "num_input_tokens_seen": 280453120, "step": 4280, "train_runtime": 82633.193, "train_tokens_per_second": 3393.952 }, { "epoch": 1.1328403538888154, "grad_norm": 0.31640625, "learning_rate": 0.00042059751700164515, "loss": 2.3364, "num_input_tokens_seen": 281108480, "step": 4290, "train_runtime": 82755.5866, "train_tokens_per_second": 3396.852 }, { "epoch": 1.1354813151987324, "grad_norm": 0.33203125, "learning_rate": 0.0004202063856144494, "loss": 2.3146, "num_input_tokens_seen": 281763840, "step": 4300, "train_runtime": 82876.6686, "train_tokens_per_second": 3399.797 }, { "epoch": 1.1381222765086492, "grad_norm": 0.30859375, "learning_rate": 0.00041981447609310983, "loss": 2.2821, "num_input_tokens_seen": 282419200, "step": 4310, "train_runtime": 83002.1117, "train_tokens_per_second": 3402.554 }, { "epoch": 1.140763237818566, "grad_norm": 0.31640625, "learning_rate": 0.0004194217902293229, "loss": 2.2739, "num_input_tokens_seen": 283074560, "step": 4320, "train_runtime": 83121.4284, "train_tokens_per_second": 3405.555 }, { "epoch": 1.1434041991284827, "grad_norm": 0.306640625, "learning_rate": 0.0004190283298183344, "loss": 2.2125, "num_input_tokens_seen": 283729920, "step": 4330, "train_runtime": 83243.228, "train_tokens_per_second": 3408.444 }, { "epoch": 1.1460451604383997, "grad_norm": 0.302734375, "learning_rate": 0.0004186340966589309, "loss": 2.249, "num_input_tokens_seen": 284385280, "step": 4340, "train_runtime": 83363.4964, "train_tokens_per_second": 3411.389 }, { "epoch": 1.1486861217483164, "grad_norm": 0.310546875, "learning_rate": 0.00041823909255343187, "loss": 2.2658, "num_input_tokens_seen": 285040640, "step": 4350, "train_runtime": 83485.0706, "train_tokens_per_second": 3414.271 }, { "epoch": 1.1513270830582332, "grad_norm": 0.298828125, "learning_rate": 0.00041784331930768125, "loss": 2.2801, "num_input_tokens_seen": 285696000, "step": 4360, "train_runtime": 83606.0455, "train_tokens_per_second": 3417.169 }, { "epoch": 1.15396804436815, "grad_norm": 0.3046875, "learning_rate": 0.0004174467787310396, "loss": 2.3529, "num_input_tokens_seen": 286351360, "step": 4370, "train_runtime": 83728.7506, "train_tokens_per_second": 3419.988 }, { "epoch": 1.156609005678067, "grad_norm": 0.337890625, "learning_rate": 0.00041704947263637493, "loss": 2.3014, "num_input_tokens_seen": 287006720, "step": 4380, "train_runtime": 83848.4792, "train_tokens_per_second": 3422.921 }, { "epoch": 1.1592499669879837, "grad_norm": 0.29296875, "learning_rate": 0.0004166514028400554, "loss": 2.2544, "num_input_tokens_seen": 287662080, "step": 4390, "train_runtime": 83974.8578, "train_tokens_per_second": 3425.574 }, { "epoch": 1.1618909282979004, "grad_norm": 0.33203125, "learning_rate": 0.0004162525711619405, "loss": 2.2893, "num_input_tokens_seen": 288317440, "step": 4400, "train_runtime": 84093.0934, "train_tokens_per_second": 3428.551 }, { "epoch": 1.1645318896078172, "grad_norm": 0.33203125, "learning_rate": 0.0004158529794253727, "loss": 2.2578, "num_input_tokens_seen": 288972800, "step": 4410, "train_runtime": 84210.3873, "train_tokens_per_second": 3431.558 }, { "epoch": 1.1671728509177342, "grad_norm": 0.310546875, "learning_rate": 0.00041545262945716946, "loss": 2.3109, "num_input_tokens_seen": 289628160, "step": 4420, "train_runtime": 84327.684, "train_tokens_per_second": 3434.556 }, { "epoch": 1.169813812227651, "grad_norm": 0.322265625, "learning_rate": 0.00041505152308761434, "loss": 2.2651, "num_input_tokens_seen": 290283520, "step": 4430, "train_runtime": 84449.3471, "train_tokens_per_second": 3437.368 }, { "epoch": 1.1724547735375677, "grad_norm": 0.333984375, "learning_rate": 0.00041464966215044917, "loss": 2.2561, "num_input_tokens_seen": 290938880, "step": 4440, "train_runtime": 84570.0712, "train_tokens_per_second": 3440.211 }, { "epoch": 1.1750957348474844, "grad_norm": 0.306640625, "learning_rate": 0.00041424704848286553, "loss": 2.2728, "num_input_tokens_seen": 291594240, "step": 4450, "train_runtime": 84688.7888, "train_tokens_per_second": 3443.127 }, { "epoch": 1.1777366961574014, "grad_norm": 0.3125, "learning_rate": 0.000413843683925496, "loss": 2.3104, "num_input_tokens_seen": 292249600, "step": 4460, "train_runtime": 84807.008, "train_tokens_per_second": 3446.055 }, { "epoch": 1.1803776574673182, "grad_norm": 0.298828125, "learning_rate": 0.0004134395703224062, "loss": 2.3648, "num_input_tokens_seen": 292904960, "step": 4470, "train_runtime": 84924.5715, "train_tokens_per_second": 3449.001 }, { "epoch": 1.183018618777235, "grad_norm": 0.310546875, "learning_rate": 0.00041303470952108615, "loss": 2.2891, "num_input_tokens_seen": 293560320, "step": 4480, "train_runtime": 85042.4478, "train_tokens_per_second": 3451.927 }, { "epoch": 1.1856595800871517, "grad_norm": 0.337890625, "learning_rate": 0.0004126291033724417, "loss": 2.2063, "num_input_tokens_seen": 294215680, "step": 4490, "train_runtime": 85160.8956, "train_tokens_per_second": 3454.821 }, { "epoch": 1.1883005413970684, "grad_norm": 0.318359375, "learning_rate": 0.0004122227537307864, "loss": 2.3004, "num_input_tokens_seen": 294871040, "step": 4500, "train_runtime": 85278.8135, "train_tokens_per_second": 3457.729 }, { "epoch": 1.1909415027069854, "grad_norm": 0.298828125, "learning_rate": 0.00041181566245383273, "loss": 2.2272, "num_input_tokens_seen": 295526400, "step": 4510, "train_runtime": 85410.3019, "train_tokens_per_second": 3460.079 }, { "epoch": 1.1935824640169022, "grad_norm": 0.314453125, "learning_rate": 0.00041140783140268365, "loss": 2.303, "num_input_tokens_seen": 296181760, "step": 4520, "train_runtime": 85528.7014, "train_tokens_per_second": 3462.952 }, { "epoch": 1.196223425326819, "grad_norm": 0.31640625, "learning_rate": 0.0004109992624418244, "loss": 2.3099, "num_input_tokens_seen": 296837120, "step": 4530, "train_runtime": 85651.1363, "train_tokens_per_second": 3465.653 }, { "epoch": 1.198864386636736, "grad_norm": 0.30859375, "learning_rate": 0.0004105899574391135, "loss": 2.2171, "num_input_tokens_seen": 297492480, "step": 4540, "train_runtime": 85772.0031, "train_tokens_per_second": 3468.41 }, { "epoch": 1.2015053479466526, "grad_norm": 0.314453125, "learning_rate": 0.00041017991826577444, "loss": 2.2966, "num_input_tokens_seen": 298147840, "step": 4550, "train_runtime": 85892.1066, "train_tokens_per_second": 3471.19 }, { "epoch": 1.2041463092565694, "grad_norm": 0.310546875, "learning_rate": 0.00040976914679638726, "loss": 2.1979, "num_input_tokens_seen": 298803200, "step": 4560, "train_runtime": 86010.3013, "train_tokens_per_second": 3474.04 }, { "epoch": 1.2067872705664862, "grad_norm": 0.2890625, "learning_rate": 0.0004093576449088797, "loss": 2.2558, "num_input_tokens_seen": 299458560, "step": 4570, "train_runtime": 86128.1932, "train_tokens_per_second": 3476.894 }, { "epoch": 1.209428231876403, "grad_norm": 0.30078125, "learning_rate": 0.00040894541448451894, "loss": 2.2734, "num_input_tokens_seen": 300113920, "step": 4580, "train_runtime": 86246.2901, "train_tokens_per_second": 3479.731 }, { "epoch": 1.21206919318632, "grad_norm": 0.318359375, "learning_rate": 0.0004085324574079027, "loss": 2.2513, "num_input_tokens_seen": 300769280, "step": 4590, "train_runtime": 86365.5307, "train_tokens_per_second": 3482.515 }, { "epoch": 1.2147101544962366, "grad_norm": 0.314453125, "learning_rate": 0.0004081187755669506, "loss": 2.2718, "num_input_tokens_seen": 301424640, "step": 4600, "train_runtime": 86483.7904, "train_tokens_per_second": 3485.331 }, { "epoch": 1.2173511158061534, "grad_norm": 0.294921875, "learning_rate": 0.000407704370852896, "loss": 2.3453, "num_input_tokens_seen": 302080000, "step": 4610, "train_runtime": 86601.8135, "train_tokens_per_second": 3488.149 }, { "epoch": 1.2199920771160702, "grad_norm": 0.30859375, "learning_rate": 0.00040728924516027676, "loss": 2.318, "num_input_tokens_seen": 302735360, "step": 4620, "train_runtime": 86719.9873, "train_tokens_per_second": 3490.953 }, { "epoch": 1.2226330384259871, "grad_norm": 0.30078125, "learning_rate": 0.000406873400386927, "loss": 2.2749, "num_input_tokens_seen": 303390720, "step": 4630, "train_runtime": 86841.9478, "train_tokens_per_second": 3493.596 }, { "epoch": 1.225273999735904, "grad_norm": 0.296875, "learning_rate": 0.00040645683843396817, "loss": 2.2283, "num_input_tokens_seen": 304046080, "step": 4640, "train_runtime": 86962.7992, "train_tokens_per_second": 3496.278 }, { "epoch": 1.2279149610458207, "grad_norm": 0.31640625, "learning_rate": 0.00040603956120580044, "loss": 2.27, "num_input_tokens_seen": 304701440, "step": 4650, "train_runtime": 87081.6875, "train_tokens_per_second": 3499.03 }, { "epoch": 1.2305559223557374, "grad_norm": 0.3203125, "learning_rate": 0.00040562157061009417, "loss": 2.3199, "num_input_tokens_seen": 305356800, "step": 4660, "train_runtime": 87200.158, "train_tokens_per_second": 3501.792 }, { "epoch": 1.2331968836656544, "grad_norm": 0.3203125, "learning_rate": 0.0004052028685577809, "loss": 2.2438, "num_input_tokens_seen": 306012160, "step": 4670, "train_runtime": 87317.976, "train_tokens_per_second": 3504.572 }, { "epoch": 1.2358378449755711, "grad_norm": 0.30078125, "learning_rate": 0.0004047834569630447, "loss": 2.2876, "num_input_tokens_seen": 306667520, "step": 4680, "train_runtime": 87436.9012, "train_tokens_per_second": 3507.301 }, { "epoch": 1.238478806285488, "grad_norm": 0.306640625, "learning_rate": 0.0004043633377433136, "loss": 2.3233, "num_input_tokens_seen": 307322880, "step": 4690, "train_runtime": 87555.5652, "train_tokens_per_second": 3510.033 }, { "epoch": 1.2411197675954047, "grad_norm": 0.296875, "learning_rate": 0.00040394251281925046, "loss": 2.2602, "num_input_tokens_seen": 307978240, "step": 4700, "train_runtime": 87674.0469, "train_tokens_per_second": 3512.764 }, { "epoch": 1.2437607289053216, "grad_norm": 0.314453125, "learning_rate": 0.0004035209841147448, "loss": 2.3335, "num_input_tokens_seen": 308633600, "step": 4710, "train_runtime": 87792.5632, "train_tokens_per_second": 3515.487 }, { "epoch": 1.2464016902152384, "grad_norm": 0.322265625, "learning_rate": 0.0004030987535569032, "loss": 2.2491, "num_input_tokens_seen": 309288960, "step": 4720, "train_runtime": 87910.5629, "train_tokens_per_second": 3518.223 }, { "epoch": 1.2490426515251551, "grad_norm": 0.326171875, "learning_rate": 0.00040267582307604115, "loss": 2.3001, "num_input_tokens_seen": 309944320, "step": 4730, "train_runtime": 88028.8102, "train_tokens_per_second": 3520.942 }, { "epoch": 1.251683612835072, "grad_norm": 0.3203125, "learning_rate": 0.000402252194605674, "loss": 2.3019, "num_input_tokens_seen": 310599680, "step": 4740, "train_runtime": 88147.4335, "train_tokens_per_second": 3523.638 }, { "epoch": 1.2543245741449889, "grad_norm": 0.306640625, "learning_rate": 0.000401827870082508, "loss": 2.2112, "num_input_tokens_seen": 311255040, "step": 4750, "train_runtime": 88269.8181, "train_tokens_per_second": 3526.177 }, { "epoch": 1.2569655354549056, "grad_norm": 0.29296875, "learning_rate": 0.0004014028514464315, "loss": 2.2837, "num_input_tokens_seen": 311910400, "step": 4760, "train_runtime": 88388.6042, "train_tokens_per_second": 3528.853 }, { "epoch": 1.2596064967648224, "grad_norm": 0.34375, "learning_rate": 0.0004009771406405064, "loss": 2.2713, "num_input_tokens_seen": 312565760, "step": 4770, "train_runtime": 88507.0596, "train_tokens_per_second": 3531.535 }, { "epoch": 1.2622474580747391, "grad_norm": 0.314453125, "learning_rate": 0.0004005507396109586, "loss": 2.3538, "num_input_tokens_seen": 313221120, "step": 4780, "train_runtime": 88630.6163, "train_tokens_per_second": 3534.006 }, { "epoch": 1.264888419384656, "grad_norm": 0.298828125, "learning_rate": 0.0004001236503071699, "loss": 2.3031, "num_input_tokens_seen": 313876480, "step": 4790, "train_runtime": 88749.8818, "train_tokens_per_second": 3536.641 }, { "epoch": 1.2675293806945729, "grad_norm": 0.328125, "learning_rate": 0.0003996958746816684, "loss": 2.3328, "num_input_tokens_seen": 314531840, "step": 4800, "train_runtime": 88868.4023, "train_tokens_per_second": 3539.299 }, { "epoch": 1.2701703420044896, "grad_norm": 0.298828125, "learning_rate": 0.00039926741469012005, "loss": 2.2023, "num_input_tokens_seen": 315187200, "step": 4810, "train_runtime": 88986.5256, "train_tokens_per_second": 3541.965 }, { "epoch": 1.2728113033144064, "grad_norm": 0.30078125, "learning_rate": 0.0003988382722913194, "loss": 2.278, "num_input_tokens_seen": 315842560, "step": 4820, "train_runtime": 89104.5506, "train_tokens_per_second": 3544.629 }, { "epoch": 1.2754522646243234, "grad_norm": 0.3203125, "learning_rate": 0.00039840844944718086, "loss": 2.3375, "num_input_tokens_seen": 316497920, "step": 4830, "train_runtime": 89222.5592, "train_tokens_per_second": 3547.286 }, { "epoch": 1.2780932259342401, "grad_norm": 0.30859375, "learning_rate": 0.00039797794812272957, "loss": 2.3218, "num_input_tokens_seen": 317153280, "step": 4840, "train_runtime": 89340.9536, "train_tokens_per_second": 3549.92 }, { "epoch": 1.2807341872441569, "grad_norm": 0.298828125, "learning_rate": 0.00039754677028609254, "loss": 2.2926, "num_input_tokens_seen": 317808640, "step": 4850, "train_runtime": 89458.543, "train_tokens_per_second": 3552.58 }, { "epoch": 1.2833751485540736, "grad_norm": 0.3046875, "learning_rate": 0.00039711491790848946, "loss": 2.2381, "num_input_tokens_seen": 318464000, "step": 4860, "train_runtime": 89576.8503, "train_tokens_per_second": 3555.204 }, { "epoch": 1.2860161098639904, "grad_norm": 0.302734375, "learning_rate": 0.000396682392964224, "loss": 2.3415, "num_input_tokens_seen": 319119360, "step": 4870, "train_runtime": 89694.933, "train_tokens_per_second": 3557.83 }, { "epoch": 1.2886570711739074, "grad_norm": 0.314453125, "learning_rate": 0.0003962491974306746, "loss": 2.2658, "num_input_tokens_seen": 319774720, "step": 4880, "train_runtime": 89813.2074, "train_tokens_per_second": 3560.442 }, { "epoch": 1.2912980324838241, "grad_norm": 0.310546875, "learning_rate": 0.00039581533328828536, "loss": 2.3088, "num_input_tokens_seen": 320430080, "step": 4890, "train_runtime": 89931.5757, "train_tokens_per_second": 3563.043 }, { "epoch": 1.2939389937937409, "grad_norm": 0.30859375, "learning_rate": 0.000395380802520557, "loss": 2.3227, "num_input_tokens_seen": 321085440, "step": 4900, "train_runtime": 90049.479, "train_tokens_per_second": 3565.656 }, { "epoch": 1.2965799551036579, "grad_norm": 0.306640625, "learning_rate": 0.000394945607114038, "loss": 2.3324, "num_input_tokens_seen": 321740800, "step": 4910, "train_runtime": 90167.7361, "train_tokens_per_second": 3568.248 }, { "epoch": 1.2992209164135746, "grad_norm": 0.291015625, "learning_rate": 0.0003945097490583153, "loss": 2.2403, "num_input_tokens_seen": 322396160, "step": 4920, "train_runtime": 90285.8825, "train_tokens_per_second": 3570.837 }, { "epoch": 1.3018618777234914, "grad_norm": 0.306640625, "learning_rate": 0.0003940732303460053, "loss": 2.2773, "num_input_tokens_seen": 323051520, "step": 4930, "train_runtime": 90407.5925, "train_tokens_per_second": 3573.279 }, { "epoch": 1.3045028390334081, "grad_norm": 0.302734375, "learning_rate": 0.00039363605297274473, "loss": 2.307, "num_input_tokens_seen": 323706880, "step": 4940, "train_runtime": 90526.3987, "train_tokens_per_second": 3575.829 }, { "epoch": 1.307143800343325, "grad_norm": 0.298828125, "learning_rate": 0.00039319821893718163, "loss": 2.2994, "num_input_tokens_seen": 324362240, "step": 4950, "train_runtime": 90644.4098, "train_tokens_per_second": 3578.403 }, { "epoch": 1.3097847616532419, "grad_norm": 0.30078125, "learning_rate": 0.0003927597302409658, "loss": 2.2691, "num_input_tokens_seen": 325017600, "step": 4960, "train_runtime": 90762.6029, "train_tokens_per_second": 3580.964 }, { "epoch": 1.3124257229631586, "grad_norm": 0.30078125, "learning_rate": 0.00039232058888874033, "loss": 2.2588, "num_input_tokens_seen": 325672960, "step": 4970, "train_runtime": 90880.8201, "train_tokens_per_second": 3583.517 }, { "epoch": 1.3150666842730754, "grad_norm": 0.302734375, "learning_rate": 0.0003918807968881318, "loss": 2.216, "num_input_tokens_seen": 326328320, "step": 4980, "train_runtime": 90998.7304, "train_tokens_per_second": 3586.076 }, { "epoch": 1.3177076455829921, "grad_norm": 0.30078125, "learning_rate": 0.0003914403562497415, "loss": 2.3125, "num_input_tokens_seen": 326983680, "step": 4990, "train_runtime": 91119.3008, "train_tokens_per_second": 3588.523 }, { "epoch": 1.320348606892909, "grad_norm": 0.294921875, "learning_rate": 0.0003909992689871361, "loss": 2.3088, "num_input_tokens_seen": 327639040, "step": 5000, "train_runtime": 91239.2357, "train_tokens_per_second": 3590.988 }, { "epoch": 1.3229895682028259, "grad_norm": 0.294921875, "learning_rate": 0.0003905575371168383, "loss": 2.2613, "num_input_tokens_seen": 328294400, "step": 5010, "train_runtime": 91372.8031, "train_tokens_per_second": 3592.912 }, { "epoch": 1.3256305295127426, "grad_norm": 0.310546875, "learning_rate": 0.00039011516265831804, "loss": 2.3633, "num_input_tokens_seen": 328949760, "step": 5020, "train_runtime": 91492.3125, "train_tokens_per_second": 3595.381 }, { "epoch": 1.3282714908226594, "grad_norm": 0.302734375, "learning_rate": 0.00038967214763398265, "loss": 2.3284, "num_input_tokens_seen": 329605120, "step": 5030, "train_runtime": 91612.4203, "train_tokens_per_second": 3597.821 }, { "epoch": 1.3309124521325764, "grad_norm": 0.3046875, "learning_rate": 0.0003892284940691682, "loss": 2.2438, "num_input_tokens_seen": 330260480, "step": 5040, "train_runtime": 91732.8683, "train_tokens_per_second": 3600.242 }, { "epoch": 1.3335534134424931, "grad_norm": 0.3046875, "learning_rate": 0.00038878420399212987, "loss": 2.3119, "num_input_tokens_seen": 330915840, "step": 5050, "train_runtime": 91852.984, "train_tokens_per_second": 3602.668 }, { "epoch": 1.3361943747524099, "grad_norm": 0.298828125, "learning_rate": 0.0003883392794340328, "loss": 2.2153, "num_input_tokens_seen": 331571200, "step": 5060, "train_runtime": 91971.4591, "train_tokens_per_second": 3605.153 }, { "epoch": 1.3388353360623266, "grad_norm": 0.314453125, "learning_rate": 0.0003878937224289429, "loss": 2.2827, "num_input_tokens_seen": 332226560, "step": 5070, "train_runtime": 92089.9404, "train_tokens_per_second": 3607.631 }, { "epoch": 1.3414762973722434, "grad_norm": 0.294921875, "learning_rate": 0.0003874475350138171, "loss": 2.2723, "num_input_tokens_seen": 332881920, "step": 5080, "train_runtime": 92208.5982, "train_tokens_per_second": 3610.096 }, { "epoch": 1.3441172586821604, "grad_norm": 0.291015625, "learning_rate": 0.0003870007192284949, "loss": 2.319, "num_input_tokens_seen": 333537280, "step": 5090, "train_runtime": 92329.1823, "train_tokens_per_second": 3612.48 }, { "epoch": 1.3467582199920771, "grad_norm": 0.296875, "learning_rate": 0.0003865532771156882, "loss": 2.2392, "num_input_tokens_seen": 334192640, "step": 5100, "train_runtime": 92449.0591, "train_tokens_per_second": 3614.884 }, { "epoch": 1.3493991813019939, "grad_norm": 0.30078125, "learning_rate": 0.00038610521072097217, "loss": 2.2927, "num_input_tokens_seen": 334848000, "step": 5110, "train_runtime": 92567.0339, "train_tokens_per_second": 3617.357 }, { "epoch": 1.3520401426119109, "grad_norm": 0.6328125, "learning_rate": 0.00038565652209277636, "loss": 2.3171, "num_input_tokens_seen": 335503360, "step": 5120, "train_runtime": 92685.5042, "train_tokens_per_second": 3619.804 }, { "epoch": 1.3546811039218276, "grad_norm": 0.328125, "learning_rate": 0.0003852072132823747, "loss": 2.2479, "num_input_tokens_seen": 336158720, "step": 5130, "train_runtime": 92806.5462, "train_tokens_per_second": 3622.144 }, { "epoch": 1.3573220652317444, "grad_norm": 0.35546875, "learning_rate": 0.0003847572863438766, "loss": 2.3483, "num_input_tokens_seen": 336814080, "step": 5140, "train_runtime": 92926.2734, "train_tokens_per_second": 3624.53 }, { "epoch": 1.3599630265416611, "grad_norm": 0.3203125, "learning_rate": 0.0003843067433342173, "loss": 2.3469, "num_input_tokens_seen": 337469440, "step": 5150, "train_runtime": 93045.1657, "train_tokens_per_second": 3626.942 }, { "epoch": 1.3626039878515779, "grad_norm": 0.337890625, "learning_rate": 0.00038385558631314853, "loss": 2.3005, "num_input_tokens_seen": 338124800, "step": 5160, "train_runtime": 93162.7513, "train_tokens_per_second": 3629.399 }, { "epoch": 1.3652449491614949, "grad_norm": 0.30859375, "learning_rate": 0.0003834038173432292, "loss": 2.288, "num_input_tokens_seen": 338780160, "step": 5170, "train_runtime": 93280.87, "train_tokens_per_second": 3631.829 }, { "epoch": 1.3678859104714116, "grad_norm": 0.30078125, "learning_rate": 0.00038295143848981566, "loss": 2.3067, "num_input_tokens_seen": 339435520, "step": 5180, "train_runtime": 93398.4602, "train_tokens_per_second": 3634.273 }, { "epoch": 1.3705268717813284, "grad_norm": 0.287109375, "learning_rate": 0.00038249845182105257, "loss": 2.2299, "num_input_tokens_seen": 340090880, "step": 5190, "train_runtime": 93516.5785, "train_tokens_per_second": 3636.691 }, { "epoch": 1.3731678330912453, "grad_norm": 0.30078125, "learning_rate": 0.0003820448594078635, "loss": 2.2228, "num_input_tokens_seen": 340746240, "step": 5200, "train_runtime": 93635.4301, "train_tokens_per_second": 3639.074 }, { "epoch": 1.375808794401162, "grad_norm": 0.333984375, "learning_rate": 0.0003815906633239411, "loss": 2.2324, "num_input_tokens_seen": 341401600, "step": 5210, "train_runtime": 93753.8366, "train_tokens_per_second": 3641.468 }, { "epoch": 1.3784497557110789, "grad_norm": 0.298828125, "learning_rate": 0.000381135865645738, "loss": 2.1961, "num_input_tokens_seen": 342056960, "step": 5220, "train_runtime": 93871.0038, "train_tokens_per_second": 3643.904 }, { "epoch": 1.3810907170209956, "grad_norm": 0.298828125, "learning_rate": 0.0003806804684524568, "loss": 2.2648, "num_input_tokens_seen": 342712320, "step": 5230, "train_runtime": 93989.121, "train_tokens_per_second": 3646.298 }, { "epoch": 1.3837316783309124, "grad_norm": 0.306640625, "learning_rate": 0.0003802244738260414, "loss": 2.3501, "num_input_tokens_seen": 343367680, "step": 5240, "train_runtime": 94107.614, "train_tokens_per_second": 3648.671 }, { "epoch": 1.3863726396408294, "grad_norm": 0.318359375, "learning_rate": 0.00037976788385116666, "loss": 2.3171, "num_input_tokens_seen": 344023040, "step": 5250, "train_runtime": 94227.9178, "train_tokens_per_second": 3650.967 }, { "epoch": 1.389013600950746, "grad_norm": 0.3046875, "learning_rate": 0.0003793107006152293, "loss": 2.274, "num_input_tokens_seen": 344678400, "step": 5260, "train_runtime": 94347.6, "train_tokens_per_second": 3653.282 }, { "epoch": 1.3916545622606629, "grad_norm": 0.302734375, "learning_rate": 0.00037885292620833827, "loss": 2.3022, "num_input_tokens_seen": 345333760, "step": 5270, "train_runtime": 94467.1961, "train_tokens_per_second": 3655.594 }, { "epoch": 1.3942955235705796, "grad_norm": 0.3046875, "learning_rate": 0.0003783945627233052, "loss": 2.303, "num_input_tokens_seen": 345989120, "step": 5280, "train_runtime": 94586.2929, "train_tokens_per_second": 3657.92 }, { "epoch": 1.3969364848804964, "grad_norm": 0.322265625, "learning_rate": 0.0003779356122556347, "loss": 2.2225, "num_input_tokens_seen": 346644480, "step": 5290, "train_runtime": 94704.221, "train_tokens_per_second": 3660.285 }, { "epoch": 1.3995774461904134, "grad_norm": 0.310546875, "learning_rate": 0.0003774760769035148, "loss": 2.2395, "num_input_tokens_seen": 347299840, "step": 5300, "train_runtime": 94823.0943, "train_tokens_per_second": 3662.608 }, { "epoch": 1.40221840750033, "grad_norm": 0.294921875, "learning_rate": 0.00037701595876780795, "loss": 2.2575, "num_input_tokens_seen": 347955200, "step": 5310, "train_runtime": 94941.8464, "train_tokens_per_second": 3664.93 }, { "epoch": 1.4048593688102469, "grad_norm": 0.294921875, "learning_rate": 0.0003765552599520404, "loss": 2.267, "num_input_tokens_seen": 348610560, "step": 5320, "train_runtime": 95062.3926, "train_tokens_per_second": 3667.176 }, { "epoch": 1.4075003301201638, "grad_norm": 0.30859375, "learning_rate": 0.0003760939825623933, "loss": 2.2573, "num_input_tokens_seen": 349265920, "step": 5330, "train_runtime": 95181.5486, "train_tokens_per_second": 3669.471 }, { "epoch": 1.4101412914300806, "grad_norm": 0.28125, "learning_rate": 0.00037563212870769287, "loss": 2.259, "num_input_tokens_seen": 349921280, "step": 5340, "train_runtime": 95301.5728, "train_tokens_per_second": 3671.726 }, { "epoch": 1.4127822527399974, "grad_norm": 0.287109375, "learning_rate": 0.0003751697004994008, "loss": 2.2742, "num_input_tokens_seen": 350576640, "step": 5350, "train_runtime": 95419.7779, "train_tokens_per_second": 3674.046 }, { "epoch": 1.4154232140499141, "grad_norm": 0.31640625, "learning_rate": 0.0003747067000516044, "loss": 2.2791, "num_input_tokens_seen": 351232000, "step": 5360, "train_runtime": 95538.2795, "train_tokens_per_second": 3676.348 }, { "epoch": 1.4180641753598309, "grad_norm": 0.29296875, "learning_rate": 0.0003742431294810073, "loss": 2.22, "num_input_tokens_seen": 351887360, "step": 5370, "train_runtime": 95656.9994, "train_tokens_per_second": 3678.637 }, { "epoch": 1.4207051366697478, "grad_norm": 0.30859375, "learning_rate": 0.00037377899090691936, "loss": 2.3062, "num_input_tokens_seen": 352542720, "step": 5380, "train_runtime": 95775.7809, "train_tokens_per_second": 3680.917 }, { "epoch": 1.4233460979796646, "grad_norm": 0.296875, "learning_rate": 0.00037331428645124735, "loss": 2.2998, "num_input_tokens_seen": 353198080, "step": 5390, "train_runtime": 95896.8801, "train_tokens_per_second": 3683.103 }, { "epoch": 1.4259870592895814, "grad_norm": 0.298828125, "learning_rate": 0.0003728490182384851, "loss": 2.285, "num_input_tokens_seen": 353853440, "step": 5400, "train_runtime": 96017.8895, "train_tokens_per_second": 3685.287 }, { "epoch": 1.4286280205994983, "grad_norm": 0.3125, "learning_rate": 0.00037238318839570355, "loss": 2.2884, "num_input_tokens_seen": 354508800, "step": 5410, "train_runtime": 96137.8059, "train_tokens_per_second": 3687.507 }, { "epoch": 1.431268981909415, "grad_norm": 0.30859375, "learning_rate": 0.00037191679905254155, "loss": 2.2288, "num_input_tokens_seen": 355164160, "step": 5420, "train_runtime": 96256.2674, "train_tokens_per_second": 3689.777 }, { "epoch": 1.4339099432193319, "grad_norm": 0.29296875, "learning_rate": 0.00037144985234119555, "loss": 2.2367, "num_input_tokens_seen": 355819520, "step": 5430, "train_runtime": 96374.0976, "train_tokens_per_second": 3692.066 }, { "epoch": 1.4365509045292486, "grad_norm": 0.2890625, "learning_rate": 0.0003709823503964103, "loss": 2.2153, "num_input_tokens_seen": 356474880, "step": 5440, "train_runtime": 96493.1509, "train_tokens_per_second": 3694.302 }, { "epoch": 1.4391918658391654, "grad_norm": 0.30859375, "learning_rate": 0.0003705142953554689, "loss": 2.259, "num_input_tokens_seen": 357130240, "step": 5450, "train_runtime": 96611.7007, "train_tokens_per_second": 3696.553 }, { "epoch": 1.4418328271490823, "grad_norm": 0.275390625, "learning_rate": 0.00037004568935818295, "loss": 2.2649, "num_input_tokens_seen": 357785600, "step": 5460, "train_runtime": 96730.8038, "train_tokens_per_second": 3698.776 }, { "epoch": 1.444473788458999, "grad_norm": 0.3046875, "learning_rate": 0.000369576534546883, "loss": 2.2521, "num_input_tokens_seen": 358440960, "step": 5470, "train_runtime": 96851.4334, "train_tokens_per_second": 3700.936 }, { "epoch": 1.4471147497689159, "grad_norm": 0.287109375, "learning_rate": 0.00036910683306640846, "loss": 2.2253, "num_input_tokens_seen": 359096320, "step": 5480, "train_runtime": 96972.9536, "train_tokens_per_second": 3703.056 }, { "epoch": 1.4497557110788328, "grad_norm": 0.296875, "learning_rate": 0.00036863658706409806, "loss": 2.2428, "num_input_tokens_seen": 359751680, "step": 5490, "train_runtime": 97092.7885, "train_tokens_per_second": 3705.236 }, { "epoch": 1.4523966723887496, "grad_norm": 0.310546875, "learning_rate": 0.0003681657986897799, "loss": 2.2753, "num_input_tokens_seen": 360407040, "step": 5500, "train_runtime": 97211.0919, "train_tokens_per_second": 3707.468 }, { "epoch": 1.4550376336986663, "grad_norm": 0.30078125, "learning_rate": 0.00036769447009576164, "loss": 2.2894, "num_input_tokens_seen": 361062400, "step": 5510, "train_runtime": 97341.7182, "train_tokens_per_second": 3709.226 }, { "epoch": 1.457678595008583, "grad_norm": 0.2890625, "learning_rate": 0.0003672226034368207, "loss": 2.2631, "num_input_tokens_seen": 361717760, "step": 5520, "train_runtime": 97463.0139, "train_tokens_per_second": 3711.334 }, { "epoch": 1.4603195563184999, "grad_norm": 0.29296875, "learning_rate": 0.0003667502008701943, "loss": 2.287, "num_input_tokens_seen": 362373120, "step": 5530, "train_runtime": 97583.3565, "train_tokens_per_second": 3713.473 }, { "epoch": 1.4629605176284168, "grad_norm": 0.294921875, "learning_rate": 0.00036627726455556976, "loss": 2.2132, "num_input_tokens_seen": 363028480, "step": 5540, "train_runtime": 97702.306, "train_tokens_per_second": 3715.659 }, { "epoch": 1.4656014789383336, "grad_norm": 0.302734375, "learning_rate": 0.0003658037966550746, "loss": 2.2597, "num_input_tokens_seen": 363683840, "step": 5550, "train_runtime": 97820.7458, "train_tokens_per_second": 3717.86 }, { "epoch": 1.4682424402482503, "grad_norm": 0.29296875, "learning_rate": 0.00036532979933326626, "loss": 2.2171, "num_input_tokens_seen": 364339200, "step": 5560, "train_runtime": 97939.1074, "train_tokens_per_second": 3720.058 }, { "epoch": 1.470883401558167, "grad_norm": 0.302734375, "learning_rate": 0.000364855274757123, "loss": 2.1656, "num_input_tokens_seen": 364994560, "step": 5570, "train_runtime": 98059.806, "train_tokens_per_second": 3722.163 }, { "epoch": 1.4735243628680839, "grad_norm": 0.306640625, "learning_rate": 0.00036438022509603326, "loss": 2.2898, "num_input_tokens_seen": 365649920, "step": 5580, "train_runtime": 98180.6303, "train_tokens_per_second": 3724.257 }, { "epoch": 1.4761653241780008, "grad_norm": 0.283203125, "learning_rate": 0.00036390465252178597, "loss": 2.2011, "num_input_tokens_seen": 366305280, "step": 5590, "train_runtime": 98299.989, "train_tokens_per_second": 3726.402 }, { "epoch": 1.4788062854879176, "grad_norm": 0.302734375, "learning_rate": 0.00036342855920856086, "loss": 2.2916, "num_input_tokens_seen": 366960640, "step": 5600, "train_runtime": 98418.0582, "train_tokens_per_second": 3728.591 }, { "epoch": 1.4814472467978343, "grad_norm": 0.298828125, "learning_rate": 0.00036295194733291825, "loss": 2.2948, "num_input_tokens_seen": 367616000, "step": 5610, "train_runtime": 98537.2337, "train_tokens_per_second": 3730.732 }, { "epoch": 1.4840882081077513, "grad_norm": 0.29296875, "learning_rate": 0.00036247481907378915, "loss": 2.2105, "num_input_tokens_seen": 368271360, "step": 5620, "train_runtime": 98658.9979, "train_tokens_per_second": 3732.77 }, { "epoch": 1.486729169417668, "grad_norm": 0.314453125, "learning_rate": 0.0003619971766124653, "loss": 2.2325, "num_input_tokens_seen": 368926720, "step": 5630, "train_runtime": 98782.0276, "train_tokens_per_second": 3734.755 }, { "epoch": 1.4893701307275848, "grad_norm": 0.291015625, "learning_rate": 0.0003615190221325893, "loss": 2.239, "num_input_tokens_seen": 369582080, "step": 5640, "train_runtime": 98899.572, "train_tokens_per_second": 3736.943 }, { "epoch": 1.4920110920375016, "grad_norm": 0.294921875, "learning_rate": 0.0003610403578201445, "loss": 2.226, "num_input_tokens_seen": 370237440, "step": 5650, "train_runtime": 99018.6627, "train_tokens_per_second": 3739.067 }, { "epoch": 1.4946520533474184, "grad_norm": 0.279296875, "learning_rate": 0.00036056118586344504, "loss": 2.2243, "num_input_tokens_seen": 370892800, "step": 5660, "train_runtime": 99137.1022, "train_tokens_per_second": 3741.211 }, { "epoch": 1.4972930146573353, "grad_norm": 0.296875, "learning_rate": 0.00036008150845312595, "loss": 2.202, "num_input_tokens_seen": 371548160, "step": 5670, "train_runtime": 99254.9505, "train_tokens_per_second": 3743.372 }, { "epoch": 1.499933975967252, "grad_norm": 0.32421875, "learning_rate": 0.00035960132778213295, "loss": 2.2371, "num_input_tokens_seen": 372203520, "step": 5680, "train_runtime": 99373.1878, "train_tokens_per_second": 3745.513 }, { "epoch": 1.5025749372771688, "grad_norm": 0.291015625, "learning_rate": 0.00035912064604571247, "loss": 2.2658, "num_input_tokens_seen": 372858880, "step": 5690, "train_runtime": 99491.7647, "train_tokens_per_second": 3747.636 }, { "epoch": 1.5052158985870858, "grad_norm": 0.3046875, "learning_rate": 0.00035863946544140184, "loss": 2.1924, "num_input_tokens_seen": 373514240, "step": 5700, "train_runtime": 99610.9288, "train_tokens_per_second": 3749.732 }, { "epoch": 1.5078568598970024, "grad_norm": 0.306640625, "learning_rate": 0.00035815778816901904, "loss": 2.256, "num_input_tokens_seen": 374169600, "step": 5710, "train_runtime": 99730.1745, "train_tokens_per_second": 3751.819 }, { "epoch": 1.5104978212069193, "grad_norm": 0.306640625, "learning_rate": 0.00035767561643065257, "loss": 2.1976, "num_input_tokens_seen": 374824960, "step": 5720, "train_runtime": 99848.9497, "train_tokens_per_second": 3753.92 }, { "epoch": 1.513138782516836, "grad_norm": 0.29296875, "learning_rate": 0.0003571929524306515, "loss": 2.2108, "num_input_tokens_seen": 375480320, "step": 5730, "train_runtime": 99967.2615, "train_tokens_per_second": 3756.033 }, { "epoch": 1.5157797438267528, "grad_norm": 0.294921875, "learning_rate": 0.0003567097983756153, "loss": 2.3204, "num_input_tokens_seen": 376135680, "step": 5740, "train_runtime": 100085.7354, "train_tokens_per_second": 3758.135 }, { "epoch": 1.5184207051366698, "grad_norm": 0.3046875, "learning_rate": 0.00035622615647438425, "loss": 2.2411, "num_input_tokens_seen": 376791040, "step": 5750, "train_runtime": 100204.6838, "train_tokens_per_second": 3760.214 }, { "epoch": 1.5210616664465866, "grad_norm": 0.29296875, "learning_rate": 0.00035574202893802833, "loss": 2.1838, "num_input_tokens_seen": 377446400, "step": 5760, "train_runtime": 100323.8015, "train_tokens_per_second": 3762.282 }, { "epoch": 1.5237026277565033, "grad_norm": 0.279296875, "learning_rate": 0.000355257417979838, "loss": 2.28, "num_input_tokens_seen": 378101760, "step": 5770, "train_runtime": 100444.7715, "train_tokens_per_second": 3764.275 }, { "epoch": 1.5263435890664203, "grad_norm": 0.287109375, "learning_rate": 0.0003547723258153138, "loss": 2.2991, "num_input_tokens_seen": 378757120, "step": 5780, "train_runtime": 100563.6927, "train_tokens_per_second": 3766.341 }, { "epoch": 1.5289845503763368, "grad_norm": 0.32421875, "learning_rate": 0.0003542867546621563, "loss": 2.3799, "num_input_tokens_seen": 379412480, "step": 5790, "train_runtime": 100682.5497, "train_tokens_per_second": 3768.404 }, { "epoch": 1.5316255116862538, "grad_norm": 0.29296875, "learning_rate": 0.0003538007067402556, "loss": 2.2693, "num_input_tokens_seen": 380067840, "step": 5800, "train_runtime": 100803.4359, "train_tokens_per_second": 3770.386 }, { "epoch": 1.5342664729961706, "grad_norm": 0.29296875, "learning_rate": 0.0003533141842716816, "loss": 2.2448, "num_input_tokens_seen": 380723200, "step": 5810, "train_runtime": 100922.0068, "train_tokens_per_second": 3772.45 }, { "epoch": 1.5369074343060873, "grad_norm": 0.30078125, "learning_rate": 0.0003528271894806737, "loss": 2.2343, "num_input_tokens_seen": 381378560, "step": 5820, "train_runtime": 101041.4132, "train_tokens_per_second": 3774.478 }, { "epoch": 1.5395483956160043, "grad_norm": 0.3046875, "learning_rate": 0.00035233972459363056, "loss": 2.2402, "num_input_tokens_seen": 382033920, "step": 5830, "train_runtime": 101159.5687, "train_tokens_per_second": 3776.548 }, { "epoch": 1.542189356925921, "grad_norm": 0.287109375, "learning_rate": 0.0003518517918391001, "loss": 2.2691, "num_input_tokens_seen": 382689280, "step": 5840, "train_runtime": 101278.5167, "train_tokens_per_second": 3778.583 }, { "epoch": 1.5448303182358378, "grad_norm": 0.29296875, "learning_rate": 0.000351363393447769, "loss": 2.219, "num_input_tokens_seen": 383344640, "step": 5850, "train_runtime": 101397.4895, "train_tokens_per_second": 3780.613 }, { "epoch": 1.5474712795457548, "grad_norm": 0.291015625, "learning_rate": 0.0003508745316524528, "loss": 2.2402, "num_input_tokens_seen": 384000000, "step": 5860, "train_runtime": 101516.2815, "train_tokens_per_second": 3782.644 }, { "epoch": 1.5501122408556713, "grad_norm": 0.29296875, "learning_rate": 0.00035038520868808573, "loss": 2.2339, "num_input_tokens_seen": 384655360, "step": 5870, "train_runtime": 101634.5334, "train_tokens_per_second": 3784.692 }, { "epoch": 1.5527532021655883, "grad_norm": 0.30078125, "learning_rate": 0.00034989542679171007, "loss": 2.2025, "num_input_tokens_seen": 385310720, "step": 5880, "train_runtime": 101752.9132, "train_tokens_per_second": 3786.729 }, { "epoch": 1.555394163475505, "grad_norm": 0.306640625, "learning_rate": 0.0003494051882024665, "loss": 2.2256, "num_input_tokens_seen": 385966080, "step": 5890, "train_runtime": 101871.2073, "train_tokens_per_second": 3788.765 }, { "epoch": 1.5580351247854218, "grad_norm": 0.294921875, "learning_rate": 0.00034891449516158326, "loss": 2.1937, "num_input_tokens_seen": 386621440, "step": 5900, "train_runtime": 101989.8338, "train_tokens_per_second": 3790.784 }, { "epoch": 1.5606760860953388, "grad_norm": 0.298828125, "learning_rate": 0.0003484233499123665, "loss": 2.2708, "num_input_tokens_seen": 387276800, "step": 5910, "train_runtime": 102110.8845, "train_tokens_per_second": 3792.708 }, { "epoch": 1.5633170474052556, "grad_norm": 0.2890625, "learning_rate": 0.0003479317547001895, "loss": 2.2746, "num_input_tokens_seen": 387932160, "step": 5920, "train_runtime": 102229.8036, "train_tokens_per_second": 3794.707 }, { "epoch": 1.5659580087151723, "grad_norm": 0.28515625, "learning_rate": 0.0003474397117724829, "loss": 2.1972, "num_input_tokens_seen": 388587520, "step": 5930, "train_runtime": 102348.6911, "train_tokens_per_second": 3796.702 }, { "epoch": 1.568598970025089, "grad_norm": 0.291015625, "learning_rate": 0.0003469472233787238, "loss": 2.1999, "num_input_tokens_seen": 389242880, "step": 5940, "train_runtime": 102466.9399, "train_tokens_per_second": 3798.717 }, { "epoch": 1.5712399313350058, "grad_norm": 0.296875, "learning_rate": 0.0003464542917704262, "loss": 2.2364, "num_input_tokens_seen": 389898240, "step": 5950, "train_runtime": 102585.4476, "train_tokens_per_second": 3800.717 }, { "epoch": 1.5738808926449228, "grad_norm": 0.298828125, "learning_rate": 0.0003459609192011301, "loss": 2.2147, "num_input_tokens_seen": 390553600, "step": 5960, "train_runtime": 102704.0399, "train_tokens_per_second": 3802.709 }, { "epoch": 1.5765218539548396, "grad_norm": 0.302734375, "learning_rate": 0.00034546710792639164, "loss": 2.2406, "num_input_tokens_seen": 391208960, "step": 5970, "train_runtime": 102822.2437, "train_tokens_per_second": 3804.711 }, { "epoch": 1.5791628152647563, "grad_norm": 0.283203125, "learning_rate": 0.00034497286020377245, "loss": 2.2814, "num_input_tokens_seen": 391864320, "step": 5980, "train_runtime": 102940.9894, "train_tokens_per_second": 3806.689 }, { "epoch": 1.5818037765746733, "grad_norm": 0.29296875, "learning_rate": 0.00034447817829282945, "loss": 2.2857, "num_input_tokens_seen": 392519680, "step": 5990, "train_runtime": 103059.3455, "train_tokens_per_second": 3808.676 }, { "epoch": 1.5844447378845898, "grad_norm": 0.2890625, "learning_rate": 0.0003439830644551048, "loss": 2.2429, "num_input_tokens_seen": 393175040, "step": 6000, "train_runtime": 103177.906, "train_tokens_per_second": 3810.651 }, { "epoch": 1.5870856991945068, "grad_norm": 0.294921875, "learning_rate": 0.00034348752095411493, "loss": 2.2563, "num_input_tokens_seen": 393830400, "step": 6010, "train_runtime": 103310.1462, "train_tokens_per_second": 3812.117 }, { "epoch": 1.5897266605044236, "grad_norm": 0.310546875, "learning_rate": 0.00034299155005534086, "loss": 2.2445, "num_input_tokens_seen": 394485760, "step": 6020, "train_runtime": 103429.4102, "train_tokens_per_second": 3814.058 }, { "epoch": 1.5923676218143403, "grad_norm": 0.318359375, "learning_rate": 0.00034249515402621746, "loss": 2.2211, "num_input_tokens_seen": 395141120, "step": 6030, "train_runtime": 103548.2987, "train_tokens_per_second": 3816.008 }, { "epoch": 1.5950085831242573, "grad_norm": 0.34765625, "learning_rate": 0.000341998335136123, "loss": 2.3039, "num_input_tokens_seen": 395796480, "step": 6040, "train_runtime": 103666.3992, "train_tokens_per_second": 3817.982 }, { "epoch": 1.597649544434174, "grad_norm": 0.30078125, "learning_rate": 0.00034150109565636924, "loss": 2.1969, "num_input_tokens_seen": 396451840, "step": 6050, "train_runtime": 103786.543, "train_tokens_per_second": 3819.877 }, { "epoch": 1.6002905057440908, "grad_norm": 0.279296875, "learning_rate": 0.0003410034378601906, "loss": 2.2174, "num_input_tokens_seen": 397107200, "step": 6060, "train_runtime": 103907.9855, "train_tokens_per_second": 3821.72 }, { "epoch": 1.6029314670540078, "grad_norm": 0.296875, "learning_rate": 0.00034050536402273384, "loss": 2.2401, "num_input_tokens_seen": 397762560, "step": 6070, "train_runtime": 104028.9442, "train_tokens_per_second": 3823.576 }, { "epoch": 1.6055724283639243, "grad_norm": 0.28125, "learning_rate": 0.000340006876421048, "loss": 2.2266, "num_input_tokens_seen": 398417920, "step": 6080, "train_runtime": 104149.3397, "train_tokens_per_second": 3825.448 }, { "epoch": 1.6082133896738413, "grad_norm": 0.291015625, "learning_rate": 0.00033950797733407344, "loss": 2.1852, "num_input_tokens_seen": 399073280, "step": 6090, "train_runtime": 104268.6612, "train_tokens_per_second": 3827.356 }, { "epoch": 1.610854350983758, "grad_norm": 0.302734375, "learning_rate": 0.00033900866904263186, "loss": 2.2782, "num_input_tokens_seen": 399728640, "step": 6100, "train_runtime": 104386.4519, "train_tokens_per_second": 3829.315 }, { "epoch": 1.6134953122936748, "grad_norm": 0.298828125, "learning_rate": 0.0003385089538294158, "loss": 2.276, "num_input_tokens_seen": 400384000, "step": 6110, "train_runtime": 104505.1118, "train_tokens_per_second": 3831.238 }, { "epoch": 1.6161362736035918, "grad_norm": 0.298828125, "learning_rate": 0.0003380088339789779, "loss": 2.2243, "num_input_tokens_seen": 401039360, "step": 6120, "train_runtime": 104626.5766, "train_tokens_per_second": 3833.054 }, { "epoch": 1.6187772349135086, "grad_norm": 0.306640625, "learning_rate": 0.00033750831177772076, "loss": 2.2221, "num_input_tokens_seen": 401694720, "step": 6130, "train_runtime": 104744.9254, "train_tokens_per_second": 3834.98 }, { "epoch": 1.6214181962234253, "grad_norm": 0.29296875, "learning_rate": 0.0003370073895138866, "loss": 2.2769, "num_input_tokens_seen": 402350080, "step": 6140, "train_runtime": 104863.6451, "train_tokens_per_second": 3836.888 }, { "epoch": 1.6240591575333423, "grad_norm": 0.2890625, "learning_rate": 0.00033650606947754647, "loss": 2.2258, "num_input_tokens_seen": 403005440, "step": 6150, "train_runtime": 104981.952, "train_tokens_per_second": 3838.807 }, { "epoch": 1.6267001188432588, "grad_norm": 0.291015625, "learning_rate": 0.00033600435396058994, "loss": 2.2582, "num_input_tokens_seen": 403660800, "step": 6160, "train_runtime": 105099.9364, "train_tokens_per_second": 3840.733 }, { "epoch": 1.6293410801531758, "grad_norm": 0.28515625, "learning_rate": 0.0003355022452567144, "loss": 2.2351, "num_input_tokens_seen": 404316160, "step": 6170, "train_runtime": 105221.7088, "train_tokens_per_second": 3842.517 }, { "epoch": 1.6319820414630926, "grad_norm": 0.30859375, "learning_rate": 0.0003349997456614152, "loss": 2.2935, "num_input_tokens_seen": 404971520, "step": 6180, "train_runtime": 105341.9161, "train_tokens_per_second": 3844.353 }, { "epoch": 1.6346230027730093, "grad_norm": 0.283203125, "learning_rate": 0.0003344968574719744, "loss": 2.1989, "num_input_tokens_seen": 405626880, "step": 6190, "train_runtime": 105460.0525, "train_tokens_per_second": 3846.261 }, { "epoch": 1.6372639640829263, "grad_norm": 0.29296875, "learning_rate": 0.00033399358298745067, "loss": 2.2285, "num_input_tokens_seen": 406282240, "step": 6200, "train_runtime": 105578.4506, "train_tokens_per_second": 3848.155 }, { "epoch": 1.639904925392843, "grad_norm": 0.294921875, "learning_rate": 0.0003334899245086687, "loss": 2.1826, "num_input_tokens_seen": 406937600, "step": 6210, "train_runtime": 105697.191, "train_tokens_per_second": 3850.032 }, { "epoch": 1.6425458867027598, "grad_norm": 0.30859375, "learning_rate": 0.0003329858843382089, "loss": 2.2519, "num_input_tokens_seen": 407592960, "step": 6220, "train_runtime": 105815.1638, "train_tokens_per_second": 3851.933 }, { "epoch": 1.6451868480126766, "grad_norm": 0.298828125, "learning_rate": 0.0003324814647803962, "loss": 2.2557, "num_input_tokens_seen": 408248320, "step": 6230, "train_runtime": 105934.0778, "train_tokens_per_second": 3853.796 }, { "epoch": 1.6478278093225933, "grad_norm": 0.279296875, "learning_rate": 0.00033197666814129044, "loss": 2.1799, "num_input_tokens_seen": 408903680, "step": 6240, "train_runtime": 106052.3659, "train_tokens_per_second": 3855.677 }, { "epoch": 1.6504687706325103, "grad_norm": 0.287109375, "learning_rate": 0.0003314714967286753, "loss": 2.267, "num_input_tokens_seen": 409559040, "step": 6250, "train_runtime": 106172.7773, "train_tokens_per_second": 3857.477 }, { "epoch": 1.653109731942427, "grad_norm": 0.283203125, "learning_rate": 0.00033096595285204755, "loss": 2.276, "num_input_tokens_seen": 410214400, "step": 6260, "train_runtime": 106295.8445, "train_tokens_per_second": 3859.176 }, { "epoch": 1.6557506932523438, "grad_norm": 0.291015625, "learning_rate": 0.00033046003882260694, "loss": 2.2193, "num_input_tokens_seen": 410869760, "step": 6270, "train_runtime": 106414.384, "train_tokens_per_second": 3861.036 }, { "epoch": 1.6583916545622608, "grad_norm": 0.283203125, "learning_rate": 0.00032995375695324544, "loss": 2.2459, "num_input_tokens_seen": 411525120, "step": 6280, "train_runtime": 106532.7579, "train_tokens_per_second": 3862.897 }, { "epoch": 1.6610326158721773, "grad_norm": 0.29296875, "learning_rate": 0.00032944710955853663, "loss": 2.2003, "num_input_tokens_seen": 412180480, "step": 6290, "train_runtime": 106651.5193, "train_tokens_per_second": 3864.741 }, { "epoch": 1.6636735771820943, "grad_norm": 0.302734375, "learning_rate": 0.00032894009895472533, "loss": 2.2881, "num_input_tokens_seen": 412835840, "step": 6300, "train_runtime": 106771.4124, "train_tokens_per_second": 3866.539 }, { "epoch": 1.666314538492011, "grad_norm": 0.298828125, "learning_rate": 0.00032843272745971646, "loss": 2.1741, "num_input_tokens_seen": 413491200, "step": 6310, "train_runtime": 106890.3921, "train_tokens_per_second": 3868.366 }, { "epoch": 1.6689554998019278, "grad_norm": 0.27734375, "learning_rate": 0.00032792499739306533, "loss": 2.2267, "num_input_tokens_seen": 414146560, "step": 6320, "train_runtime": 107008.467, "train_tokens_per_second": 3870.222 }, { "epoch": 1.6715964611118448, "grad_norm": 0.3046875, "learning_rate": 0.00032741691107596616, "loss": 2.2488, "num_input_tokens_seen": 414801920, "step": 6330, "train_runtime": 107129.579, "train_tokens_per_second": 3871.964 }, { "epoch": 1.6742374224217615, "grad_norm": 0.291015625, "learning_rate": 0.0003269084708312421, "loss": 2.269, "num_input_tokens_seen": 415457280, "step": 6340, "train_runtime": 107250.1062, "train_tokens_per_second": 3873.724 }, { "epoch": 1.6768783837316783, "grad_norm": 0.294921875, "learning_rate": 0.0003263996789833341, "loss": 2.2739, "num_input_tokens_seen": 416112640, "step": 6350, "train_runtime": 107368.668, "train_tokens_per_second": 3875.55 }, { "epoch": 1.6795193450415953, "grad_norm": 0.283203125, "learning_rate": 0.0003258905378582907, "loss": 2.2184, "num_input_tokens_seen": 416768000, "step": 6360, "train_runtime": 107486.5584, "train_tokens_per_second": 3877.396 }, { "epoch": 1.6821603063515118, "grad_norm": 0.283203125, "learning_rate": 0.0003253810497837572, "loss": 2.2112, "num_input_tokens_seen": 417423360, "step": 6370, "train_runtime": 107604.7235, "train_tokens_per_second": 3879.229 }, { "epoch": 1.6848012676614288, "grad_norm": 0.296875, "learning_rate": 0.0003248712170889651, "loss": 2.1952, "num_input_tokens_seen": 418078720, "step": 6380, "train_runtime": 107722.9922, "train_tokens_per_second": 3881.054 }, { "epoch": 1.6874422289713455, "grad_norm": 0.29296875, "learning_rate": 0.0003243610421047213, "loss": 2.2812, "num_input_tokens_seen": 418734080, "step": 6390, "train_runtime": 107840.9421, "train_tokens_per_second": 3882.886 }, { "epoch": 1.6900831902812623, "grad_norm": 0.310546875, "learning_rate": 0.0003238505271633975, "loss": 2.2397, "num_input_tokens_seen": 419389440, "step": 6400, "train_runtime": 107959.8287, "train_tokens_per_second": 3884.68 }, { "epoch": 1.6927241515911793, "grad_norm": 0.28125, "learning_rate": 0.00032333967459892, "loss": 2.1701, "num_input_tokens_seen": 420044800, "step": 6410, "train_runtime": 108078.5028, "train_tokens_per_second": 3886.479 }, { "epoch": 1.695365112901096, "grad_norm": 0.28125, "learning_rate": 0.00032282848674675796, "loss": 2.2112, "num_input_tokens_seen": 420700160, "step": 6420, "train_runtime": 108196.7229, "train_tokens_per_second": 3888.289 }, { "epoch": 1.6980060742110128, "grad_norm": 0.2890625, "learning_rate": 0.00032231696594391395, "loss": 2.2086, "num_input_tokens_seen": 421355520, "step": 6430, "train_runtime": 108315.3892, "train_tokens_per_second": 3890.08 }, { "epoch": 1.7006470355209298, "grad_norm": 0.283203125, "learning_rate": 0.0003218051145289124, "loss": 2.2006, "num_input_tokens_seen": 422010880, "step": 6440, "train_runtime": 108435.7939, "train_tokens_per_second": 3891.804 }, { "epoch": 1.7032879968308463, "grad_norm": 0.3046875, "learning_rate": 0.00032129293484178925, "loss": 2.3335, "num_input_tokens_seen": 422666240, "step": 6450, "train_runtime": 108553.8734, "train_tokens_per_second": 3893.608 }, { "epoch": 1.7059289581407633, "grad_norm": 0.294921875, "learning_rate": 0.0003207804292240812, "loss": 2.2537, "num_input_tokens_seen": 423321600, "step": 6460, "train_runtime": 108672.8996, "train_tokens_per_second": 3895.374 }, { "epoch": 1.70856991945068, "grad_norm": 0.287109375, "learning_rate": 0.00032026760001881507, "loss": 2.1872, "num_input_tokens_seen": 423976960, "step": 6470, "train_runtime": 108793.2483, "train_tokens_per_second": 3897.089 }, { "epoch": 1.7112108807605968, "grad_norm": 0.296875, "learning_rate": 0.000319754449570497, "loss": 2.1641, "num_input_tokens_seen": 424632320, "step": 6480, "train_runtime": 108912.3022, "train_tokens_per_second": 3898.846 }, { "epoch": 1.7138518420705138, "grad_norm": 0.287109375, "learning_rate": 0.00031924098022510165, "loss": 2.198, "num_input_tokens_seen": 425287680, "step": 6490, "train_runtime": 109031.1858, "train_tokens_per_second": 3900.606 }, { "epoch": 1.7164928033804305, "grad_norm": 0.2890625, "learning_rate": 0.0003187271943300618, "loss": 2.2146, "num_input_tokens_seen": 425943040, "step": 6500, "train_runtime": 109152.7442, "train_tokens_per_second": 3902.266 }, { "epoch": 1.7191337646903473, "grad_norm": 0.30859375, "learning_rate": 0.0003182130942342573, "loss": 2.2801, "num_input_tokens_seen": 426598400, "step": 6510, "train_runtime": 109285.8918, "train_tokens_per_second": 3903.508 }, { "epoch": 1.721774726000264, "grad_norm": 0.29296875, "learning_rate": 0.00031769868228800435, "loss": 2.2058, "num_input_tokens_seen": 427253760, "step": 6520, "train_runtime": 109405.4925, "train_tokens_per_second": 3905.231 }, { "epoch": 1.7244156873101808, "grad_norm": 0.283203125, "learning_rate": 0.0003171839608430449, "loss": 2.1824, "num_input_tokens_seen": 427909120, "step": 6530, "train_runtime": 109526.7432, "train_tokens_per_second": 3906.892 }, { "epoch": 1.7270566486200978, "grad_norm": 0.287109375, "learning_rate": 0.000316668932252536, "loss": 2.2827, "num_input_tokens_seen": 428564480, "step": 6540, "train_runtime": 109645.4766, "train_tokens_per_second": 3908.638 }, { "epoch": 1.7296976099300145, "grad_norm": 0.28515625, "learning_rate": 0.00031615359887103854, "loss": 2.2298, "num_input_tokens_seen": 429219840, "step": 6550, "train_runtime": 109763.7338, "train_tokens_per_second": 3910.398 }, { "epoch": 1.7323385712399313, "grad_norm": 0.2734375, "learning_rate": 0.0003156379630545072, "loss": 2.2086, "num_input_tokens_seen": 429875200, "step": 6560, "train_runtime": 109882.5111, "train_tokens_per_second": 3912.135 }, { "epoch": 1.7349795325498483, "grad_norm": 0.310546875, "learning_rate": 0.0003151220271602789, "loss": 2.2049, "num_input_tokens_seen": 430530560, "step": 6570, "train_runtime": 110003.8157, "train_tokens_per_second": 3913.778 }, { "epoch": 1.7376204938597648, "grad_norm": 0.306640625, "learning_rate": 0.0003146057935470628, "loss": 2.1743, "num_input_tokens_seen": 431185920, "step": 6580, "train_runtime": 110124.2541, "train_tokens_per_second": 3915.449 }, { "epoch": 1.7402614551696818, "grad_norm": 0.298828125, "learning_rate": 0.00031408926457492895, "loss": 2.2296, "num_input_tokens_seen": 431841280, "step": 6590, "train_runtime": 110243.1176, "train_tokens_per_second": 3917.172 }, { "epoch": 1.7429024164795985, "grad_norm": 0.287109375, "learning_rate": 0.0003135724426052978, "loss": 2.1756, "num_input_tokens_seen": 432496640, "step": 6600, "train_runtime": 110363.0718, "train_tokens_per_second": 3918.853 }, { "epoch": 1.7455433777895153, "grad_norm": 0.302734375, "learning_rate": 0.0003130553300009291, "loss": 2.1728, "num_input_tokens_seen": 433152000, "step": 6610, "train_runtime": 110481.7805, "train_tokens_per_second": 3920.574 }, { "epoch": 1.7481843390994323, "grad_norm": 0.287109375, "learning_rate": 0.00031253792912591167, "loss": 2.2178, "num_input_tokens_seen": 433807360, "step": 6620, "train_runtime": 110600.6749, "train_tokens_per_second": 3922.285 }, { "epoch": 1.750825300409349, "grad_norm": 0.279296875, "learning_rate": 0.00031202024234565183, "loss": 2.1999, "num_input_tokens_seen": 434462720, "step": 6630, "train_runtime": 110719.7179, "train_tokens_per_second": 3923.987 }, { "epoch": 1.7534662617192658, "grad_norm": 0.296875, "learning_rate": 0.000311502272026863, "loss": 2.1903, "num_input_tokens_seen": 435118080, "step": 6640, "train_runtime": 110838.6311, "train_tokens_per_second": 3925.69 }, { "epoch": 1.7561072230291828, "grad_norm": 0.294921875, "learning_rate": 0.0003109840205375553, "loss": 2.2394, "num_input_tokens_seen": 435773440, "step": 6650, "train_runtime": 110957.3935, "train_tokens_per_second": 3927.394 }, { "epoch": 1.7587481843390993, "grad_norm": 0.27734375, "learning_rate": 0.0003104654902470238, "loss": 2.2375, "num_input_tokens_seen": 436428800, "step": 6660, "train_runtime": 111076.0541, "train_tokens_per_second": 3929.099 }, { "epoch": 1.7613891456490163, "grad_norm": 0.294921875, "learning_rate": 0.00030994668352583827, "loss": 2.1452, "num_input_tokens_seen": 437084160, "step": 6670, "train_runtime": 111195.0803, "train_tokens_per_second": 3930.787 }, { "epoch": 1.764030106958933, "grad_norm": 0.28515625, "learning_rate": 0.0003094276027458324, "loss": 2.2098, "num_input_tokens_seen": 437739520, "step": 6680, "train_runtime": 111313.5793, "train_tokens_per_second": 3932.49 }, { "epoch": 1.7666710682688498, "grad_norm": 0.28125, "learning_rate": 0.00030890825028009265, "loss": 2.1948, "num_input_tokens_seen": 438394880, "step": 6690, "train_runtime": 111432.7597, "train_tokens_per_second": 3934.165 }, { "epoch": 1.7693120295787668, "grad_norm": 0.28515625, "learning_rate": 0.00030838862850294775, "loss": 2.1585, "num_input_tokens_seen": 439050240, "step": 6700, "train_runtime": 111551.5656, "train_tokens_per_second": 3935.85 }, { "epoch": 1.7719529908886835, "grad_norm": 0.287109375, "learning_rate": 0.00030786873978995725, "loss": 2.2294, "num_input_tokens_seen": 439705600, "step": 6710, "train_runtime": 111670.9152, "train_tokens_per_second": 3937.512 }, { "epoch": 1.7745939521986003, "grad_norm": 0.267578125, "learning_rate": 0.00030734858651790156, "loss": 2.0915, "num_input_tokens_seen": 440360960, "step": 6720, "train_runtime": 111789.6956, "train_tokens_per_second": 3939.191 }, { "epoch": 1.7772349135085173, "grad_norm": 0.314453125, "learning_rate": 0.00030682817106477013, "loss": 2.2075, "num_input_tokens_seen": 441016320, "step": 6730, "train_runtime": 111908.1352, "train_tokens_per_second": 3940.878 }, { "epoch": 1.7798758748184338, "grad_norm": 0.298828125, "learning_rate": 0.00030630749580975124, "loss": 2.2005, "num_input_tokens_seen": 441671680, "step": 6740, "train_runtime": 112027.1622, "train_tokens_per_second": 3942.541 }, { "epoch": 1.7825168361283508, "grad_norm": 0.310546875, "learning_rate": 0.0003057865631332209, "loss": 2.1084, "num_input_tokens_seen": 442327040, "step": 6750, "train_runtime": 112146.1056, "train_tokens_per_second": 3944.203 }, { "epoch": 1.7851577974382675, "grad_norm": 0.283203125, "learning_rate": 0.0003052653754167319, "loss": 2.1721, "num_input_tokens_seen": 442982400, "step": 6760, "train_runtime": 112265.1664, "train_tokens_per_second": 3945.858 }, { "epoch": 1.7877987587481843, "grad_norm": 0.291015625, "learning_rate": 0.0003047439350430033, "loss": 2.1935, "num_input_tokens_seen": 443637760, "step": 6770, "train_runtime": 112384.0032, "train_tokens_per_second": 3947.517 }, { "epoch": 1.7904397200581013, "grad_norm": 0.28515625, "learning_rate": 0.0003042222443959087, "loss": 2.1449, "num_input_tokens_seen": 444293120, "step": 6780, "train_runtime": 112503.3088, "train_tokens_per_second": 3949.156 }, { "epoch": 1.7930806813680178, "grad_norm": 0.27734375, "learning_rate": 0.0003037003058604663, "loss": 2.2176, "num_input_tokens_seen": 444948480, "step": 6790, "train_runtime": 112622.2703, "train_tokens_per_second": 3950.804 }, { "epoch": 1.7957216426779348, "grad_norm": 0.291015625, "learning_rate": 0.00030317812182282746, "loss": 2.1583, "num_input_tokens_seen": 445603840, "step": 6800, "train_runtime": 112742.9301, "train_tokens_per_second": 3952.388 }, { "epoch": 1.7983626039878515, "grad_norm": 0.287109375, "learning_rate": 0.0003026556946702659, "loss": 2.1611, "num_input_tokens_seen": 446259200, "step": 6810, "train_runtime": 112861.2253, "train_tokens_per_second": 3954.052 }, { "epoch": 1.8010035652977683, "grad_norm": 0.30078125, "learning_rate": 0.00030213302679116656, "loss": 2.1791, "num_input_tokens_seen": 446914560, "step": 6820, "train_runtime": 112980.1671, "train_tokens_per_second": 3955.69 }, { "epoch": 1.8036445266076853, "grad_norm": 0.306640625, "learning_rate": 0.0003016101205750154, "loss": 2.2046, "num_input_tokens_seen": 447569920, "step": 6830, "train_runtime": 113098.8089, "train_tokens_per_second": 3957.335 }, { "epoch": 1.806285487917602, "grad_norm": 0.287109375, "learning_rate": 0.0003010869784123876, "loss": 2.2256, "num_input_tokens_seen": 448225280, "step": 6840, "train_runtime": 113218.0517, "train_tokens_per_second": 3958.956 }, { "epoch": 1.8089264492275188, "grad_norm": 0.287109375, "learning_rate": 0.00030056360269493715, "loss": 2.1948, "num_input_tokens_seen": 448880640, "step": 6850, "train_runtime": 113337.3183, "train_tokens_per_second": 3960.572 }, { "epoch": 1.8115674105374358, "grad_norm": 0.27734375, "learning_rate": 0.0003000399958153857, "loss": 2.2073, "num_input_tokens_seen": 449536000, "step": 6860, "train_runtime": 113456.1827, "train_tokens_per_second": 3962.199 }, { "epoch": 1.8142083718473523, "grad_norm": 0.28125, "learning_rate": 0.00029951616016751195, "loss": 2.2603, "num_input_tokens_seen": 450191360, "step": 6870, "train_runtime": 113574.9211, "train_tokens_per_second": 3963.827 }, { "epoch": 1.8168493331572693, "grad_norm": 0.287109375, "learning_rate": 0.0002989920981461401, "loss": 2.2381, "num_input_tokens_seen": 450846720, "step": 6880, "train_runtime": 113693.497, "train_tokens_per_second": 3965.457 }, { "epoch": 1.819490294467186, "grad_norm": 0.2890625, "learning_rate": 0.0002984678121471296, "loss": 2.1604, "num_input_tokens_seen": 451502080, "step": 6890, "train_runtime": 113812.7739, "train_tokens_per_second": 3967.06 }, { "epoch": 1.8221312557771028, "grad_norm": 0.31640625, "learning_rate": 0.00029794330456736363, "loss": 2.1744, "num_input_tokens_seen": 452157440, "step": 6900, "train_runtime": 113931.679, "train_tokens_per_second": 3968.672 }, { "epoch": 1.8247722170870198, "grad_norm": 0.291015625, "learning_rate": 0.00029741857780473855, "loss": 2.1531, "num_input_tokens_seen": 452812800, "step": 6910, "train_runtime": 114052.0605, "train_tokens_per_second": 3970.229 }, { "epoch": 1.8274131783969365, "grad_norm": 0.28125, "learning_rate": 0.00029689363425815246, "loss": 2.2166, "num_input_tokens_seen": 453468160, "step": 6920, "train_runtime": 114170.4795, "train_tokens_per_second": 3971.851 }, { "epoch": 1.8300541397068533, "grad_norm": 0.28515625, "learning_rate": 0.0002963684763274949, "loss": 2.1864, "num_input_tokens_seen": 454123520, "step": 6930, "train_runtime": 114289.4159, "train_tokens_per_second": 3973.452 }, { "epoch": 1.8326951010167702, "grad_norm": 0.28515625, "learning_rate": 0.00029584310641363534, "loss": 2.1412, "num_input_tokens_seen": 454778880, "step": 6940, "train_runtime": 114408.26, "train_tokens_per_second": 3975.053 }, { "epoch": 1.8353360623266868, "grad_norm": 0.294921875, "learning_rate": 0.00029531752691841235, "loss": 2.1573, "num_input_tokens_seen": 455434240, "step": 6950, "train_runtime": 114528.955, "train_tokens_per_second": 3976.586 }, { "epoch": 1.8379770236366038, "grad_norm": 0.275390625, "learning_rate": 0.00029479174024462274, "loss": 2.1641, "num_input_tokens_seen": 456089600, "step": 6960, "train_runtime": 114647.3935, "train_tokens_per_second": 3978.194 }, { "epoch": 1.8406179849465205, "grad_norm": 0.287109375, "learning_rate": 0.0002942657487960103, "loss": 2.1352, "num_input_tokens_seen": 456744960, "step": 6970, "train_runtime": 114766.5021, "train_tokens_per_second": 3979.776 }, { "epoch": 1.8432589462564373, "grad_norm": 0.28515625, "learning_rate": 0.0002937395549772553, "loss": 2.2016, "num_input_tokens_seen": 457400320, "step": 6980, "train_runtime": 114886.3677, "train_tokens_per_second": 3981.328 }, { "epoch": 1.8458999075663542, "grad_norm": 0.287109375, "learning_rate": 0.00029321316119396287, "loss": 2.1595, "num_input_tokens_seen": 458055680, "step": 6990, "train_runtime": 115007.1447, "train_tokens_per_second": 3982.845 }, { "epoch": 1.848540868876271, "grad_norm": 0.306640625, "learning_rate": 0.0002926865698526524, "loss": 2.2249, "num_input_tokens_seen": 458711040, "step": 7000, "train_runtime": 115126.0467, "train_tokens_per_second": 3984.424 }, { "epoch": 1.8511818301861878, "grad_norm": 0.306640625, "learning_rate": 0.00029215978336074666, "loss": 2.1877, "num_input_tokens_seen": 459366400, "step": 7010, "train_runtime": 115258.2273, "train_tokens_per_second": 3985.541 }, { "epoch": 1.8538227914961047, "grad_norm": 0.287109375, "learning_rate": 0.0002916328041265604, "loss": 2.2082, "num_input_tokens_seen": 460021760, "step": 7020, "train_runtime": 115376.4672, "train_tokens_per_second": 3987.137 }, { "epoch": 1.8564637528060213, "grad_norm": 0.291015625, "learning_rate": 0.00029110563455928944, "loss": 2.2056, "num_input_tokens_seen": 460677120, "step": 7030, "train_runtime": 115494.4951, "train_tokens_per_second": 3988.737 }, { "epoch": 1.8591047141159383, "grad_norm": 0.294921875, "learning_rate": 0.00029057827706899995, "loss": 2.1202, "num_input_tokens_seen": 461332480, "step": 7040, "train_runtime": 115613.3943, "train_tokens_per_second": 3990.303 }, { "epoch": 1.861745675425855, "grad_norm": 0.298828125, "learning_rate": 0.0002900507340666173, "loss": 2.2726, "num_input_tokens_seen": 461987840, "step": 7050, "train_runtime": 115732.6563, "train_tokens_per_second": 3991.854 }, { "epoch": 1.8643866367357718, "grad_norm": 0.283203125, "learning_rate": 0.00028952300796391466, "loss": 2.1556, "num_input_tokens_seen": 462643200, "step": 7060, "train_runtime": 115850.6045, "train_tokens_per_second": 3993.447 }, { "epoch": 1.8670275980456887, "grad_norm": 0.291015625, "learning_rate": 0.0002889951011735026, "loss": 2.1617, "num_input_tokens_seen": 463298560, "step": 7070, "train_runtime": 115969.3806, "train_tokens_per_second": 3995.008 }, { "epoch": 1.8696685593556053, "grad_norm": 0.306640625, "learning_rate": 0.00028846701610881734, "loss": 2.1631, "num_input_tokens_seen": 463953920, "step": 7080, "train_runtime": 116090.0824, "train_tokens_per_second": 3996.499 }, { "epoch": 1.8723095206655223, "grad_norm": 0.29296875, "learning_rate": 0.00028793875518411057, "loss": 2.1641, "num_input_tokens_seen": 464609280, "step": 7090, "train_runtime": 116209.9065, "train_tokens_per_second": 3998.018 }, { "epoch": 1.874950481975439, "grad_norm": 0.28515625, "learning_rate": 0.0002874103208144377, "loss": 2.1655, "num_input_tokens_seen": 465264640, "step": 7100, "train_runtime": 116329.9499, "train_tokens_per_second": 3999.526 }, { "epoch": 1.8775914432853558, "grad_norm": 0.275390625, "learning_rate": 0.00028688171541564714, "loss": 2.1712, "num_input_tokens_seen": 465920000, "step": 7110, "train_runtime": 116448.5946, "train_tokens_per_second": 4001.079 }, { "epoch": 1.8802324045952727, "grad_norm": 0.302734375, "learning_rate": 0.0002863529414043692, "loss": 2.1668, "num_input_tokens_seen": 466575360, "step": 7120, "train_runtime": 116567.0957, "train_tokens_per_second": 4002.633 }, { "epoch": 1.8828733659051895, "grad_norm": 0.287109375, "learning_rate": 0.000285824001198005, "loss": 2.2046, "num_input_tokens_seen": 467230720, "step": 7130, "train_runtime": 116685.8683, "train_tokens_per_second": 4004.176 }, { "epoch": 1.8855143272151063, "grad_norm": 0.28515625, "learning_rate": 0.00028529489721471556, "loss": 2.1657, "num_input_tokens_seen": 467886080, "step": 7140, "train_runtime": 116806.3913, "train_tokens_per_second": 4005.655 }, { "epoch": 1.8881552885250232, "grad_norm": 0.27734375, "learning_rate": 0.0002847656318734105, "loss": 2.2341, "num_input_tokens_seen": 468541440, "step": 7150, "train_runtime": 116930.1487, "train_tokens_per_second": 4007.02 }, { "epoch": 1.8907962498349398, "grad_norm": 0.275390625, "learning_rate": 0.0002842362075937372, "loss": 2.1156, "num_input_tokens_seen": 469196800, "step": 7160, "train_runtime": 117053.2263, "train_tokens_per_second": 4008.406 }, { "epoch": 1.8934372111448567, "grad_norm": 0.27734375, "learning_rate": 0.00028370662679606974, "loss": 2.1697, "num_input_tokens_seen": 469852160, "step": 7170, "train_runtime": 117176.5737, "train_tokens_per_second": 4009.779 }, { "epoch": 1.8960781724547735, "grad_norm": 0.2890625, "learning_rate": 0.0002831768919014975, "loss": 2.186, "num_input_tokens_seen": 470507520, "step": 7180, "train_runtime": 117299.8905, "train_tokens_per_second": 4011.151 }, { "epoch": 1.8987191337646903, "grad_norm": 0.314453125, "learning_rate": 0.0002826470053318146, "loss": 2.1841, "num_input_tokens_seen": 471162880, "step": 7190, "train_runtime": 117422.9798, "train_tokens_per_second": 4012.527 }, { "epoch": 1.9013600950746072, "grad_norm": 0.306640625, "learning_rate": 0.0002821169695095085, "loss": 2.216, "num_input_tokens_seen": 471818240, "step": 7200, "train_runtime": 117546.1568, "train_tokens_per_second": 4013.898 }, { "epoch": 1.904001056384524, "grad_norm": 0.29296875, "learning_rate": 0.00028158678685774894, "loss": 2.1943, "num_input_tokens_seen": 472473600, "step": 7210, "train_runtime": 117670.8304, "train_tokens_per_second": 4015.214 }, { "epoch": 1.9066420176944407, "grad_norm": 0.279296875, "learning_rate": 0.00028105645980037704, "loss": 2.1354, "num_input_tokens_seen": 473128960, "step": 7220, "train_runtime": 117794.3725, "train_tokens_per_second": 4016.567 }, { "epoch": 1.9092829790043577, "grad_norm": 0.283203125, "learning_rate": 0.00028052599076189397, "loss": 2.1935, "num_input_tokens_seen": 473784320, "step": 7230, "train_runtime": 117913.5927, "train_tokens_per_second": 4018.064 }, { "epoch": 1.9119239403142743, "grad_norm": 0.30859375, "learning_rate": 0.00027999538216745003, "loss": 2.2211, "num_input_tokens_seen": 474439680, "step": 7240, "train_runtime": 118031.8674, "train_tokens_per_second": 4019.59 }, { "epoch": 1.9145649016241912, "grad_norm": 0.287109375, "learning_rate": 0.00027946463644283365, "loss": 2.1685, "num_input_tokens_seen": 475095040, "step": 7250, "train_runtime": 118150.1803, "train_tokens_per_second": 4021.111 }, { "epoch": 1.917205862934108, "grad_norm": 0.279296875, "learning_rate": 0.0002789337560144599, "loss": 2.1477, "num_input_tokens_seen": 475750400, "step": 7260, "train_runtime": 118268.9666, "train_tokens_per_second": 4022.614 }, { "epoch": 1.9198468242440248, "grad_norm": 0.265625, "learning_rate": 0.00027840274330936005, "loss": 2.1712, "num_input_tokens_seen": 476405760, "step": 7270, "train_runtime": 118387.3002, "train_tokens_per_second": 4024.129 }, { "epoch": 1.9224877855539417, "grad_norm": 0.283203125, "learning_rate": 0.00027787160075516985, "loss": 2.2009, "num_input_tokens_seen": 477061120, "step": 7280, "train_runtime": 118505.5171, "train_tokens_per_second": 4025.645 }, { "epoch": 1.9251287468638585, "grad_norm": 0.291015625, "learning_rate": 0.0002773403307801187, "loss": 2.1796, "num_input_tokens_seen": 477716480, "step": 7290, "train_runtime": 118623.8596, "train_tokens_per_second": 4027.153 }, { "epoch": 1.9277697081737752, "grad_norm": 0.27734375, "learning_rate": 0.0002768089358130185, "loss": 2.139, "num_input_tokens_seen": 478371840, "step": 7300, "train_runtime": 118742.6025, "train_tokens_per_second": 4028.645 }, { "epoch": 1.9304106694836922, "grad_norm": 0.275390625, "learning_rate": 0.00027627741828325293, "loss": 2.2045, "num_input_tokens_seen": 479027200, "step": 7310, "train_runtime": 118860.2199, "train_tokens_per_second": 4030.173 }, { "epoch": 1.9330516307936088, "grad_norm": 0.287109375, "learning_rate": 0.00027574578062076544, "loss": 2.2294, "num_input_tokens_seen": 479682560, "step": 7320, "train_runtime": 118978.8305, "train_tokens_per_second": 4031.663 }, { "epoch": 1.9356925921035257, "grad_norm": 0.28125, "learning_rate": 0.000275214025256049, "loss": 2.1407, "num_input_tokens_seen": 480337920, "step": 7330, "train_runtime": 119098.2016, "train_tokens_per_second": 4033.125 }, { "epoch": 1.9383335534134425, "grad_norm": 0.29296875, "learning_rate": 0.0002746821546201347, "loss": 2.1828, "num_input_tokens_seen": 480993280, "step": 7340, "train_runtime": 119217.9125, "train_tokens_per_second": 4034.572 }, { "epoch": 1.9409745147233592, "grad_norm": 0.279296875, "learning_rate": 0.0002741501711445807, "loss": 2.2349, "num_input_tokens_seen": 481648640, "step": 7350, "train_runtime": 119336.0203, "train_tokens_per_second": 4036.071 }, { "epoch": 1.9436154760332762, "grad_norm": 0.28125, "learning_rate": 0.00027361807726146057, "loss": 2.1359, "num_input_tokens_seen": 482304000, "step": 7360, "train_runtime": 119454.4738, "train_tokens_per_second": 4037.555 }, { "epoch": 1.9462564373431928, "grad_norm": 0.287109375, "learning_rate": 0.0002730858754033532, "loss": 2.1516, "num_input_tokens_seen": 482959360, "step": 7370, "train_runtime": 119573.9009, "train_tokens_per_second": 4039.003 }, { "epoch": 1.9488973986531097, "grad_norm": 0.291015625, "learning_rate": 0.00027255356800333076, "loss": 2.1146, "num_input_tokens_seen": 483614720, "step": 7380, "train_runtime": 119692.3733, "train_tokens_per_second": 4040.481 }, { "epoch": 1.9515383599630265, "grad_norm": 0.28125, "learning_rate": 0.000272021157494948, "loss": 2.225, "num_input_tokens_seen": 484270080, "step": 7390, "train_runtime": 119810.8053, "train_tokens_per_second": 4041.957 }, { "epoch": 1.9541793212729432, "grad_norm": 0.28515625, "learning_rate": 0.0002714886463122312, "loss": 2.1815, "num_input_tokens_seen": 484925440, "step": 7400, "train_runtime": 119928.5985, "train_tokens_per_second": 4043.451 }, { "epoch": 1.9568202825828602, "grad_norm": 0.291015625, "learning_rate": 0.00027095603688966676, "loss": 2.2085, "num_input_tokens_seen": 485580800, "step": 7410, "train_runtime": 120049.2318, "train_tokens_per_second": 4044.847 }, { "epoch": 1.959461243892777, "grad_norm": 0.27734375, "learning_rate": 0.00027042333166219006, "loss": 2.1333, "num_input_tokens_seen": 486236160, "step": 7420, "train_runtime": 120173.7942, "train_tokens_per_second": 4046.108 }, { "epoch": 1.9621022052026937, "grad_norm": 0.275390625, "learning_rate": 0.0002698905330651748, "loss": 2.1709, "num_input_tokens_seen": 486891520, "step": 7430, "train_runtime": 120291.6868, "train_tokens_per_second": 4047.591 }, { "epoch": 1.9647431665126107, "grad_norm": 0.310546875, "learning_rate": 0.0002693576435344212, "loss": 2.1971, "num_input_tokens_seen": 487546880, "step": 7440, "train_runtime": 120410.078, "train_tokens_per_second": 4049.054 }, { "epoch": 1.9673841278225273, "grad_norm": 0.294921875, "learning_rate": 0.0002688246655061456, "loss": 2.1605, "num_input_tokens_seen": 488202240, "step": 7450, "train_runtime": 120529.2995, "train_tokens_per_second": 4050.486 }, { "epoch": 1.9700250891324442, "grad_norm": 0.28125, "learning_rate": 0.0002682916014169685, "loss": 2.172, "num_input_tokens_seen": 488857600, "step": 7460, "train_runtime": 120648.4611, "train_tokens_per_second": 4051.917 }, { "epoch": 1.972666050442361, "grad_norm": 0.28515625, "learning_rate": 0.0002677584537039041, "loss": 2.0911, "num_input_tokens_seen": 489512960, "step": 7470, "train_runtime": 120766.5398, "train_tokens_per_second": 4053.382 }, { "epoch": 1.9753070117522777, "grad_norm": 0.2734375, "learning_rate": 0.0002672252248043488, "loss": 2.149, "num_input_tokens_seen": 490168320, "step": 7480, "train_runtime": 120885.5921, "train_tokens_per_second": 4054.812 }, { "epoch": 1.9779479730621947, "grad_norm": 0.28125, "learning_rate": 0.0002666919171560703, "loss": 2.2283, "num_input_tokens_seen": 490823680, "step": 7490, "train_runtime": 121010.2529, "train_tokens_per_second": 4056.05 }, { "epoch": 1.9805889343721115, "grad_norm": 0.2734375, "learning_rate": 0.00026615853319719626, "loss": 2.2299, "num_input_tokens_seen": 491479040, "step": 7500, "train_runtime": 121128.4355, "train_tokens_per_second": 4057.503 }, { "epoch": 1.9832298956820282, "grad_norm": 0.287109375, "learning_rate": 0.00026562507536620294, "loss": 2.2422, "num_input_tokens_seen": 492134400, "step": 7510, "train_runtime": 121260.8652, "train_tokens_per_second": 4058.477 }, { "epoch": 1.9858708569919452, "grad_norm": 0.27734375, "learning_rate": 0.0002650915461019048, "loss": 2.1926, "num_input_tokens_seen": 492789760, "step": 7520, "train_runtime": 121378.9998, "train_tokens_per_second": 4059.926 }, { "epoch": 1.9885118183018617, "grad_norm": 0.27734375, "learning_rate": 0.0002645579478434426, "loss": 2.1582, "num_input_tokens_seen": 493445120, "step": 7530, "train_runtime": 121498.0518, "train_tokens_per_second": 4061.342 }, { "epoch": 1.9911527796117787, "grad_norm": 0.28515625, "learning_rate": 0.00026402428303027236, "loss": 2.1812, "num_input_tokens_seen": 494100480, "step": 7540, "train_runtime": 121616.5807, "train_tokens_per_second": 4062.772 }, { "epoch": 1.9937937409216955, "grad_norm": 0.30078125, "learning_rate": 0.00026349055410215474, "loss": 2.1734, "num_input_tokens_seen": 494755840, "step": 7550, "train_runtime": 121735.3205, "train_tokens_per_second": 4064.193 }, { "epoch": 1.9964347022316122, "grad_norm": 0.287109375, "learning_rate": 0.00026295676349914315, "loss": 2.1836, "num_input_tokens_seen": 495411200, "step": 7560, "train_runtime": 121853.5549, "train_tokens_per_second": 4065.628 }, { "epoch": 1.9990756635415292, "grad_norm": 0.283203125, "learning_rate": 0.0002624229136615734, "loss": 2.1844, "num_input_tokens_seen": 496066560, "step": 7570, "train_runtime": 121972.4187, "train_tokens_per_second": 4067.039 }, { "epoch": 2.00158457678595, "grad_norm": 0.337890625, "learning_rate": 0.00026188900703005163, "loss": 1.8962, "num_input_tokens_seen": 496680960, "step": 7580, "train_runtime": 122084.9792, "train_tokens_per_second": 4068.322 }, { "epoch": 2.004225538095867, "grad_norm": 0.34375, "learning_rate": 0.00026135504604544394, "loss": 1.8112, "num_input_tokens_seen": 497336320, "step": 7590, "train_runtime": 122202.9107, "train_tokens_per_second": 4069.758 }, { "epoch": 2.006866499405784, "grad_norm": 0.32421875, "learning_rate": 0.00026082103314886484, "loss": 1.7385, "num_input_tokens_seen": 497991680, "step": 7600, "train_runtime": 122323.6716, "train_tokens_per_second": 4071.098 }, { "epoch": 2.0095074607157004, "grad_norm": 0.33203125, "learning_rate": 0.0002602869707816661, "loss": 1.7123, "num_input_tokens_seen": 498647040, "step": 7610, "train_runtime": 122441.9533, "train_tokens_per_second": 4072.518 }, { "epoch": 2.0121484220256174, "grad_norm": 0.3203125, "learning_rate": 0.00025975286138542553, "loss": 1.7411, "num_input_tokens_seen": 499302400, "step": 7620, "train_runtime": 122560.5511, "train_tokens_per_second": 4073.924 }, { "epoch": 2.014789383335534, "grad_norm": 0.326171875, "learning_rate": 0.0002592187074019364, "loss": 1.6976, "num_input_tokens_seen": 499957760, "step": 7630, "train_runtime": 122679.9299, "train_tokens_per_second": 4075.302 }, { "epoch": 2.017430344645451, "grad_norm": 0.3515625, "learning_rate": 0.0002586845112731954, "loss": 1.7751, "num_input_tokens_seen": 500613120, "step": 7640, "train_runtime": 122799.0373, "train_tokens_per_second": 4076.686 }, { "epoch": 2.020071305955368, "grad_norm": 0.341796875, "learning_rate": 0.000258150275441392, "loss": 1.7346, "num_input_tokens_seen": 501268480, "step": 7650, "train_runtime": 122918.4582, "train_tokens_per_second": 4078.057 }, { "epoch": 2.0227122672652844, "grad_norm": 0.3515625, "learning_rate": 0.0002576160023488972, "loss": 1.7168, "num_input_tokens_seen": 501923840, "step": 7660, "train_runtime": 123037.1835, "train_tokens_per_second": 4079.448 }, { "epoch": 2.0253532285752014, "grad_norm": 0.328125, "learning_rate": 0.0002570816944382524, "loss": 1.6782, "num_input_tokens_seen": 502579200, "step": 7670, "train_runtime": 123156.4293, "train_tokens_per_second": 4080.82 }, { "epoch": 2.0279941898851184, "grad_norm": 0.34375, "learning_rate": 0.0002565473541521582, "loss": 1.7674, "num_input_tokens_seen": 503234560, "step": 7680, "train_runtime": 123275.4067, "train_tokens_per_second": 4082.198 }, { "epoch": 2.030635151195035, "grad_norm": 0.341796875, "learning_rate": 0.000256012983933463, "loss": 1.7331, "num_input_tokens_seen": 503889920, "step": 7690, "train_runtime": 123394.327, "train_tokens_per_second": 4083.574 }, { "epoch": 2.033276112504952, "grad_norm": 0.345703125, "learning_rate": 0.0002554785862251523, "loss": 1.7731, "num_input_tokens_seen": 504545280, "step": 7700, "train_runtime": 123517.1474, "train_tokens_per_second": 4084.82 }, { "epoch": 2.0359170738148684, "grad_norm": 0.3203125, "learning_rate": 0.00025494416347033704, "loss": 1.7762, "num_input_tokens_seen": 505200640, "step": 7710, "train_runtime": 123640.3767, "train_tokens_per_second": 4086.049 }, { "epoch": 2.0385580351247854, "grad_norm": 0.34765625, "learning_rate": 0.00025440971811224294, "loss": 1.728, "num_input_tokens_seen": 505856000, "step": 7720, "train_runtime": 123759.753, "train_tokens_per_second": 4087.403 }, { "epoch": 2.0411989964347024, "grad_norm": 0.35546875, "learning_rate": 0.00025387525259419874, "loss": 1.7502, "num_input_tokens_seen": 506511360, "step": 7730, "train_runtime": 123878.1799, "train_tokens_per_second": 4088.786 }, { "epoch": 2.043839957744619, "grad_norm": 0.34375, "learning_rate": 0.00025334076935962555, "loss": 1.7492, "num_input_tokens_seen": 507166720, "step": 7740, "train_runtime": 123997.0286, "train_tokens_per_second": 4090.152 }, { "epoch": 2.046480919054536, "grad_norm": 0.36328125, "learning_rate": 0.00025280627085202555, "loss": 1.7439, "num_input_tokens_seen": 507822080, "step": 7750, "train_runtime": 124115.8412, "train_tokens_per_second": 4091.517 }, { "epoch": 2.049121880364453, "grad_norm": 0.3515625, "learning_rate": 0.0002522717595149705, "loss": 1.7854, "num_input_tokens_seen": 508477440, "step": 7760, "train_runtime": 124234.7525, "train_tokens_per_second": 4092.876 }, { "epoch": 2.0517628416743694, "grad_norm": 0.349609375, "learning_rate": 0.000251737237792091, "loss": 1.7746, "num_input_tokens_seen": 509132800, "step": 7770, "train_runtime": 124353.8608, "train_tokens_per_second": 4094.226 }, { "epoch": 2.0544038029842864, "grad_norm": 0.341796875, "learning_rate": 0.0002512027081270651, "loss": 1.776, "num_input_tokens_seen": 509788160, "step": 7780, "train_runtime": 124474.9998, "train_tokens_per_second": 4095.506 }, { "epoch": 2.057044764294203, "grad_norm": 0.34765625, "learning_rate": 0.000250668172963607, "loss": 1.7205, "num_input_tokens_seen": 510443520, "step": 7790, "train_runtime": 124594.3313, "train_tokens_per_second": 4096.844 }, { "epoch": 2.05968572560412, "grad_norm": 0.35546875, "learning_rate": 0.0002501336347454562, "loss": 1.7456, "num_input_tokens_seen": 511098880, "step": 7800, "train_runtime": 124713.6451, "train_tokens_per_second": 4098.179 }, { "epoch": 2.062326686914037, "grad_norm": 0.361328125, "learning_rate": 0.00024959909591636625, "loss": 1.771, "num_input_tokens_seen": 511754240, "step": 7810, "train_runtime": 124832.3326, "train_tokens_per_second": 4099.533 }, { "epoch": 2.0649676482239534, "grad_norm": 0.3515625, "learning_rate": 0.00024906455892009327, "loss": 1.7439, "num_input_tokens_seen": 512409600, "step": 7820, "train_runtime": 124951.4924, "train_tokens_per_second": 4100.868 }, { "epoch": 2.0676086095338704, "grad_norm": 0.337890625, "learning_rate": 0.00024853002620038513, "loss": 1.8217, "num_input_tokens_seen": 513064960, "step": 7830, "train_runtime": 125070.5633, "train_tokens_per_second": 4102.204 }, { "epoch": 2.070249570843787, "grad_norm": 0.345703125, "learning_rate": 0.00024799550020097004, "loss": 1.7468, "num_input_tokens_seen": 513720320, "step": 7840, "train_runtime": 125188.9602, "train_tokens_per_second": 4103.559 }, { "epoch": 2.072890532153704, "grad_norm": 0.3515625, "learning_rate": 0.0002474609833655457, "loss": 1.7804, "num_input_tokens_seen": 514375680, "step": 7850, "train_runtime": 125308.4415, "train_tokens_per_second": 4104.877 }, { "epoch": 2.075531493463621, "grad_norm": 0.345703125, "learning_rate": 0.00024692647813776784, "loss": 1.742, "num_input_tokens_seen": 515031040, "step": 7860, "train_runtime": 125426.5875, "train_tokens_per_second": 4106.235 }, { "epoch": 2.0781724547735374, "grad_norm": 0.35546875, "learning_rate": 0.00024639198696123886, "loss": 1.7549, "num_input_tokens_seen": 515686400, "step": 7870, "train_runtime": 125545.2703, "train_tokens_per_second": 4107.573 }, { "epoch": 2.0808134160834544, "grad_norm": 0.3359375, "learning_rate": 0.0002458575122794973, "loss": 1.7687, "num_input_tokens_seen": 516341760, "step": 7880, "train_runtime": 125664.2574, "train_tokens_per_second": 4108.899 }, { "epoch": 2.0834543773933714, "grad_norm": 0.345703125, "learning_rate": 0.000245323056536006, "loss": 1.7648, "num_input_tokens_seen": 516997120, "step": 7890, "train_runtime": 125783.0612, "train_tokens_per_second": 4110.228 }, { "epoch": 2.086095338703288, "grad_norm": 0.3359375, "learning_rate": 0.0002447886221741414, "loss": 1.7488, "num_input_tokens_seen": 517652480, "step": 7900, "train_runtime": 125902.0967, "train_tokens_per_second": 4111.548 }, { "epoch": 2.088736300013205, "grad_norm": 0.33984375, "learning_rate": 0.00024425421163718207, "loss": 1.7756, "num_input_tokens_seen": 518307840, "step": 7910, "train_runtime": 126021.0352, "train_tokens_per_second": 4112.868 }, { "epoch": 2.0913772613231214, "grad_norm": 0.345703125, "learning_rate": 0.0002437198273682978, "loss": 1.6993, "num_input_tokens_seen": 518963200, "step": 7920, "train_runtime": 126140.657, "train_tokens_per_second": 4114.163 }, { "epoch": 2.0940182226330384, "grad_norm": 0.349609375, "learning_rate": 0.00024318547181053819, "loss": 1.7315, "num_input_tokens_seen": 519618560, "step": 7930, "train_runtime": 126259.02, "train_tokens_per_second": 4115.497 }, { "epoch": 2.0966591839429554, "grad_norm": 0.357421875, "learning_rate": 0.00024265114740682167, "loss": 1.7431, "num_input_tokens_seen": 520273920, "step": 7940, "train_runtime": 126378.0559, "train_tokens_per_second": 4116.806 }, { "epoch": 2.099300145252872, "grad_norm": 0.36328125, "learning_rate": 0.000242116856599924, "loss": 1.7303, "num_input_tokens_seen": 520929280, "step": 7950, "train_runtime": 126497.1651, "train_tokens_per_second": 4118.11 }, { "epoch": 2.101941106562789, "grad_norm": 0.345703125, "learning_rate": 0.00024158260183246757, "loss": 1.7809, "num_input_tokens_seen": 521584640, "step": 7960, "train_runtime": 126616.0381, "train_tokens_per_second": 4119.42 }, { "epoch": 2.104582067872706, "grad_norm": 0.34765625, "learning_rate": 0.00024104838554691015, "loss": 1.7361, "num_input_tokens_seen": 522240000, "step": 7970, "train_runtime": 126734.5896, "train_tokens_per_second": 4120.738 }, { "epoch": 2.1072230291826224, "grad_norm": 0.353515625, "learning_rate": 0.00024051421018553312, "loss": 1.7758, "num_input_tokens_seen": 522895360, "step": 7980, "train_runtime": 126853.6257, "train_tokens_per_second": 4122.037 }, { "epoch": 2.1098639904925394, "grad_norm": 0.345703125, "learning_rate": 0.00023998007819043122, "loss": 1.775, "num_input_tokens_seen": 523550720, "step": 7990, "train_runtime": 126972.9651, "train_tokens_per_second": 4123.324 }, { "epoch": 2.112504951802456, "grad_norm": 0.37890625, "learning_rate": 0.00023944599200350058, "loss": 1.7898, "num_input_tokens_seen": 524206080, "step": 8000, "train_runtime": 127092.5128, "train_tokens_per_second": 4124.602 }, { "epoch": 2.115145913112373, "grad_norm": 0.353515625, "learning_rate": 0.00023891195406642825, "loss": 1.7563, "num_input_tokens_seen": 524861440, "step": 8010, "train_runtime": 127224.1289, "train_tokens_per_second": 4125.487 }, { "epoch": 2.11778687442229, "grad_norm": 0.375, "learning_rate": 0.00023837796682068047, "loss": 1.7687, "num_input_tokens_seen": 525516800, "step": 8020, "train_runtime": 127342.628, "train_tokens_per_second": 4126.794 }, { "epoch": 2.1204278357322064, "grad_norm": 0.32421875, "learning_rate": 0.00023784403270749166, "loss": 1.7389, "num_input_tokens_seen": 526172160, "step": 8030, "train_runtime": 127461.3966, "train_tokens_per_second": 4128.09 }, { "epoch": 2.1230687970421234, "grad_norm": 0.353515625, "learning_rate": 0.0002373101541678536, "loss": 1.7768, "num_input_tokens_seen": 526827520, "step": 8040, "train_runtime": 127580.0605, "train_tokens_per_second": 4129.388 }, { "epoch": 2.1257097583520403, "grad_norm": 0.34375, "learning_rate": 0.00023677633364250388, "loss": 1.8145, "num_input_tokens_seen": 527482880, "step": 8050, "train_runtime": 127698.8762, "train_tokens_per_second": 4130.678 }, { "epoch": 2.128350719661957, "grad_norm": 0.328125, "learning_rate": 0.0002362425735719147, "loss": 1.7534, "num_input_tokens_seen": 528138240, "step": 8060, "train_runtime": 127819.112, "train_tokens_per_second": 4131.919 }, { "epoch": 2.130991680971874, "grad_norm": 0.353515625, "learning_rate": 0.0002357088763962821, "loss": 1.791, "num_input_tokens_seen": 528793600, "step": 8070, "train_runtime": 127937.2109, "train_tokens_per_second": 4133.228 }, { "epoch": 2.1336326422817904, "grad_norm": 0.427734375, "learning_rate": 0.00023517524455551463, "loss": 1.801, "num_input_tokens_seen": 529448960, "step": 8080, "train_runtime": 128056.2376, "train_tokens_per_second": 4134.503 }, { "epoch": 2.1362736035917074, "grad_norm": 0.36328125, "learning_rate": 0.0002346416804892218, "loss": 1.7931, "num_input_tokens_seen": 530104320, "step": 8090, "train_runtime": 128174.2709, "train_tokens_per_second": 4135.809 }, { "epoch": 2.1389145649016243, "grad_norm": 0.33984375, "learning_rate": 0.0002341081866367037, "loss": 1.772, "num_input_tokens_seen": 530759680, "step": 8100, "train_runtime": 128292.3229, "train_tokens_per_second": 4137.112 }, { "epoch": 2.141555526211541, "grad_norm": 0.34375, "learning_rate": 0.00023357476543693905, "loss": 1.7975, "num_input_tokens_seen": 531415040, "step": 8110, "train_runtime": 128414.6821, "train_tokens_per_second": 4138.273 }, { "epoch": 2.144196487521458, "grad_norm": 0.3359375, "learning_rate": 0.0002330414193285747, "loss": 1.713, "num_input_tokens_seen": 532070400, "step": 8120, "train_runtime": 128535.8516, "train_tokens_per_second": 4139.471 }, { "epoch": 2.1468374488313744, "grad_norm": 0.353515625, "learning_rate": 0.00023250815074991418, "loss": 1.7528, "num_input_tokens_seen": 532725760, "step": 8130, "train_runtime": 128657.89, "train_tokens_per_second": 4140.638 }, { "epoch": 2.1494784101412914, "grad_norm": 0.37890625, "learning_rate": 0.0002319749621389063, "loss": 1.7682, "num_input_tokens_seen": 533381120, "step": 8140, "train_runtime": 128776.6087, "train_tokens_per_second": 4141.91 }, { "epoch": 2.1521193714512084, "grad_norm": 0.341796875, "learning_rate": 0.0002314418559331346, "loss": 1.8058, "num_input_tokens_seen": 534036480, "step": 8150, "train_runtime": 128894.9712, "train_tokens_per_second": 4143.191 }, { "epoch": 2.154760332761125, "grad_norm": 0.359375, "learning_rate": 0.00023090883456980586, "loss": 1.762, "num_input_tokens_seen": 534691840, "step": 8160, "train_runtime": 129012.3112, "train_tokens_per_second": 4144.502 }, { "epoch": 2.157401294071042, "grad_norm": 0.34765625, "learning_rate": 0.00023037590048573866, "loss": 1.738, "num_input_tokens_seen": 535347200, "step": 8170, "train_runtime": 129131.7649, "train_tokens_per_second": 4145.744 }, { "epoch": 2.160042255380959, "grad_norm": 0.369140625, "learning_rate": 0.00022984305611735293, "loss": 1.7308, "num_input_tokens_seen": 536002560, "step": 8180, "train_runtime": 129250.2333, "train_tokens_per_second": 4147.014 }, { "epoch": 2.1626832166908754, "grad_norm": 0.341796875, "learning_rate": 0.0002293103039006583, "loss": 1.787, "num_input_tokens_seen": 536657920, "step": 8190, "train_runtime": 129370.1093, "train_tokens_per_second": 4148.237 }, { "epoch": 2.1653241780007924, "grad_norm": 0.34375, "learning_rate": 0.00022877764627124314, "loss": 1.753, "num_input_tokens_seen": 537313280, "step": 8200, "train_runtime": 129490.7161, "train_tokens_per_second": 4149.435 }, { "epoch": 2.1679651393107093, "grad_norm": 0.33984375, "learning_rate": 0.0002282450856642633, "loss": 1.7668, "num_input_tokens_seen": 537968640, "step": 8210, "train_runtime": 129611.9771, "train_tokens_per_second": 4150.609 }, { "epoch": 2.170606100620626, "grad_norm": 0.35546875, "learning_rate": 0.00022771262451443133, "loss": 1.7946, "num_input_tokens_seen": 538624000, "step": 8220, "train_runtime": 129730.1997, "train_tokens_per_second": 4151.878 }, { "epoch": 2.173247061930543, "grad_norm": 0.353515625, "learning_rate": 0.00022718026525600466, "loss": 1.8128, "num_input_tokens_seen": 539279360, "step": 8230, "train_runtime": 129849.1077, "train_tokens_per_second": 4153.123 }, { "epoch": 2.1758880232404594, "grad_norm": 0.359375, "learning_rate": 0.00022664801032277538, "loss": 1.7647, "num_input_tokens_seen": 539934720, "step": 8240, "train_runtime": 129967.5043, "train_tokens_per_second": 4154.382 }, { "epoch": 2.1785289845503764, "grad_norm": 0.33984375, "learning_rate": 0.00022611586214805817, "loss": 1.8094, "num_input_tokens_seen": 540590080, "step": 8250, "train_runtime": 130085.9211, "train_tokens_per_second": 4155.639 }, { "epoch": 2.1811699458602933, "grad_norm": 0.34765625, "learning_rate": 0.00022558382316468, "loss": 1.7866, "num_input_tokens_seen": 541245440, "step": 8260, "train_runtime": 130204.8497, "train_tokens_per_second": 4156.876 }, { "epoch": 2.18381090717021, "grad_norm": 0.35546875, "learning_rate": 0.0002250518958049686, "loss": 1.7779, "num_input_tokens_seen": 541900800, "step": 8270, "train_runtime": 130323.382, "train_tokens_per_second": 4158.124 }, { "epoch": 2.186451868480127, "grad_norm": 0.36328125, "learning_rate": 0.00022452008250074115, "loss": 1.7684, "num_input_tokens_seen": 542556160, "step": 8280, "train_runtime": 130441.9355, "train_tokens_per_second": 4159.369 }, { "epoch": 2.1890928297900434, "grad_norm": 0.359375, "learning_rate": 0.00022398838568329365, "loss": 1.7176, "num_input_tokens_seen": 543211520, "step": 8290, "train_runtime": 130563.1634, "train_tokens_per_second": 4160.527 }, { "epoch": 2.1917337910999604, "grad_norm": 0.33984375, "learning_rate": 0.00022345680778338963, "loss": 1.7329, "num_input_tokens_seen": 543866880, "step": 8300, "train_runtime": 130682.6085, "train_tokens_per_second": 4161.739 }, { "epoch": 2.1943747524098773, "grad_norm": 0.359375, "learning_rate": 0.0002229253512312485, "loss": 1.7994, "num_input_tokens_seen": 544522240, "step": 8310, "train_runtime": 130801.1059, "train_tokens_per_second": 4162.979 }, { "epoch": 2.197015713719794, "grad_norm": 0.37109375, "learning_rate": 0.00022239401845653534, "loss": 1.7616, "num_input_tokens_seen": 545177600, "step": 8320, "train_runtime": 130920.2945, "train_tokens_per_second": 4164.195 }, { "epoch": 2.199656675029711, "grad_norm": 0.349609375, "learning_rate": 0.00022186281188834938, "loss": 1.8238, "num_input_tokens_seen": 545832960, "step": 8330, "train_runtime": 131039.1858, "train_tokens_per_second": 4165.418 }, { "epoch": 2.202297636339628, "grad_norm": 0.36328125, "learning_rate": 0.00022133173395521248, "loss": 1.7817, "num_input_tokens_seen": 546488320, "step": 8340, "train_runtime": 131157.9036, "train_tokens_per_second": 4166.644 }, { "epoch": 2.2049385976495444, "grad_norm": 0.349609375, "learning_rate": 0.00022080078708505878, "loss": 1.7631, "num_input_tokens_seen": 547143680, "step": 8350, "train_runtime": 131276.9596, "train_tokens_per_second": 4167.858 }, { "epoch": 2.2075795589594613, "grad_norm": 0.345703125, "learning_rate": 0.00022026997370522302, "loss": 1.7164, "num_input_tokens_seen": 547799040, "step": 8360, "train_runtime": 131396.048, "train_tokens_per_second": 4169.068 }, { "epoch": 2.210220520269378, "grad_norm": 0.34765625, "learning_rate": 0.00021973929624242988, "loss": 1.7892, "num_input_tokens_seen": 548454400, "step": 8370, "train_runtime": 131514.9039, "train_tokens_per_second": 4170.283 }, { "epoch": 2.212861481579295, "grad_norm": 0.365234375, "learning_rate": 0.0002192087571227825, "loss": 1.7302, "num_input_tokens_seen": 549109760, "step": 8380, "train_runtime": 131634.3445, "train_tokens_per_second": 4171.478 }, { "epoch": 2.215502442889212, "grad_norm": 0.365234375, "learning_rate": 0.00021867835877175147, "loss": 1.741, "num_input_tokens_seen": 549765120, "step": 8390, "train_runtime": 131753.3361, "train_tokens_per_second": 4172.685 }, { "epoch": 2.2181434041991284, "grad_norm": 0.34375, "learning_rate": 0.00021814810361416403, "loss": 1.7484, "num_input_tokens_seen": 550420480, "step": 8400, "train_runtime": 131871.9634, "train_tokens_per_second": 4173.901 }, { "epoch": 2.2207843655090453, "grad_norm": 0.359375, "learning_rate": 0.00021761799407419286, "loss": 1.8028, "num_input_tokens_seen": 551075840, "step": 8410, "train_runtime": 131991.3808, "train_tokens_per_second": 4175.09 }, { "epoch": 2.223425326818962, "grad_norm": 0.33984375, "learning_rate": 0.00021708803257534451, "loss": 1.7589, "num_input_tokens_seen": 551731200, "step": 8420, "train_runtime": 132110.699, "train_tokens_per_second": 4176.279 }, { "epoch": 2.226066288128879, "grad_norm": 0.345703125, "learning_rate": 0.00021655822154044907, "loss": 1.7806, "num_input_tokens_seen": 552386560, "step": 8430, "train_runtime": 132231.161, "train_tokens_per_second": 4177.431 }, { "epoch": 2.228707249438796, "grad_norm": 0.33984375, "learning_rate": 0.00021602856339164882, "loss": 1.7687, "num_input_tokens_seen": 553041920, "step": 8440, "train_runtime": 132350.5384, "train_tokens_per_second": 4178.615 }, { "epoch": 2.2313482107487124, "grad_norm": 0.34375, "learning_rate": 0.00021549906055038666, "loss": 1.7181, "num_input_tokens_seen": 553697280, "step": 8450, "train_runtime": 132469.8126, "train_tokens_per_second": 4179.8 }, { "epoch": 2.2339891720586293, "grad_norm": 0.361328125, "learning_rate": 0.0002149697154373959, "loss": 1.7959, "num_input_tokens_seen": 554352640, "step": 8460, "train_runtime": 132588.637, "train_tokens_per_second": 4180.997 }, { "epoch": 2.2366301333685463, "grad_norm": 0.3515625, "learning_rate": 0.00021444053047268852, "loss": 1.7816, "num_input_tokens_seen": 555008000, "step": 8470, "train_runtime": 132708.1769, "train_tokens_per_second": 4182.169 }, { "epoch": 2.239271094678463, "grad_norm": 0.349609375, "learning_rate": 0.0002139115080755445, "loss": 1.7698, "num_input_tokens_seen": 555663360, "step": 8480, "train_runtime": 132826.6113, "train_tokens_per_second": 4183.374 }, { "epoch": 2.24191205598838, "grad_norm": 0.34765625, "learning_rate": 0.00021338265066450063, "loss": 1.7496, "num_input_tokens_seen": 556318720, "step": 8490, "train_runtime": 132945.6949, "train_tokens_per_second": 4184.556 }, { "epoch": 2.244553017298297, "grad_norm": 0.353515625, "learning_rate": 0.00021285396065733915, "loss": 1.7712, "num_input_tokens_seen": 556974080, "step": 8500, "train_runtime": 133064.2087, "train_tokens_per_second": 4185.754 }, { "epoch": 2.2471939786082133, "grad_norm": 0.3359375, "learning_rate": 0.00021232544047107723, "loss": 1.7846, "num_input_tokens_seen": 557629440, "step": 8510, "train_runtime": 133196.9536, "train_tokens_per_second": 4186.503 }, { "epoch": 2.2498349399181303, "grad_norm": 0.353515625, "learning_rate": 0.00021179709252195573, "loss": 1.777, "num_input_tokens_seen": 558284800, "step": 8520, "train_runtime": 133315.2493, "train_tokens_per_second": 4187.704 }, { "epoch": 2.252475901228047, "grad_norm": 0.365234375, "learning_rate": 0.00021126891922542773, "loss": 1.8324, "num_input_tokens_seen": 558940160, "step": 8530, "train_runtime": 133434.1312, "train_tokens_per_second": 4188.884 }, { "epoch": 2.255116862537964, "grad_norm": 0.359375, "learning_rate": 0.00021074092299614827, "loss": 1.778, "num_input_tokens_seen": 559595520, "step": 8540, "train_runtime": 133555.8445, "train_tokens_per_second": 4189.974 }, { "epoch": 2.257757823847881, "grad_norm": 0.34765625, "learning_rate": 0.00021021310624796269, "loss": 1.8027, "num_input_tokens_seen": 560250880, "step": 8550, "train_runtime": 133674.2163, "train_tokens_per_second": 4191.166 }, { "epoch": 2.2603987851577974, "grad_norm": 0.361328125, "learning_rate": 0.00020968547139389577, "loss": 1.8377, "num_input_tokens_seen": 560906240, "step": 8560, "train_runtime": 133793.3182, "train_tokens_per_second": 4192.334 }, { "epoch": 2.2630397464677143, "grad_norm": 0.353515625, "learning_rate": 0.00020915802084614085, "loss": 1.7932, "num_input_tokens_seen": 561561600, "step": 8570, "train_runtime": 133912.0166, "train_tokens_per_second": 4193.512 }, { "epoch": 2.265680707777631, "grad_norm": 0.34765625, "learning_rate": 0.00020863075701604844, "loss": 1.7853, "num_input_tokens_seen": 562216960, "step": 8580, "train_runtime": 134031.1527, "train_tokens_per_second": 4194.674 }, { "epoch": 2.268321669087548, "grad_norm": 0.37890625, "learning_rate": 0.00020810368231411564, "loss": 1.8313, "num_input_tokens_seen": 562872320, "step": 8590, "train_runtime": 134149.8193, "train_tokens_per_second": 4195.849 }, { "epoch": 2.270962630397465, "grad_norm": 0.341796875, "learning_rate": 0.00020757679914997502, "loss": 1.7764, "num_input_tokens_seen": 563527680, "step": 8600, "train_runtime": 134270.6757, "train_tokens_per_second": 4196.953 }, { "epoch": 2.2736035917073814, "grad_norm": 0.34375, "learning_rate": 0.00020705010993238304, "loss": 1.7626, "num_input_tokens_seen": 564183040, "step": 8610, "train_runtime": 134389.173, "train_tokens_per_second": 4198.129 }, { "epoch": 2.2762445530172983, "grad_norm": 0.349609375, "learning_rate": 0.00020652361706920995, "loss": 1.7221, "num_input_tokens_seen": 564838400, "step": 8620, "train_runtime": 134508.1004, "train_tokens_per_second": 4199.289 }, { "epoch": 2.2788855143272153, "grad_norm": 0.3671875, "learning_rate": 0.0002059973229674282, "loss": 1.7946, "num_input_tokens_seen": 565493760, "step": 8630, "train_runtime": 134626.5271, "train_tokens_per_second": 4200.463 }, { "epoch": 2.281526475637132, "grad_norm": 0.373046875, "learning_rate": 0.00020547123003310133, "loss": 1.7064, "num_input_tokens_seen": 566149120, "step": 8640, "train_runtime": 134745.557, "train_tokens_per_second": 4201.616 }, { "epoch": 2.284167436947049, "grad_norm": 0.359375, "learning_rate": 0.00020494534067137351, "loss": 1.7773, "num_input_tokens_seen": 566804480, "step": 8650, "train_runtime": 134864.1759, "train_tokens_per_second": 4202.78 }, { "epoch": 2.2868083982569654, "grad_norm": 0.337890625, "learning_rate": 0.00020441965728645826, "loss": 1.7799, "num_input_tokens_seen": 567459840, "step": 8660, "train_runtime": 134983.5623, "train_tokens_per_second": 4203.918 }, { "epoch": 2.2894493595668823, "grad_norm": 0.353515625, "learning_rate": 0.00020389418228162698, "loss": 1.7674, "num_input_tokens_seen": 568115200, "step": 8670, "train_runtime": 135102.3609, "train_tokens_per_second": 4205.072 }, { "epoch": 2.2920903208767993, "grad_norm": 0.345703125, "learning_rate": 0.0002033689180591989, "loss": 1.8209, "num_input_tokens_seen": 568770560, "step": 8680, "train_runtime": 135221.6573, "train_tokens_per_second": 4206.209 }, { "epoch": 2.294731282186716, "grad_norm": 0.337890625, "learning_rate": 0.00020284386702052948, "loss": 1.7821, "num_input_tokens_seen": 569425920, "step": 8690, "train_runtime": 135341.5766, "train_tokens_per_second": 4207.324 }, { "epoch": 2.297372243496633, "grad_norm": 0.357421875, "learning_rate": 0.00020231903156599934, "loss": 1.7809, "num_input_tokens_seen": 570081280, "step": 8700, "train_runtime": 135460.9632, "train_tokens_per_second": 4208.454 }, { "epoch": 2.3000132048065494, "grad_norm": 0.3515625, "learning_rate": 0.00020179441409500388, "loss": 1.8015, "num_input_tokens_seen": 570736640, "step": 8710, "train_runtime": 135580.0584, "train_tokens_per_second": 4209.591 }, { "epoch": 2.3026541661164663, "grad_norm": 0.35546875, "learning_rate": 0.00020127001700594163, "loss": 1.7868, "num_input_tokens_seen": 571392000, "step": 8720, "train_runtime": 135699.7542, "train_tokens_per_second": 4210.708 }, { "epoch": 2.3052951274263833, "grad_norm": 0.349609375, "learning_rate": 0.00020074584269620378, "loss": 1.7298, "num_input_tokens_seen": 572047360, "step": 8730, "train_runtime": 135819.4122, "train_tokens_per_second": 4211.823 }, { "epoch": 2.3079360887363, "grad_norm": 0.373046875, "learning_rate": 0.00020022189356216303, "loss": 1.8011, "num_input_tokens_seen": 572702720, "step": 8740, "train_runtime": 135938.2393, "train_tokens_per_second": 4212.963 }, { "epoch": 2.310577050046217, "grad_norm": 0.345703125, "learning_rate": 0.0001996981719991625, "loss": 1.8052, "num_input_tokens_seen": 573358080, "step": 8750, "train_runtime": 136057.4104, "train_tokens_per_second": 4214.089 }, { "epoch": 2.313218011356134, "grad_norm": 0.35546875, "learning_rate": 0.00019917468040150498, "loss": 1.7657, "num_input_tokens_seen": 574013440, "step": 8760, "train_runtime": 136179.4226, "train_tokens_per_second": 4215.126 }, { "epoch": 2.3158589726660503, "grad_norm": 0.357421875, "learning_rate": 0.00019865142116244223, "loss": 1.7685, "num_input_tokens_seen": 574668800, "step": 8770, "train_runtime": 136299.568, "train_tokens_per_second": 4216.219 }, { "epoch": 2.3184999339759673, "grad_norm": 0.353515625, "learning_rate": 0.0001981283966741631, "loss": 1.7421, "num_input_tokens_seen": 575324160, "step": 8780, "train_runtime": 136422.5383, "train_tokens_per_second": 4217.222 }, { "epoch": 2.3211408952858843, "grad_norm": 0.345703125, "learning_rate": 0.0001976056093277838, "loss": 1.7527, "num_input_tokens_seen": 575979520, "step": 8790, "train_runtime": 136541.3031, "train_tokens_per_second": 4218.354 }, { "epoch": 2.323781856595801, "grad_norm": 0.357421875, "learning_rate": 0.0001970830615133362, "loss": 1.7837, "num_input_tokens_seen": 576634880, "step": 8800, "train_runtime": 136661.341, "train_tokens_per_second": 4219.444 }, { "epoch": 2.326422817905718, "grad_norm": 0.353515625, "learning_rate": 0.000196560755619757, "loss": 1.7746, "num_input_tokens_seen": 577290240, "step": 8810, "train_runtime": 136781.0065, "train_tokens_per_second": 4220.544 }, { "epoch": 2.3290637792156343, "grad_norm": 0.34375, "learning_rate": 0.0001960386940348771, "loss": 1.7475, "num_input_tokens_seen": 577945600, "step": 8820, "train_runtime": 136900.2296, "train_tokens_per_second": 4221.655 }, { "epoch": 2.3317047405255513, "grad_norm": 0.357421875, "learning_rate": 0.00019551687914541021, "loss": 1.7873, "num_input_tokens_seen": 578600960, "step": 8830, "train_runtime": 137020.1599, "train_tokens_per_second": 4222.743 }, { "epoch": 2.3343457018354683, "grad_norm": 0.359375, "learning_rate": 0.00019499531333694257, "loss": 1.7672, "num_input_tokens_seen": 579256320, "step": 8840, "train_runtime": 137140.2882, "train_tokens_per_second": 4223.823 }, { "epoch": 2.336986663145385, "grad_norm": 0.349609375, "learning_rate": 0.00019447399899392154, "loss": 1.8065, "num_input_tokens_seen": 579911680, "step": 8850, "train_runtime": 137259.1949, "train_tokens_per_second": 4224.939 }, { "epoch": 2.339627624455302, "grad_norm": 0.3671875, "learning_rate": 0.00019395293849964465, "loss": 1.7768, "num_input_tokens_seen": 580567040, "step": 8860, "train_runtime": 137378.606, "train_tokens_per_second": 4226.037 }, { "epoch": 2.3422685857652183, "grad_norm": 0.3515625, "learning_rate": 0.00019343213423624923, "loss": 1.7421, "num_input_tokens_seen": 581222400, "step": 8870, "train_runtime": 137497.6372, "train_tokens_per_second": 4227.145 }, { "epoch": 2.3449095470751353, "grad_norm": 0.359375, "learning_rate": 0.00019291158858470112, "loss": 1.8188, "num_input_tokens_seen": 581877760, "step": 8880, "train_runtime": 137616.1183, "train_tokens_per_second": 4228.267 }, { "epoch": 2.3475505083850523, "grad_norm": 0.34765625, "learning_rate": 0.0001923913039247836, "loss": 1.7275, "num_input_tokens_seen": 582533120, "step": 8890, "train_runtime": 137735.6367, "train_tokens_per_second": 4229.357 }, { "epoch": 2.350191469694969, "grad_norm": 0.35546875, "learning_rate": 0.00019187128263508713, "loss": 1.7716, "num_input_tokens_seen": 583188480, "step": 8900, "train_runtime": 137857.7389, "train_tokens_per_second": 4230.364 }, { "epoch": 2.352832431004886, "grad_norm": 0.333984375, "learning_rate": 0.00019135152709299792, "loss": 1.7503, "num_input_tokens_seen": 583843840, "step": 8910, "train_runtime": 137976.5331, "train_tokens_per_second": 4231.472 }, { "epoch": 2.355473392314803, "grad_norm": 0.37890625, "learning_rate": 0.00019083203967468727, "loss": 1.7817, "num_input_tokens_seen": 584499200, "step": 8920, "train_runtime": 138096.6407, "train_tokens_per_second": 4232.537 }, { "epoch": 2.3581143536247193, "grad_norm": 0.34765625, "learning_rate": 0.00019031282275510086, "loss": 1.7261, "num_input_tokens_seen": 585154560, "step": 8930, "train_runtime": 138215.4588, "train_tokens_per_second": 4233.64 }, { "epoch": 2.3607553149346363, "grad_norm": 0.34765625, "learning_rate": 0.0001897938787079474, "loss": 1.7492, "num_input_tokens_seen": 585809920, "step": 8940, "train_runtime": 138334.5922, "train_tokens_per_second": 4234.732 }, { "epoch": 2.363396276244553, "grad_norm": 0.35546875, "learning_rate": 0.00018927520990568835, "loss": 1.754, "num_input_tokens_seen": 586465280, "step": 8950, "train_runtime": 138454.3847, "train_tokens_per_second": 4235.801 }, { "epoch": 2.36603723755447, "grad_norm": 0.369140625, "learning_rate": 0.00018875681871952695, "loss": 1.7688, "num_input_tokens_seen": 587120640, "step": 8960, "train_runtime": 138573.7515, "train_tokens_per_second": 4236.882 }, { "epoch": 2.368678198864387, "grad_norm": 0.3515625, "learning_rate": 0.00018823870751939688, "loss": 1.7996, "num_input_tokens_seen": 587776000, "step": 8970, "train_runtime": 138693.7682, "train_tokens_per_second": 4237.941 }, { "epoch": 2.3713191601743033, "grad_norm": 0.35546875, "learning_rate": 0.00018772087867395206, "loss": 1.7875, "num_input_tokens_seen": 588431360, "step": 8980, "train_runtime": 138813.0599, "train_tokens_per_second": 4239.02 }, { "epoch": 2.3739601214842203, "grad_norm": 0.34375, "learning_rate": 0.00018720333455055565, "loss": 1.7478, "num_input_tokens_seen": 589086720, "step": 8990, "train_runtime": 138932.062, "train_tokens_per_second": 4240.106 }, { "epoch": 2.376601082794137, "grad_norm": 0.349609375, "learning_rate": 0.0001866860775152689, "loss": 1.7108, "num_input_tokens_seen": 589742080, "step": 9000, "train_runtime": 139052.1789, "train_tokens_per_second": 4241.157 }, { "epoch": 2.379242044104054, "grad_norm": 0.349609375, "learning_rate": 0.00018616910993284066, "loss": 1.762, "num_input_tokens_seen": 590397440, "step": 9010, "train_runtime": 139183.8467, "train_tokens_per_second": 4241.853 }, { "epoch": 2.381883005413971, "grad_norm": 0.34375, "learning_rate": 0.00018565243416669673, "loss": 1.7225, "num_input_tokens_seen": 591052800, "step": 9020, "train_runtime": 139304.0618, "train_tokens_per_second": 4242.897 }, { "epoch": 2.3845239667238873, "grad_norm": 0.3671875, "learning_rate": 0.00018513605257892832, "loss": 1.7293, "num_input_tokens_seen": 591708160, "step": 9030, "train_runtime": 139422.0794, "train_tokens_per_second": 4244.006 }, { "epoch": 2.3871649280338043, "grad_norm": 0.3515625, "learning_rate": 0.00018461996753028225, "loss": 1.8099, "num_input_tokens_seen": 592363520, "step": 9040, "train_runtime": 139540.6114, "train_tokens_per_second": 4245.098 }, { "epoch": 2.3898058893437213, "grad_norm": 0.34765625, "learning_rate": 0.00018410418138014927, "loss": 1.7611, "num_input_tokens_seen": 593018880, "step": 9050, "train_runtime": 139659.2386, "train_tokens_per_second": 4246.184 }, { "epoch": 2.392446850653638, "grad_norm": 0.341796875, "learning_rate": 0.00018358869648655383, "loss": 1.7364, "num_input_tokens_seen": 593674240, "step": 9060, "train_runtime": 139777.7025, "train_tokens_per_second": 4247.274 }, { "epoch": 2.395087811963555, "grad_norm": 0.357421875, "learning_rate": 0.00018307351520614317, "loss": 1.7905, "num_input_tokens_seen": 594329600, "step": 9070, "train_runtime": 139896.6241, "train_tokens_per_second": 4248.348 }, { "epoch": 2.397728773273472, "grad_norm": 0.369140625, "learning_rate": 0.0001825586398941763, "loss": 1.7841, "num_input_tokens_seen": 594984960, "step": 9080, "train_runtime": 140015.3229, "train_tokens_per_second": 4249.427 }, { "epoch": 2.4003697345833883, "grad_norm": 0.337890625, "learning_rate": 0.0001820440729045137, "loss": 1.7908, "num_input_tokens_seen": 595640320, "step": 9090, "train_runtime": 140134.781, "train_tokens_per_second": 4250.482 }, { "epoch": 2.4030106958933053, "grad_norm": 0.361328125, "learning_rate": 0.00018152981658960612, "loss": 1.7509, "num_input_tokens_seen": 596295680, "step": 9100, "train_runtime": 140253.068, "train_tokens_per_second": 4251.57 }, { "epoch": 2.405651657203222, "grad_norm": 0.365234375, "learning_rate": 0.0001810158733004839, "loss": 1.7322, "num_input_tokens_seen": 596951040, "step": 9110, "train_runtime": 140372.2313, "train_tokens_per_second": 4252.629 }, { "epoch": 2.408292618513139, "grad_norm": 0.337890625, "learning_rate": 0.00018050224538674654, "loss": 1.7181, "num_input_tokens_seen": 597606400, "step": 9120, "train_runtime": 140495.1037, "train_tokens_per_second": 4253.575 }, { "epoch": 2.410933579823056, "grad_norm": 0.33984375, "learning_rate": 0.00017998893519655172, "loss": 1.7501, "num_input_tokens_seen": 598261760, "step": 9130, "train_runtime": 140616.0523, "train_tokens_per_second": 4254.577 }, { "epoch": 2.4135745411329723, "grad_norm": 0.357421875, "learning_rate": 0.00017947594507660425, "loss": 1.7542, "num_input_tokens_seen": 598917120, "step": 9140, "train_runtime": 140734.2068, "train_tokens_per_second": 4255.661 }, { "epoch": 2.4162155024428893, "grad_norm": 0.349609375, "learning_rate": 0.00017896327737214606, "loss": 1.7611, "num_input_tokens_seen": 599572480, "step": 9150, "train_runtime": 140851.816, "train_tokens_per_second": 4256.761 }, { "epoch": 2.418856463752806, "grad_norm": 0.3515625, "learning_rate": 0.00017845093442694503, "loss": 1.7998, "num_input_tokens_seen": 600227840, "step": 9160, "train_runtime": 140971.7527, "train_tokens_per_second": 4257.788 }, { "epoch": 2.421497425062723, "grad_norm": 0.341796875, "learning_rate": 0.00017793891858328405, "loss": 1.7349, "num_input_tokens_seen": 600883200, "step": 9170, "train_runtime": 141090.3309, "train_tokens_per_second": 4258.855 }, { "epoch": 2.42413838637264, "grad_norm": 0.365234375, "learning_rate": 0.00017742723218195107, "loss": 1.6993, "num_input_tokens_seen": 601538560, "step": 9180, "train_runtime": 141208.2383, "train_tokens_per_second": 4259.94 }, { "epoch": 2.4267793476825563, "grad_norm": 0.3359375, "learning_rate": 0.00017691587756222735, "loss": 1.7373, "num_input_tokens_seen": 602193920, "step": 9190, "train_runtime": 141327.2411, "train_tokens_per_second": 4260.99 }, { "epoch": 2.4294203089924733, "grad_norm": 0.357421875, "learning_rate": 0.0001764048570618778, "loss": 1.7174, "num_input_tokens_seen": 602849280, "step": 9200, "train_runtime": 141446.1423, "train_tokens_per_second": 4262.041 }, { "epoch": 2.4320612703023903, "grad_norm": 0.341796875, "learning_rate": 0.0001758941730171398, "loss": 1.7377, "num_input_tokens_seen": 603504640, "step": 9210, "train_runtime": 141564.9698, "train_tokens_per_second": 4263.093 }, { "epoch": 2.434702231612307, "grad_norm": 0.353515625, "learning_rate": 0.00017538382776271212, "loss": 1.7213, "num_input_tokens_seen": 604160000, "step": 9220, "train_runtime": 141684.1453, "train_tokens_per_second": 4264.133 }, { "epoch": 2.437343192922224, "grad_norm": 0.34765625, "learning_rate": 0.0001748738236317452, "loss": 1.7713, "num_input_tokens_seen": 604815360, "step": 9230, "train_runtime": 141803.3338, "train_tokens_per_second": 4265.17 }, { "epoch": 2.4399841542321403, "grad_norm": 0.34765625, "learning_rate": 0.0001743641629558298, "loss": 1.7932, "num_input_tokens_seen": 605470720, "step": 9240, "train_runtime": 141922.5822, "train_tokens_per_second": 4266.204 }, { "epoch": 2.4426251155420573, "grad_norm": 0.361328125, "learning_rate": 0.00017385484806498627, "loss": 1.7473, "num_input_tokens_seen": 606126080, "step": 9250, "train_runtime": 142041.8718, "train_tokens_per_second": 4267.235 }, { "epoch": 2.4452660768519743, "grad_norm": 0.36328125, "learning_rate": 0.00017334588128765444, "loss": 1.8083, "num_input_tokens_seen": 606781440, "step": 9260, "train_runtime": 142162.8118, "train_tokens_per_second": 4268.215 }, { "epoch": 2.447907038161891, "grad_norm": 0.365234375, "learning_rate": 0.00017283726495068253, "loss": 1.8204, "num_input_tokens_seen": 607436800, "step": 9270, "train_runtime": 142281.8887, "train_tokens_per_second": 4269.249 }, { "epoch": 2.450547999471808, "grad_norm": 0.36328125, "learning_rate": 0.00017232900137931662, "loss": 1.7764, "num_input_tokens_seen": 608092160, "step": 9280, "train_runtime": 142401.1413, "train_tokens_per_second": 4270.276 }, { "epoch": 2.4531889607817243, "grad_norm": 0.369140625, "learning_rate": 0.00017182109289719022, "loss": 1.8187, "num_input_tokens_seen": 608747520, "step": 9290, "train_runtime": 142520.2632, "train_tokens_per_second": 4271.305 }, { "epoch": 2.4558299220916413, "grad_norm": 0.359375, "learning_rate": 0.00017131354182631315, "loss": 1.7767, "num_input_tokens_seen": 609402880, "step": 9300, "train_runtime": 142639.1497, "train_tokens_per_second": 4272.34 }, { "epoch": 2.4584708834015583, "grad_norm": 0.34765625, "learning_rate": 0.0001708063504870615, "loss": 1.7341, "num_input_tokens_seen": 610058240, "step": 9310, "train_runtime": 142758.107, "train_tokens_per_second": 4273.37 }, { "epoch": 2.461111844711475, "grad_norm": 0.349609375, "learning_rate": 0.00017029952119816688, "loss": 1.765, "num_input_tokens_seen": 610713600, "step": 9320, "train_runtime": 142876.8107, "train_tokens_per_second": 4274.407 }, { "epoch": 2.463752806021392, "grad_norm": 0.353515625, "learning_rate": 0.00016979305627670533, "loss": 1.7883, "num_input_tokens_seen": 611368960, "step": 9330, "train_runtime": 142996.1941, "train_tokens_per_second": 4275.421 }, { "epoch": 2.4663937673313088, "grad_norm": 0.35546875, "learning_rate": 0.00016928695803808738, "loss": 1.7812, "num_input_tokens_seen": 612024320, "step": 9340, "train_runtime": 143114.962, "train_tokens_per_second": 4276.452 }, { "epoch": 2.4690347286412253, "grad_norm": 0.35546875, "learning_rate": 0.00016878122879604725, "loss": 1.7889, "num_input_tokens_seen": 612679680, "step": 9350, "train_runtime": 143234.1143, "train_tokens_per_second": 4277.47 }, { "epoch": 2.4716756899511423, "grad_norm": 0.3671875, "learning_rate": 0.00016827587086263194, "loss": 1.7724, "num_input_tokens_seen": 613335040, "step": 9360, "train_runtime": 143357.6622, "train_tokens_per_second": 4278.355 }, { "epoch": 2.4743166512610593, "grad_norm": 0.361328125, "learning_rate": 0.00016777088654819117, "loss": 1.7803, "num_input_tokens_seen": 613990400, "step": 9370, "train_runtime": 143476.0368, "train_tokens_per_second": 4279.393 }, { "epoch": 2.476957612570976, "grad_norm": 0.353515625, "learning_rate": 0.00016726627816136664, "loss": 1.7175, "num_input_tokens_seen": 614645760, "step": 9380, "train_runtime": 143594.8596, "train_tokens_per_second": 4280.416 }, { "epoch": 2.4795985738808928, "grad_norm": 0.375, "learning_rate": 0.00016676204800908107, "loss": 1.8176, "num_input_tokens_seen": 615301120, "step": 9390, "train_runtime": 143713.5326, "train_tokens_per_second": 4281.442 }, { "epoch": 2.4822395351908093, "grad_norm": 0.337890625, "learning_rate": 0.0001662581983965284, "loss": 1.7688, "num_input_tokens_seen": 615956480, "step": 9400, "train_runtime": 143831.839, "train_tokens_per_second": 4282.477 }, { "epoch": 2.4848804965007263, "grad_norm": 0.337890625, "learning_rate": 0.00016575473162716247, "loss": 1.7741, "num_input_tokens_seen": 616611840, "step": 9410, "train_runtime": 143950.6805, "train_tokens_per_second": 4283.494 }, { "epoch": 2.4875214578106433, "grad_norm": 0.3515625, "learning_rate": 0.0001652516500026872, "loss": 1.8501, "num_input_tokens_seen": 617267200, "step": 9420, "train_runtime": 144069.9304, "train_tokens_per_second": 4284.497 }, { "epoch": 2.49016241912056, "grad_norm": 0.345703125, "learning_rate": 0.00016474895582304562, "loss": 1.6814, "num_input_tokens_seen": 617922560, "step": 9430, "train_runtime": 144188.5969, "train_tokens_per_second": 4285.516 }, { "epoch": 2.492803380430477, "grad_norm": 0.384765625, "learning_rate": 0.00016424665138640944, "loss": 1.7343, "num_input_tokens_seen": 618577920, "step": 9440, "train_runtime": 144306.7939, "train_tokens_per_second": 4286.547 }, { "epoch": 2.4954443417403933, "grad_norm": 0.361328125, "learning_rate": 0.0001637447389891686, "loss": 1.7792, "num_input_tokens_seen": 619233280, "step": 9450, "train_runtime": 144425.3384, "train_tokens_per_second": 4287.567 }, { "epoch": 2.4980853030503103, "grad_norm": 0.373046875, "learning_rate": 0.00016324322092592088, "loss": 1.7462, "num_input_tokens_seen": 619888640, "step": 9460, "train_runtime": 144545.926, "train_tokens_per_second": 4288.524 }, { "epoch": 2.5007262643602273, "grad_norm": 0.359375, "learning_rate": 0.0001627420994894609, "loss": 1.7535, "num_input_tokens_seen": 620544000, "step": 9470, "train_runtime": 144665.4447, "train_tokens_per_second": 4289.511 }, { "epoch": 2.503367225670144, "grad_norm": 0.369140625, "learning_rate": 0.00016224137697077047, "loss": 1.7452, "num_input_tokens_seen": 621199360, "step": 9480, "train_runtime": 144784.5654, "train_tokens_per_second": 4290.508 }, { "epoch": 2.506008186980061, "grad_norm": 0.345703125, "learning_rate": 0.00016174105565900748, "loss": 1.7659, "num_input_tokens_seen": 621854720, "step": 9490, "train_runtime": 144903.4354, "train_tokens_per_second": 4291.511 }, { "epoch": 2.5086491482899778, "grad_norm": 0.34765625, "learning_rate": 0.00016124113784149547, "loss": 1.7674, "num_input_tokens_seen": 622510080, "step": 9500, "train_runtime": 145021.8183, "train_tokens_per_second": 4292.527 }, { "epoch": 2.5112901095998943, "grad_norm": 0.341796875, "learning_rate": 0.00016074162580371356, "loss": 1.8104, "num_input_tokens_seen": 623165440, "step": 9510, "train_runtime": 145154.1453, "train_tokens_per_second": 4293.129 }, { "epoch": 2.5139310709098113, "grad_norm": 0.359375, "learning_rate": 0.00016024252182928562, "loss": 1.8579, "num_input_tokens_seen": 623820800, "step": 9520, "train_runtime": 145274.2883, "train_tokens_per_second": 4294.09 }, { "epoch": 2.5165720322197282, "grad_norm": 0.34375, "learning_rate": 0.00015974382819996995, "loss": 1.731, "num_input_tokens_seen": 624476160, "step": 9530, "train_runtime": 145393.8544, "train_tokens_per_second": 4295.066 }, { "epoch": 2.519212993529645, "grad_norm": 0.365234375, "learning_rate": 0.0001592455471956492, "loss": 1.7521, "num_input_tokens_seen": 625131520, "step": 9540, "train_runtime": 145512.4505, "train_tokens_per_second": 4296.069 }, { "epoch": 2.5218539548395618, "grad_norm": 0.35546875, "learning_rate": 0.00015874768109431898, "loss": 1.8008, "num_input_tokens_seen": 625786880, "step": 9550, "train_runtime": 145630.963, "train_tokens_per_second": 4297.073 }, { "epoch": 2.5244949161494783, "grad_norm": 0.359375, "learning_rate": 0.00015825023217207868, "loss": 1.7386, "num_input_tokens_seen": 626442240, "step": 9560, "train_runtime": 145750.7145, "train_tokens_per_second": 4298.039 }, { "epoch": 2.5271358774593953, "grad_norm": 0.34765625, "learning_rate": 0.00015775320270312027, "loss": 1.7579, "num_input_tokens_seen": 627097600, "step": 9570, "train_runtime": 145869.0005, "train_tokens_per_second": 4299.046 }, { "epoch": 2.529776838769312, "grad_norm": 0.353515625, "learning_rate": 0.00015725659495971795, "loss": 1.7801, "num_input_tokens_seen": 627752960, "step": 9580, "train_runtime": 145988.3153, "train_tokens_per_second": 4300.022 }, { "epoch": 2.532417800079229, "grad_norm": 0.35546875, "learning_rate": 0.00015676041121221807, "loss": 1.6687, "num_input_tokens_seen": 628408320, "step": 9590, "train_runtime": 146107.4785, "train_tokens_per_second": 4301.0 }, { "epoch": 2.5350587613891458, "grad_norm": 0.359375, "learning_rate": 0.00015626465372902865, "loss": 1.7341, "num_input_tokens_seen": 629063680, "step": 9600, "train_runtime": 146228.4682, "train_tokens_per_second": 4301.923 }, { "epoch": 2.5376997226990623, "grad_norm": 0.390625, "learning_rate": 0.0001557693247766088, "loss": 1.7433, "num_input_tokens_seen": 629719040, "step": 9610, "train_runtime": 146347.5804, "train_tokens_per_second": 4302.9 }, { "epoch": 2.5403406840089793, "grad_norm": 0.357421875, "learning_rate": 0.00015527442661945857, "loss": 1.729, "num_input_tokens_seen": 630374400, "step": 9620, "train_runtime": 146466.9131, "train_tokens_per_second": 4303.869 }, { "epoch": 2.5429816453188963, "grad_norm": 0.353515625, "learning_rate": 0.00015477996152010859, "loss": 1.7704, "num_input_tokens_seen": 631029760, "step": 9630, "train_runtime": 146586.2223, "train_tokens_per_second": 4304.837 }, { "epoch": 2.545622606628813, "grad_norm": 0.361328125, "learning_rate": 0.00015428593173910955, "loss": 1.8101, "num_input_tokens_seen": 631685120, "step": 9640, "train_runtime": 146705.5529, "train_tokens_per_second": 4305.802 }, { "epoch": 2.5482635679387298, "grad_norm": 0.345703125, "learning_rate": 0.00015379233953502226, "loss": 1.7623, "num_input_tokens_seen": 632340480, "step": 9650, "train_runtime": 146824.4086, "train_tokens_per_second": 4306.78 }, { "epoch": 2.5509045292486467, "grad_norm": 0.35546875, "learning_rate": 0.00015329918716440664, "loss": 1.7418, "num_input_tokens_seen": 632995840, "step": 9660, "train_runtime": 146944.2577, "train_tokens_per_second": 4307.728 }, { "epoch": 2.5535454905585633, "grad_norm": 0.365234375, "learning_rate": 0.00015280647688181216, "loss": 1.7479, "num_input_tokens_seen": 633651200, "step": 9670, "train_runtime": 147065.0312, "train_tokens_per_second": 4308.646 }, { "epoch": 2.5561864518684803, "grad_norm": 0.3515625, "learning_rate": 0.00015231421093976716, "loss": 1.721, "num_input_tokens_seen": 634306560, "step": 9680, "train_runtime": 147183.3892, "train_tokens_per_second": 4309.634 }, { "epoch": 2.558827413178397, "grad_norm": 0.345703125, "learning_rate": 0.00015182239158876833, "loss": 1.8321, "num_input_tokens_seen": 634961920, "step": 9690, "train_runtime": 147302.1551, "train_tokens_per_second": 4310.608 }, { "epoch": 2.5614683744883138, "grad_norm": 0.333984375, "learning_rate": 0.00015133102107727094, "loss": 1.7363, "num_input_tokens_seen": 635617280, "step": 9700, "train_runtime": 147420.2693, "train_tokens_per_second": 4311.6 }, { "epoch": 2.5641093357982303, "grad_norm": 0.369140625, "learning_rate": 0.00015084010165167827, "loss": 1.6908, "num_input_tokens_seen": 636272640, "step": 9710, "train_runtime": 147539.2396, "train_tokens_per_second": 4312.566 }, { "epoch": 2.5667502971081473, "grad_norm": 0.34765625, "learning_rate": 0.00015034963555633118, "loss": 1.7516, "num_input_tokens_seen": 636928000, "step": 9720, "train_runtime": 147658.2838, "train_tokens_per_second": 4313.527 }, { "epoch": 2.5693912584180643, "grad_norm": 0.3515625, "learning_rate": 0.00014985962503349825, "loss": 1.7675, "num_input_tokens_seen": 637583360, "step": 9730, "train_runtime": 147777.1215, "train_tokens_per_second": 4314.493 }, { "epoch": 2.572032219727981, "grad_norm": 0.365234375, "learning_rate": 0.0001493700723233653, "loss": 1.7258, "num_input_tokens_seen": 638238720, "step": 9740, "train_runtime": 147896.5141, "train_tokens_per_second": 4315.441 }, { "epoch": 2.5746731810378978, "grad_norm": 0.359375, "learning_rate": 0.00014888097966402487, "loss": 1.7838, "num_input_tokens_seen": 638894080, "step": 9750, "train_runtime": 148015.0797, "train_tokens_per_second": 4316.412 }, { "epoch": 2.5773141423478148, "grad_norm": 0.37890625, "learning_rate": 0.00014839234929146672, "loss": 1.8084, "num_input_tokens_seen": 639549440, "step": 9760, "train_runtime": 148133.9696, "train_tokens_per_second": 4317.372 }, { "epoch": 2.5799551036577313, "grad_norm": 0.34765625, "learning_rate": 0.00014790418343956673, "loss": 1.7442, "num_input_tokens_seen": 640204800, "step": 9770, "train_runtime": 148252.6304, "train_tokens_per_second": 4318.337 }, { "epoch": 2.5825960649676483, "grad_norm": 0.353515625, "learning_rate": 0.00014741648434007747, "loss": 1.7685, "num_input_tokens_seen": 640860160, "step": 9780, "train_runtime": 148373.1837, "train_tokens_per_second": 4319.245 }, { "epoch": 2.5852370262775652, "grad_norm": 0.349609375, "learning_rate": 0.0001469292542226176, "loss": 1.7935, "num_input_tokens_seen": 641515520, "step": 9790, "train_runtime": 148492.2266, "train_tokens_per_second": 4320.196 }, { "epoch": 2.5878779875874818, "grad_norm": 0.349609375, "learning_rate": 0.00014644249531466148, "loss": 1.7533, "num_input_tokens_seen": 642170880, "step": 9800, "train_runtime": 148611.013, "train_tokens_per_second": 4321.153 }, { "epoch": 2.5905189488973988, "grad_norm": 0.375, "learning_rate": 0.00014595620984152958, "loss": 1.8039, "num_input_tokens_seen": 642826240, "step": 9810, "train_runtime": 148729.6877, "train_tokens_per_second": 4322.111 }, { "epoch": 2.5931599102073157, "grad_norm": 0.34765625, "learning_rate": 0.00014547040002637775, "loss": 1.8059, "num_input_tokens_seen": 643481600, "step": 9820, "train_runtime": 148848.1016, "train_tokens_per_second": 4323.076 }, { "epoch": 2.5958008715172323, "grad_norm": 0.365234375, "learning_rate": 0.00014498506809018725, "loss": 1.7737, "num_input_tokens_seen": 644136960, "step": 9830, "train_runtime": 148967.8001, "train_tokens_per_second": 4324.001 }, { "epoch": 2.5984418328271492, "grad_norm": 0.34375, "learning_rate": 0.00014450021625175466, "loss": 1.7851, "num_input_tokens_seen": 644792320, "step": 9840, "train_runtime": 149089.5883, "train_tokens_per_second": 4324.865 }, { "epoch": 2.601082794137066, "grad_norm": 0.369140625, "learning_rate": 0.00014401584672768192, "loss": 1.7162, "num_input_tokens_seen": 645447680, "step": 9850, "train_runtime": 149208.5088, "train_tokens_per_second": 4325.81 }, { "epoch": 2.6037237554469828, "grad_norm": 0.3671875, "learning_rate": 0.0001435319617323656, "loss": 1.783, "num_input_tokens_seen": 646103040, "step": 9860, "train_runtime": 149327.0397, "train_tokens_per_second": 4326.765 }, { "epoch": 2.6063647167568993, "grad_norm": 0.3515625, "learning_rate": 0.00014304856347798736, "loss": 1.7032, "num_input_tokens_seen": 646758400, "step": 9870, "train_runtime": 149446.2157, "train_tokens_per_second": 4327.7 }, { "epoch": 2.6090056780668163, "grad_norm": 0.3671875, "learning_rate": 0.00014256565417450356, "loss": 1.7575, "num_input_tokens_seen": 647413760, "step": 9880, "train_runtime": 149565.2883, "train_tokens_per_second": 4328.636 }, { "epoch": 2.6116466393767332, "grad_norm": 0.361328125, "learning_rate": 0.0001420832360296352, "loss": 1.7756, "num_input_tokens_seen": 648069120, "step": 9890, "train_runtime": 149684.1139, "train_tokens_per_second": 4329.578 }, { "epoch": 2.61428760068665, "grad_norm": 0.333984375, "learning_rate": 0.00014160131124885806, "loss": 1.6877, "num_input_tokens_seen": 648724480, "step": 9900, "train_runtime": 149803.3178, "train_tokens_per_second": 4330.508 }, { "epoch": 2.6169285619965668, "grad_norm": 0.359375, "learning_rate": 0.0001411198820353919, "loss": 1.7656, "num_input_tokens_seen": 649379840, "step": 9910, "train_runtime": 149921.9236, "train_tokens_per_second": 4331.453 }, { "epoch": 2.6195695233064837, "grad_norm": 0.353515625, "learning_rate": 0.00014063895059019145, "loss": 1.7506, "num_input_tokens_seen": 650035200, "step": 9920, "train_runtime": 150041.0453, "train_tokens_per_second": 4332.383 }, { "epoch": 2.6222104846164003, "grad_norm": 0.37109375, "learning_rate": 0.0001401585191119355, "loss": 1.7237, "num_input_tokens_seen": 650690560, "step": 9930, "train_runtime": 150160.1682, "train_tokens_per_second": 4333.31 }, { "epoch": 2.6248514459263173, "grad_norm": 0.353515625, "learning_rate": 0.00013967858979701712, "loss": 1.7054, "num_input_tokens_seen": 651345920, "step": 9940, "train_runtime": 150282.8071, "train_tokens_per_second": 4334.135 }, { "epoch": 2.6274924072362342, "grad_norm": 0.34765625, "learning_rate": 0.00013919916483953382, "loss": 1.7295, "num_input_tokens_seen": 652001280, "step": 9950, "train_runtime": 150402.2739, "train_tokens_per_second": 4335.049 }, { "epoch": 2.6301333685461508, "grad_norm": 0.349609375, "learning_rate": 0.00013872024643127716, "loss": 1.7022, "num_input_tokens_seen": 652656640, "step": 9960, "train_runtime": 150521.4317, "train_tokens_per_second": 4335.972 }, { "epoch": 2.6327743298560677, "grad_norm": 0.333984375, "learning_rate": 0.00013824183676172292, "loss": 1.7629, "num_input_tokens_seen": 653312000, "step": 9970, "train_runtime": 150640.7138, "train_tokens_per_second": 4336.889 }, { "epoch": 2.6354152911659843, "grad_norm": 0.359375, "learning_rate": 0.00013776393801802117, "loss": 1.7168, "num_input_tokens_seen": 653967360, "step": 9980, "train_runtime": 150759.568, "train_tokens_per_second": 4337.817 }, { "epoch": 2.6380562524759013, "grad_norm": 0.353515625, "learning_rate": 0.0001372865523849861, "loss": 1.7788, "num_input_tokens_seen": 654622720, "step": 9990, "train_runtime": 150878.3328, "train_tokens_per_second": 4338.746 }, { "epoch": 2.640697213785818, "grad_norm": 0.353515625, "learning_rate": 0.000136809682045086, "loss": 1.7543, "num_input_tokens_seen": 655278080, "step": 10000, "train_runtime": 150996.925, "train_tokens_per_second": 4339.678 }, { "epoch": 2.6433381750957348, "grad_norm": 0.35546875, "learning_rate": 0.0001363333291784337, "loss": 1.7494, "num_input_tokens_seen": 655933440, "step": 10010, "train_runtime": 151130.2799, "train_tokens_per_second": 4340.185 }, { "epoch": 2.6459791364056517, "grad_norm": 0.349609375, "learning_rate": 0.00013585749596277608, "loss": 1.6409, "num_input_tokens_seen": 656588800, "step": 10020, "train_runtime": 151251.6957, "train_tokens_per_second": 4341.034 }, { "epoch": 2.6486200977155683, "grad_norm": 0.345703125, "learning_rate": 0.00013538218457348424, "loss": 1.6976, "num_input_tokens_seen": 657244160, "step": 10030, "train_runtime": 151370.1323, "train_tokens_per_second": 4341.967 }, { "epoch": 2.6512610590254853, "grad_norm": 0.375, "learning_rate": 0.000134907397183544, "loss": 1.7745, "num_input_tokens_seen": 657899520, "step": 10040, "train_runtime": 151488.5968, "train_tokens_per_second": 4342.898 }, { "epoch": 2.6539020203354022, "grad_norm": 0.361328125, "learning_rate": 0.000134433135963545, "loss": 1.7694, "num_input_tokens_seen": 658554880, "step": 10050, "train_runtime": 151608.039, "train_tokens_per_second": 4343.799 }, { "epoch": 2.6565429816453188, "grad_norm": 0.37109375, "learning_rate": 0.00013395940308167203, "loss": 1.7231, "num_input_tokens_seen": 659210240, "step": 10060, "train_runtime": 151726.6255, "train_tokens_per_second": 4344.724 }, { "epoch": 2.6591839429552357, "grad_norm": 0.361328125, "learning_rate": 0.0001334862007036941, "loss": 1.7514, "num_input_tokens_seen": 659865600, "step": 10070, "train_runtime": 151845.6218, "train_tokens_per_second": 4345.635 }, { "epoch": 2.6618249042651527, "grad_norm": 0.359375, "learning_rate": 0.00013301353099295506, "loss": 1.7589, "num_input_tokens_seen": 660520960, "step": 10080, "train_runtime": 151964.4801, "train_tokens_per_second": 4346.548 }, { "epoch": 2.6644658655750693, "grad_norm": 0.3515625, "learning_rate": 0.00013254139611036328, "loss": 1.7025, "num_input_tokens_seen": 661176320, "step": 10090, "train_runtime": 152084.2543, "train_tokens_per_second": 4347.434 }, { "epoch": 2.6671068268849862, "grad_norm": 0.37109375, "learning_rate": 0.00013206979821438254, "loss": 1.7462, "num_input_tokens_seen": 661831680, "step": 10100, "train_runtime": 152205.3163, "train_tokens_per_second": 4348.282 }, { "epoch": 2.669747788194903, "grad_norm": 0.337890625, "learning_rate": 0.0001315987394610213, "loss": 1.7323, "num_input_tokens_seen": 662487040, "step": 10110, "train_runtime": 152325.6475, "train_tokens_per_second": 4349.15 }, { "epoch": 2.6723887495048197, "grad_norm": 0.353515625, "learning_rate": 0.00013112822200382336, "loss": 1.81, "num_input_tokens_seen": 663142400, "step": 10120, "train_runtime": 152444.303, "train_tokens_per_second": 4350.064 }, { "epoch": 2.6750297108147367, "grad_norm": 0.33203125, "learning_rate": 0.00013065824799385773, "loss": 1.7513, "num_input_tokens_seen": 663797760, "step": 10130, "train_runtime": 152563.5979, "train_tokens_per_second": 4350.958 }, { "epoch": 2.6776706721246533, "grad_norm": 0.349609375, "learning_rate": 0.00013018881957970903, "loss": 1.7439, "num_input_tokens_seen": 664453120, "step": 10140, "train_runtime": 152683.481, "train_tokens_per_second": 4351.834 }, { "epoch": 2.6803116334345702, "grad_norm": 0.33984375, "learning_rate": 0.00012971993890746781, "loss": 1.7359, "num_input_tokens_seen": 665108480, "step": 10150, "train_runtime": 152804.3205, "train_tokens_per_second": 4352.681 }, { "epoch": 2.6829525947444868, "grad_norm": 0.341796875, "learning_rate": 0.00012925160812071994, "loss": 1.7752, "num_input_tokens_seen": 665763840, "step": 10160, "train_runtime": 152924.5429, "train_tokens_per_second": 4353.545 }, { "epoch": 2.6855935560544038, "grad_norm": 0.361328125, "learning_rate": 0.000128783829360538, "loss": 1.7361, "num_input_tokens_seen": 666419200, "step": 10170, "train_runtime": 153043.2269, "train_tokens_per_second": 4354.451 }, { "epoch": 2.6882345173643207, "grad_norm": 0.353515625, "learning_rate": 0.00012831660476547046, "loss": 1.7662, "num_input_tokens_seen": 667074560, "step": 10180, "train_runtime": 153161.5733, "train_tokens_per_second": 4355.365 }, { "epoch": 2.6908754786742373, "grad_norm": 0.35546875, "learning_rate": 0.00012784993647153243, "loss": 1.6742, "num_input_tokens_seen": 667729920, "step": 10190, "train_runtime": 153280.1511, "train_tokens_per_second": 4356.271 }, { "epoch": 2.6935164399841542, "grad_norm": 0.35546875, "learning_rate": 0.0001273838266121956, "loss": 1.6736, "num_input_tokens_seen": 668385280, "step": 10200, "train_runtime": 153399.7672, "train_tokens_per_second": 4357.147 }, { "epoch": 2.696157401294071, "grad_norm": 0.359375, "learning_rate": 0.00012691827731837912, "loss": 1.7415, "num_input_tokens_seen": 669040640, "step": 10210, "train_runtime": 153519.1639, "train_tokens_per_second": 4358.027 }, { "epoch": 2.6987983626039878, "grad_norm": 0.3515625, "learning_rate": 0.000126453290718439, "loss": 1.7608, "num_input_tokens_seen": 669696000, "step": 10220, "train_runtime": 153638.739, "train_tokens_per_second": 4358.901 }, { "epoch": 2.7014393239139047, "grad_norm": 0.365234375, "learning_rate": 0.0001259888689381588, "loss": 1.8025, "num_input_tokens_seen": 670351360, "step": 10230, "train_runtime": 153759.6299, "train_tokens_per_second": 4359.736 }, { "epoch": 2.7040802852238217, "grad_norm": 0.3515625, "learning_rate": 0.00012552501410074005, "loss": 1.7883, "num_input_tokens_seen": 671006720, "step": 10240, "train_runtime": 153879.7124, "train_tokens_per_second": 4360.593 }, { "epoch": 2.7067212465337382, "grad_norm": 0.35546875, "learning_rate": 0.00012506172832679215, "loss": 1.7846, "num_input_tokens_seen": 671662080, "step": 10250, "train_runtime": 153998.9053, "train_tokens_per_second": 4361.473 }, { "epoch": 2.7093622078436552, "grad_norm": 0.345703125, "learning_rate": 0.00012459901373432333, "loss": 1.7479, "num_input_tokens_seen": 672317440, "step": 10260, "train_runtime": 154117.2327, "train_tokens_per_second": 4362.377 }, { "epoch": 2.7120031691535718, "grad_norm": 0.375, "learning_rate": 0.00012413687243872996, "loss": 1.7055, "num_input_tokens_seen": 672972800, "step": 10270, "train_runtime": 154236.4631, "train_tokens_per_second": 4363.254 }, { "epoch": 2.7146441304634887, "grad_norm": 0.373046875, "learning_rate": 0.000123675306552788, "loss": 1.7548, "num_input_tokens_seen": 673628160, "step": 10280, "train_runtime": 154355.8921, "train_tokens_per_second": 4364.123 }, { "epoch": 2.7172850917734053, "grad_norm": 0.3515625, "learning_rate": 0.00012321431818664252, "loss": 1.7346, "num_input_tokens_seen": 674283520, "step": 10290, "train_runtime": 154474.3228, "train_tokens_per_second": 4365.02 }, { "epoch": 2.7199260530833222, "grad_norm": 0.357421875, "learning_rate": 0.00012275390944779826, "loss": 1.7375, "num_input_tokens_seen": 674938880, "step": 10300, "train_runtime": 154593.0733, "train_tokens_per_second": 4365.906 }, { "epoch": 2.7225670143932392, "grad_norm": 0.361328125, "learning_rate": 0.00012229408244111045, "loss": 1.7223, "num_input_tokens_seen": 675594240, "step": 10310, "train_runtime": 154715.0506, "train_tokens_per_second": 4366.7 }, { "epoch": 2.7252079757031558, "grad_norm": 0.3515625, "learning_rate": 0.00012183483926877442, "loss": 1.7378, "num_input_tokens_seen": 676249600, "step": 10320, "train_runtime": 154836.8973, "train_tokens_per_second": 4367.496 }, { "epoch": 2.7278489370130727, "grad_norm": 0.35546875, "learning_rate": 0.00012137618203031659, "loss": 1.6316, "num_input_tokens_seen": 676904960, "step": 10330, "train_runtime": 154957.1985, "train_tokens_per_second": 4368.335 }, { "epoch": 2.7304898983229897, "grad_norm": 0.3515625, "learning_rate": 0.00012091811282258452, "loss": 1.6752, "num_input_tokens_seen": 677560320, "step": 10340, "train_runtime": 155075.758, "train_tokens_per_second": 4369.221 }, { "epoch": 2.7331308596329063, "grad_norm": 0.3515625, "learning_rate": 0.00012046063373973759, "loss": 1.7578, "num_input_tokens_seen": 678215680, "step": 10350, "train_runtime": 155195.2428, "train_tokens_per_second": 4370.08 }, { "epoch": 2.7357718209428232, "grad_norm": 0.369140625, "learning_rate": 0.00012000374687323718, "loss": 1.809, "num_input_tokens_seen": 678871040, "step": 10360, "train_runtime": 155314.0119, "train_tokens_per_second": 4370.958 }, { "epoch": 2.73841278225274, "grad_norm": 0.359375, "learning_rate": 0.00011954745431183742, "loss": 1.6975, "num_input_tokens_seen": 679526400, "step": 10370, "train_runtime": 155433.363, "train_tokens_per_second": 4371.818 }, { "epoch": 2.7410537435626567, "grad_norm": 0.3515625, "learning_rate": 0.0001190917581415753, "loss": 1.7345, "num_input_tokens_seen": 680181760, "step": 10380, "train_runtime": 155552.076, "train_tokens_per_second": 4372.695 }, { "epoch": 2.7436947048725737, "grad_norm": 0.361328125, "learning_rate": 0.00011863666044576118, "loss": 1.6923, "num_input_tokens_seen": 680837120, "step": 10390, "train_runtime": 155671.0341, "train_tokens_per_second": 4373.563 }, { "epoch": 2.7463356661824907, "grad_norm": 0.34375, "learning_rate": 0.00011818216330496981, "loss": 1.7456, "num_input_tokens_seen": 681492480, "step": 10400, "train_runtime": 155790.2815, "train_tokens_per_second": 4374.422 }, { "epoch": 2.7489766274924072, "grad_norm": 0.345703125, "learning_rate": 0.00011772826879702969, "loss": 1.6887, "num_input_tokens_seen": 682147840, "step": 10410, "train_runtime": 155908.1093, "train_tokens_per_second": 4375.32 }, { "epoch": 2.751617588802324, "grad_norm": 0.3671875, "learning_rate": 0.00011727497899701489, "loss": 1.6616, "num_input_tokens_seen": 682803200, "step": 10420, "train_runtime": 156026.2955, "train_tokens_per_second": 4376.206 }, { "epoch": 2.7542585501122407, "grad_norm": 0.359375, "learning_rate": 0.00011682229597723462, "loss": 1.7517, "num_input_tokens_seen": 683458560, "step": 10430, "train_runtime": 156146.1264, "train_tokens_per_second": 4377.045 }, { "epoch": 2.7568995114221577, "grad_norm": 0.35546875, "learning_rate": 0.00011637022180722412, "loss": 1.6757, "num_input_tokens_seen": 684113920, "step": 10440, "train_runtime": 156265.4483, "train_tokens_per_second": 4377.896 }, { "epoch": 2.7595404727320743, "grad_norm": 0.359375, "learning_rate": 0.00011591875855373515, "loss": 1.712, "num_input_tokens_seen": 684769280, "step": 10450, "train_runtime": 156384.4208, "train_tokens_per_second": 4378.756 }, { "epoch": 2.7621814340419912, "grad_norm": 0.34375, "learning_rate": 0.00011546790828072643, "loss": 1.7351, "num_input_tokens_seen": 685424640, "step": 10460, "train_runtime": 156503.5248, "train_tokens_per_second": 4379.612 }, { "epoch": 2.764822395351908, "grad_norm": 0.345703125, "learning_rate": 0.00011501767304935463, "loss": 1.7844, "num_input_tokens_seen": 686080000, "step": 10470, "train_runtime": 156623.0679, "train_tokens_per_second": 4380.453 }, { "epoch": 2.7674633566618247, "grad_norm": 0.3515625, "learning_rate": 0.00011456805491796429, "loss": 1.6747, "num_input_tokens_seen": 686735360, "step": 10480, "train_runtime": 156741.8389, "train_tokens_per_second": 4381.315 }, { "epoch": 2.7701043179717417, "grad_norm": 0.380859375, "learning_rate": 0.00011411905594207889, "loss": 1.699, "num_input_tokens_seen": 687390720, "step": 10490, "train_runtime": 156862.1683, "train_tokens_per_second": 4382.132 }, { "epoch": 2.7727452792816587, "grad_norm": 0.34765625, "learning_rate": 0.00011367067817439122, "loss": 1.7249, "num_input_tokens_seen": 688046080, "step": 10500, "train_runtime": 156981.6318, "train_tokens_per_second": 4382.972 }, { "epoch": 2.7753862405915752, "grad_norm": 0.37109375, "learning_rate": 0.00011322292366475442, "loss": 1.7308, "num_input_tokens_seen": 688701440, "step": 10510, "train_runtime": 157113.7815, "train_tokens_per_second": 4383.457 }, { "epoch": 2.778027201901492, "grad_norm": 0.361328125, "learning_rate": 0.0001127757944601717, "loss": 1.802, "num_input_tokens_seen": 689356800, "step": 10520, "train_runtime": 157233.1516, "train_tokens_per_second": 4384.297 }, { "epoch": 2.780668163211409, "grad_norm": 0.35546875, "learning_rate": 0.00011232929260478808, "loss": 1.7407, "num_input_tokens_seen": 690012160, "step": 10530, "train_runtime": 157353.0862, "train_tokens_per_second": 4385.12 }, { "epoch": 2.7833091245213257, "grad_norm": 0.37109375, "learning_rate": 0.00011188342013988026, "loss": 1.7362, "num_input_tokens_seen": 690667520, "step": 10540, "train_runtime": 157471.115, "train_tokens_per_second": 4385.995 }, { "epoch": 2.7859500858312427, "grad_norm": 0.357421875, "learning_rate": 0.00011143817910384752, "loss": 1.6644, "num_input_tokens_seen": 691322880, "step": 10550, "train_runtime": 157589.8414, "train_tokens_per_second": 4386.849 }, { "epoch": 2.7885910471411592, "grad_norm": 0.34375, "learning_rate": 0.00011099357153220268, "loss": 1.7045, "num_input_tokens_seen": 691978240, "step": 10560, "train_runtime": 157709.0865, "train_tokens_per_second": 4387.688 }, { "epoch": 2.791232008451076, "grad_norm": 0.359375, "learning_rate": 0.00011054959945756235, "loss": 1.7144, "num_input_tokens_seen": 692633600, "step": 10570, "train_runtime": 157827.2305, "train_tokens_per_second": 4388.556 }, { "epoch": 2.7938729697609928, "grad_norm": 0.37109375, "learning_rate": 0.0001101062649096378, "loss": 1.7182, "num_input_tokens_seen": 693288960, "step": 10580, "train_runtime": 157948.9031, "train_tokens_per_second": 4389.324 }, { "epoch": 2.7965139310709097, "grad_norm": 0.380859375, "learning_rate": 0.00010966356991522578, "loss": 1.76, "num_input_tokens_seen": 693944320, "step": 10590, "train_runtime": 158067.7946, "train_tokens_per_second": 4390.169 }, { "epoch": 2.7991548923808267, "grad_norm": 0.36328125, "learning_rate": 0.00010922151649819922, "loss": 1.7266, "num_input_tokens_seen": 694599680, "step": 10600, "train_runtime": 158187.267, "train_tokens_per_second": 4390.996 }, { "epoch": 2.8017958536907432, "grad_norm": 0.341796875, "learning_rate": 0.00010878010667949778, "loss": 1.7727, "num_input_tokens_seen": 695255040, "step": 10610, "train_runtime": 158306.3972, "train_tokens_per_second": 4391.832 }, { "epoch": 2.80443681500066, "grad_norm": 0.369140625, "learning_rate": 0.00010833934247711915, "loss": 1.7041, "num_input_tokens_seen": 695910400, "step": 10620, "train_runtime": 158426.0221, "train_tokens_per_second": 4392.652 }, { "epoch": 2.807077776310577, "grad_norm": 0.3671875, "learning_rate": 0.00010789922590610906, "loss": 1.7687, "num_input_tokens_seen": 696565760, "step": 10630, "train_runtime": 158545.9365, "train_tokens_per_second": 4393.463 }, { "epoch": 2.8097187376204937, "grad_norm": 0.365234375, "learning_rate": 0.00010745975897855262, "loss": 1.7769, "num_input_tokens_seen": 697221120, "step": 10640, "train_runtime": 158664.6968, "train_tokens_per_second": 4394.305 }, { "epoch": 2.8123596989304107, "grad_norm": 0.35546875, "learning_rate": 0.00010702094370356491, "loss": 1.7321, "num_input_tokens_seen": 697876480, "step": 10650, "train_runtime": 158783.4357, "train_tokens_per_second": 4395.147 }, { "epoch": 2.8150006602403277, "grad_norm": 0.380859375, "learning_rate": 0.00010658278208728184, "loss": 1.7564, "num_input_tokens_seen": 698531840, "step": 10660, "train_runtime": 158904.9205, "train_tokens_per_second": 4395.911 }, { "epoch": 2.8176416215502442, "grad_norm": 0.36328125, "learning_rate": 0.00010614527613285118, "loss": 1.7091, "num_input_tokens_seen": 699187200, "step": 10670, "train_runtime": 159023.0431, "train_tokens_per_second": 4396.767 }, { "epoch": 2.820282582860161, "grad_norm": 0.361328125, "learning_rate": 0.00010570842784042295, "loss": 1.7171, "num_input_tokens_seen": 699842560, "step": 10680, "train_runtime": 159143.3027, "train_tokens_per_second": 4397.562 }, { "epoch": 2.822923544170078, "grad_norm": 0.3671875, "learning_rate": 0.00010527223920714058, "loss": 1.6966, "num_input_tokens_seen": 700497920, "step": 10690, "train_runtime": 159260.7033, "train_tokens_per_second": 4398.435 }, { "epoch": 2.8255645054799947, "grad_norm": 0.35546875, "learning_rate": 0.00010483671222713184, "loss": 1.7572, "num_input_tokens_seen": 701153280, "step": 10700, "train_runtime": 159380.4878, "train_tokens_per_second": 4399.242 }, { "epoch": 2.8282054667899117, "grad_norm": 0.345703125, "learning_rate": 0.00010440184889149951, "loss": 1.7431, "num_input_tokens_seen": 701808640, "step": 10710, "train_runtime": 159500.2378, "train_tokens_per_second": 4400.048 }, { "epoch": 2.8308464280998282, "grad_norm": 0.36328125, "learning_rate": 0.0001039676511883123, "loss": 1.7611, "num_input_tokens_seen": 702464000, "step": 10720, "train_runtime": 159618.7691, "train_tokens_per_second": 4400.886 }, { "epoch": 2.833487389409745, "grad_norm": 0.365234375, "learning_rate": 0.00010353412110259621, "loss": 1.7719, "num_input_tokens_seen": 703119360, "step": 10730, "train_runtime": 159737.3516, "train_tokens_per_second": 4401.722 }, { "epoch": 2.8361283507196617, "grad_norm": 0.365234375, "learning_rate": 0.00010310126061632469, "loss": 1.7507, "num_input_tokens_seen": 703774720, "step": 10740, "train_runtime": 159858.7302, "train_tokens_per_second": 4402.479 }, { "epoch": 2.8387693120295787, "grad_norm": 0.35546875, "learning_rate": 0.00010266907170841006, "loss": 1.7133, "num_input_tokens_seen": 704430080, "step": 10750, "train_runtime": 159981.1148, "train_tokens_per_second": 4403.208 }, { "epoch": 2.8414102733394957, "grad_norm": 0.359375, "learning_rate": 0.00010223755635469467, "loss": 1.6978, "num_input_tokens_seen": 705085440, "step": 10760, "train_runtime": 160100.2728, "train_tokens_per_second": 4404.024 }, { "epoch": 2.8440512346494122, "grad_norm": 0.353515625, "learning_rate": 0.00010180671652794105, "loss": 1.7199, "num_input_tokens_seen": 705740800, "step": 10770, "train_runtime": 160219.6657, "train_tokens_per_second": 4404.833 }, { "epoch": 2.846692195959329, "grad_norm": 0.36328125, "learning_rate": 0.0001013765541978239, "loss": 1.7775, "num_input_tokens_seen": 706396160, "step": 10780, "train_runtime": 160338.3002, "train_tokens_per_second": 4405.661 }, { "epoch": 2.849333157269246, "grad_norm": 0.376953125, "learning_rate": 0.0001009470713309204, "loss": 1.6821, "num_input_tokens_seen": 707051520, "step": 10790, "train_runtime": 160456.5683, "train_tokens_per_second": 4406.498 }, { "epoch": 2.8519741185791627, "grad_norm": 0.36328125, "learning_rate": 0.00010051826989070142, "loss": 1.7831, "num_input_tokens_seen": 707706880, "step": 10800, "train_runtime": 160578.6246, "train_tokens_per_second": 4407.23 }, { "epoch": 2.8546150798890797, "grad_norm": 0.36328125, "learning_rate": 0.00010009015183752251, "loss": 1.7763, "num_input_tokens_seen": 708362240, "step": 10810, "train_runtime": 160698.0841, "train_tokens_per_second": 4408.032 }, { "epoch": 2.8572560411989967, "grad_norm": 0.361328125, "learning_rate": 9.966271912861502e-05, "loss": 1.7499, "num_input_tokens_seen": 709017600, "step": 10820, "train_runtime": 160816.1067, "train_tokens_per_second": 4408.872 }, { "epoch": 2.859897002508913, "grad_norm": 0.37109375, "learning_rate": 9.923597371807722e-05, "loss": 1.7093, "num_input_tokens_seen": 709672960, "step": 10830, "train_runtime": 160934.861, "train_tokens_per_second": 4409.691 }, { "epoch": 2.86253796381883, "grad_norm": 0.359375, "learning_rate": 9.880991755686508e-05, "loss": 1.7036, "num_input_tokens_seen": 710328320, "step": 10840, "train_runtime": 161060.4538, "train_tokens_per_second": 4410.321 }, { "epoch": 2.8651789251287467, "grad_norm": 0.345703125, "learning_rate": 9.838455259278358e-05, "loss": 1.6477, "num_input_tokens_seen": 710983680, "step": 10850, "train_runtime": 161192.2855, "train_tokens_per_second": 4410.78 }, { "epoch": 2.8678198864386637, "grad_norm": 0.37109375, "learning_rate": 9.795988077047768e-05, "loss": 1.7531, "num_input_tokens_seen": 711639040, "step": 10860, "train_runtime": 161325.0553, "train_tokens_per_second": 4411.212 }, { "epoch": 2.8704608477485802, "grad_norm": 0.359375, "learning_rate": 9.753590403142381e-05, "loss": 1.7527, "num_input_tokens_seen": 712294400, "step": 10870, "train_runtime": 161458.6546, "train_tokens_per_second": 4411.621 }, { "epoch": 2.873101809058497, "grad_norm": 0.380859375, "learning_rate": 9.711262431392009e-05, "loss": 1.6999, "num_input_tokens_seen": 712949760, "step": 10880, "train_runtime": 161591.4557, "train_tokens_per_second": 4412.051 }, { "epoch": 2.875742770368414, "grad_norm": 0.34765625, "learning_rate": 9.669004355307868e-05, "loss": 1.722, "num_input_tokens_seen": 713605120, "step": 10890, "train_runtime": 161729.3806, "train_tokens_per_second": 4412.341 }, { "epoch": 2.8783837316783307, "grad_norm": 0.361328125, "learning_rate": 9.626816368081595e-05, "loss": 1.7297, "num_input_tokens_seen": 714260480, "step": 10900, "train_runtime": 161906.5229, "train_tokens_per_second": 4411.561 }, { "epoch": 2.8810246929882477, "grad_norm": 0.353515625, "learning_rate": 9.584698662584404e-05, "loss": 1.7523, "num_input_tokens_seen": 714915840, "step": 10910, "train_runtime": 162092.9704, "train_tokens_per_second": 4410.53 }, { "epoch": 2.8836656542981647, "grad_norm": 0.35546875, "learning_rate": 9.542651431366231e-05, "loss": 1.7262, "num_input_tokens_seen": 715571200, "step": 10920, "train_runtime": 162275.9244, "train_tokens_per_second": 4409.596 }, { "epoch": 2.886306615608081, "grad_norm": 0.357421875, "learning_rate": 9.500674866654768e-05, "loss": 1.7127, "num_input_tokens_seen": 716226560, "step": 10930, "train_runtime": 162462.9486, "train_tokens_per_second": 4408.553 }, { "epoch": 2.888947576917998, "grad_norm": 0.3671875, "learning_rate": 9.4587691603547e-05, "loss": 1.7462, "num_input_tokens_seen": 716881920, "step": 10940, "train_runtime": 162647.4856, "train_tokens_per_second": 4407.581 }, { "epoch": 2.891588538227915, "grad_norm": 0.341796875, "learning_rate": 9.416934504046725e-05, "loss": 1.7307, "num_input_tokens_seen": 717537280, "step": 10950, "train_runtime": 162831.6973, "train_tokens_per_second": 4406.619 }, { "epoch": 2.8942294995378317, "grad_norm": 0.37109375, "learning_rate": 9.375171088986747e-05, "loss": 1.6707, "num_input_tokens_seen": 718192640, "step": 10960, "train_runtime": 163015.2947, "train_tokens_per_second": 4405.676 }, { "epoch": 2.8968704608477487, "grad_norm": 0.36328125, "learning_rate": 9.333479106104954e-05, "loss": 1.7345, "num_input_tokens_seen": 718848000, "step": 10970, "train_runtime": 163200.4999, "train_tokens_per_second": 4404.692 }, { "epoch": 2.8995114221576657, "grad_norm": 0.359375, "learning_rate": 9.291858746004995e-05, "loss": 1.7042, "num_input_tokens_seen": 719503360, "step": 10980, "train_runtime": 163398.6307, "train_tokens_per_second": 4403.362 }, { "epoch": 2.902152383467582, "grad_norm": 0.359375, "learning_rate": 9.250310198963052e-05, "loss": 1.7219, "num_input_tokens_seen": 720158720, "step": 10990, "train_runtime": 163604.6978, "train_tokens_per_second": 4401.822 }, { "epoch": 2.904793344777499, "grad_norm": 0.369140625, "learning_rate": 9.208833654927019e-05, "loss": 1.6894, "num_input_tokens_seen": 720814080, "step": 11000, "train_runtime": 163811.6561, "train_tokens_per_second": 4400.261 }, { "epoch": 2.9074343060874157, "grad_norm": 0.341796875, "learning_rate": 9.167429303515596e-05, "loss": 1.6541, "num_input_tokens_seen": 721469440, "step": 11010, "train_runtime": 164028.1648, "train_tokens_per_second": 4398.449 }, { "epoch": 2.9100752673973327, "grad_norm": 0.373046875, "learning_rate": 9.126097334017447e-05, "loss": 1.6768, "num_input_tokens_seen": 722124800, "step": 11020, "train_runtime": 164235.0251, "train_tokens_per_second": 4396.899 }, { "epoch": 2.912716228707249, "grad_norm": 0.34375, "learning_rate": 9.084837935390347e-05, "loss": 1.6783, "num_input_tokens_seen": 722780160, "step": 11030, "train_runtime": 164441.5491, "train_tokens_per_second": 4395.362 }, { "epoch": 2.915357190017166, "grad_norm": 0.412109375, "learning_rate": 9.043651296260253e-05, "loss": 1.7177, "num_input_tokens_seen": 723435520, "step": 11040, "train_runtime": 164647.0842, "train_tokens_per_second": 4393.856 }, { "epoch": 2.917998151327083, "grad_norm": 0.353515625, "learning_rate": 9.00253760492053e-05, "loss": 1.7235, "num_input_tokens_seen": 724090880, "step": 11050, "train_runtime": 164853.7354, "train_tokens_per_second": 4392.323 }, { "epoch": 2.9206391126369997, "grad_norm": 0.373046875, "learning_rate": 8.961497049331027e-05, "loss": 1.7528, "num_input_tokens_seen": 724746240, "step": 11060, "train_runtime": 165061.0799, "train_tokens_per_second": 4390.776 }, { "epoch": 2.9232800739469167, "grad_norm": 0.3671875, "learning_rate": 8.920529817117237e-05, "loss": 1.7676, "num_input_tokens_seen": 725401600, "step": 11070, "train_runtime": 165266.6018, "train_tokens_per_second": 4389.281 }, { "epoch": 2.9259210352568337, "grad_norm": 0.359375, "learning_rate": 8.879636095569438e-05, "loss": 1.6574, "num_input_tokens_seen": 726056960, "step": 11080, "train_runtime": 165472.1923, "train_tokens_per_second": 4387.788 }, { "epoch": 2.92856199656675, "grad_norm": 0.34765625, "learning_rate": 8.838816071641856e-05, "loss": 1.6925, "num_input_tokens_seen": 726712320, "step": 11090, "train_runtime": 165676.9164, "train_tokens_per_second": 4386.322 }, { "epoch": 2.931202957876667, "grad_norm": 0.3515625, "learning_rate": 8.79806993195178e-05, "loss": 1.7012, "num_input_tokens_seen": 727367680, "step": 11100, "train_runtime": 165880.6202, "train_tokens_per_second": 4384.886 }, { "epoch": 2.933843919186584, "grad_norm": 0.361328125, "learning_rate": 8.757397862778704e-05, "loss": 1.7296, "num_input_tokens_seen": 728023040, "step": 11110, "train_runtime": 166086.5299, "train_tokens_per_second": 4383.396 }, { "epoch": 2.9364848804965007, "grad_norm": 0.353515625, "learning_rate": 8.716800050063545e-05, "loss": 1.6135, "num_input_tokens_seen": 728678400, "step": 11120, "train_runtime": 166291.6164, "train_tokens_per_second": 4381.931 }, { "epoch": 2.9391258418064177, "grad_norm": 0.384765625, "learning_rate": 8.676276679407671e-05, "loss": 1.7623, "num_input_tokens_seen": 729333760, "step": 11130, "train_runtime": 166498.2747, "train_tokens_per_second": 4380.428 }, { "epoch": 2.941766803116334, "grad_norm": 0.357421875, "learning_rate": 8.635827936072183e-05, "loss": 1.7493, "num_input_tokens_seen": 729989120, "step": 11140, "train_runtime": 166703.3736, "train_tokens_per_second": 4378.97 }, { "epoch": 2.944407764426251, "grad_norm": 0.349609375, "learning_rate": 8.595454004976977e-05, "loss": 1.6727, "num_input_tokens_seen": 730644480, "step": 11150, "train_runtime": 166908.2835, "train_tokens_per_second": 4377.521 }, { "epoch": 2.9470487257361677, "grad_norm": 0.34375, "learning_rate": 8.555155070699935e-05, "loss": 1.6715, "num_input_tokens_seen": 731299840, "step": 11160, "train_runtime": 167114.1981, "train_tokens_per_second": 4376.049 }, { "epoch": 2.9496896870460847, "grad_norm": 0.35546875, "learning_rate": 8.514931317476076e-05, "loss": 1.6644, "num_input_tokens_seen": 731955200, "step": 11170, "train_runtime": 167319.6064, "train_tokens_per_second": 4374.593 }, { "epoch": 2.9523306483560017, "grad_norm": 0.359375, "learning_rate": 8.474782929196705e-05, "loss": 1.6907, "num_input_tokens_seen": 732610560, "step": 11180, "train_runtime": 167524.7882, "train_tokens_per_second": 4373.147 }, { "epoch": 2.954971609665918, "grad_norm": 0.37109375, "learning_rate": 8.434710089408609e-05, "loss": 1.6984, "num_input_tokens_seen": 733265920, "step": 11190, "train_runtime": 167728.7061, "train_tokens_per_second": 4371.738 }, { "epoch": 2.957612570975835, "grad_norm": 0.341796875, "learning_rate": 8.394712981313155e-05, "loss": 1.7113, "num_input_tokens_seen": 733921280, "step": 11200, "train_runtime": 167932.659, "train_tokens_per_second": 4370.331 }, { "epoch": 2.960253532285752, "grad_norm": 0.36328125, "learning_rate": 8.354791787765503e-05, "loss": 1.7431, "num_input_tokens_seen": 734576640, "step": 11210, "train_runtime": 168135.7357, "train_tokens_per_second": 4368.95 }, { "epoch": 2.9628944935956687, "grad_norm": 0.34765625, "learning_rate": 8.314946691273742e-05, "loss": 1.6599, "num_input_tokens_seen": 735232000, "step": 11220, "train_runtime": 168338.9372, "train_tokens_per_second": 4367.569 }, { "epoch": 2.9655354549055857, "grad_norm": 0.36328125, "learning_rate": 8.275177873998105e-05, "loss": 1.7044, "num_input_tokens_seen": 735887360, "step": 11230, "train_runtime": 168541.4777, "train_tokens_per_second": 4366.209 }, { "epoch": 2.9681764162155027, "grad_norm": 0.3515625, "learning_rate": 8.235485517750032e-05, "loss": 1.733, "num_input_tokens_seen": 736542720, "step": 11240, "train_runtime": 168744.6864, "train_tokens_per_second": 4364.835 }, { "epoch": 2.970817377525419, "grad_norm": 0.369140625, "learning_rate": 8.19586980399147e-05, "loss": 1.7647, "num_input_tokens_seen": 737198080, "step": 11250, "train_runtime": 168948.3139, "train_tokens_per_second": 4363.453 }, { "epoch": 2.973458338835336, "grad_norm": 0.359375, "learning_rate": 8.156330913833948e-05, "loss": 1.7157, "num_input_tokens_seen": 737853440, "step": 11260, "train_runtime": 169151.1825, "train_tokens_per_second": 4362.094 }, { "epoch": 2.976099300145253, "grad_norm": 0.369140625, "learning_rate": 8.116869028037774e-05, "loss": 1.7479, "num_input_tokens_seen": 738508800, "step": 11270, "train_runtime": 169354.1482, "train_tokens_per_second": 4360.736 }, { "epoch": 2.9787402614551697, "grad_norm": 0.35546875, "learning_rate": 8.077484327011248e-05, "loss": 1.7005, "num_input_tokens_seen": 739164160, "step": 11280, "train_runtime": 169557.8097, "train_tokens_per_second": 4359.364 }, { "epoch": 2.981381222765086, "grad_norm": 0.359375, "learning_rate": 8.038176990809748e-05, "loss": 1.7275, "num_input_tokens_seen": 739819520, "step": 11290, "train_runtime": 169762.6635, "train_tokens_per_second": 4357.964 }, { "epoch": 2.984022184075003, "grad_norm": 0.349609375, "learning_rate": 7.998947199135017e-05, "loss": 1.7322, "num_input_tokens_seen": 740474880, "step": 11300, "train_runtime": 169964.0397, "train_tokens_per_second": 4356.656 }, { "epoch": 2.98666314538492, "grad_norm": 0.34765625, "learning_rate": 7.959795131334263e-05, "loss": 1.708, "num_input_tokens_seen": 741130240, "step": 11310, "train_runtime": 170169.0421, "train_tokens_per_second": 4355.259 }, { "epoch": 2.9893041066948367, "grad_norm": 0.345703125, "learning_rate": 7.920720966399361e-05, "loss": 1.6864, "num_input_tokens_seen": 741785600, "step": 11320, "train_runtime": 170374.6853, "train_tokens_per_second": 4353.849 }, { "epoch": 2.9919450680047537, "grad_norm": 0.359375, "learning_rate": 7.881724882966031e-05, "loss": 1.7189, "num_input_tokens_seen": 742440960, "step": 11330, "train_runtime": 170579.1932, "train_tokens_per_second": 4352.471 }, { "epoch": 2.9945860293146707, "grad_norm": 0.3984375, "learning_rate": 7.842807059313056e-05, "loss": 1.6928, "num_input_tokens_seen": 743096320, "step": 11340, "train_runtime": 170783.4647, "train_tokens_per_second": 4351.102 }, { "epoch": 2.997226990624587, "grad_norm": 0.353515625, "learning_rate": 7.80396767336141e-05, "loss": 1.6561, "num_input_tokens_seen": 743751680, "step": 11350, "train_runtime": 170988.5409, "train_tokens_per_second": 4349.717 }, { "epoch": 2.999867951934504, "grad_norm": 0.353515625, "learning_rate": 7.765206902673478e-05, "loss": 1.6885, "num_input_tokens_seen": 744407040, "step": 11360, "train_runtime": 171193.267, "train_tokens_per_second": 4348.343 }, { "epoch": 3.0023768651789253, "grad_norm": 0.48828125, "learning_rate": 7.726524924452247e-05, "loss": 1.366, "num_input_tokens_seen": 745021440, "step": 11370, "train_runtime": 171385.9885, "train_tokens_per_second": 4347.038 }, { "epoch": 3.005017826488842, "grad_norm": 0.50390625, "learning_rate": 7.687921915540469e-05, "loss": 1.2321, "num_input_tokens_seen": 745676800, "step": 11380, "train_runtime": 171592.3094, "train_tokens_per_second": 4345.631 }, { "epoch": 3.007658787798759, "grad_norm": 0.42578125, "learning_rate": 7.649398052419918e-05, "loss": 1.2529, "num_input_tokens_seen": 746332160, "step": 11390, "train_runtime": 171795.8175, "train_tokens_per_second": 4344.298 }, { "epoch": 3.0102997491086754, "grad_norm": 0.421875, "learning_rate": 7.610953511210461e-05, "loss": 1.2261, "num_input_tokens_seen": 746987520, "step": 11400, "train_runtime": 172001.2238, "train_tokens_per_second": 4342.92 }, { "epoch": 3.0129407104185923, "grad_norm": 0.447265625, "learning_rate": 7.572588467669403e-05, "loss": 1.237, "num_input_tokens_seen": 747642880, "step": 11410, "train_runtime": 172206.9653, "train_tokens_per_second": 4341.537 }, { "epoch": 3.0155816717285093, "grad_norm": 0.4453125, "learning_rate": 7.534303097190565e-05, "loss": 1.1882, "num_input_tokens_seen": 748298240, "step": 11420, "train_runtime": 172412.2409, "train_tokens_per_second": 4340.169 }, { "epoch": 3.018222633038426, "grad_norm": 0.447265625, "learning_rate": 7.49609757480354e-05, "loss": 1.1744, "num_input_tokens_seen": 748953600, "step": 11430, "train_runtime": 172617.4078, "train_tokens_per_second": 4338.807 }, { "epoch": 3.020863594348343, "grad_norm": 0.4296875, "learning_rate": 7.457972075172864e-05, "loss": 1.2523, "num_input_tokens_seen": 749608960, "step": 11440, "train_runtime": 172822.1584, "train_tokens_per_second": 4337.459 }, { "epoch": 3.02350455565826, "grad_norm": 0.435546875, "learning_rate": 7.419926772597266e-05, "loss": 1.2435, "num_input_tokens_seen": 750264320, "step": 11450, "train_runtime": 173026.6398, "train_tokens_per_second": 4336.12 }, { "epoch": 3.0261455169681764, "grad_norm": 0.458984375, "learning_rate": 7.381961841008802e-05, "loss": 1.2082, "num_input_tokens_seen": 750919680, "step": 11460, "train_runtime": 173229.1934, "train_tokens_per_second": 4334.833 }, { "epoch": 3.0287864782780933, "grad_norm": 0.498046875, "learning_rate": 7.344077453972106e-05, "loss": 1.1764, "num_input_tokens_seen": 751575040, "step": 11470, "train_runtime": 173434.0962, "train_tokens_per_second": 4333.491 }, { "epoch": 3.03142743958801, "grad_norm": 0.451171875, "learning_rate": 7.306273784683609e-05, "loss": 1.1778, "num_input_tokens_seen": 752230400, "step": 11480, "train_runtime": 173639.6384, "train_tokens_per_second": 4332.135 }, { "epoch": 3.034068400897927, "grad_norm": 0.435546875, "learning_rate": 7.268551005970672e-05, "loss": 1.2463, "num_input_tokens_seen": 752885760, "step": 11490, "train_runtime": 173844.5667, "train_tokens_per_second": 4330.798 }, { "epoch": 3.036709362207844, "grad_norm": 0.466796875, "learning_rate": 7.230909290290916e-05, "loss": 1.2593, "num_input_tokens_seen": 753541120, "step": 11500, "train_runtime": 174048.9046, "train_tokens_per_second": 4329.479 }, { "epoch": 3.0393503235177604, "grad_norm": 0.478515625, "learning_rate": 7.19334880973129e-05, "loss": 1.2332, "num_input_tokens_seen": 754196480, "step": 11510, "train_runtime": 174262.4978, "train_tokens_per_second": 4327.933 }, { "epoch": 3.0419912848276773, "grad_norm": 0.455078125, "learning_rate": 7.155869736007428e-05, "loss": 1.1999, "num_input_tokens_seen": 754851840, "step": 11520, "train_runtime": 174466.1751, "train_tokens_per_second": 4326.637 }, { "epoch": 3.044632246137594, "grad_norm": 0.46875, "learning_rate": 7.118472240462753e-05, "loss": 1.1875, "num_input_tokens_seen": 755507200, "step": 11530, "train_runtime": 174667.5255, "train_tokens_per_second": 4325.402 }, { "epoch": 3.047273207447511, "grad_norm": 0.451171875, "learning_rate": 7.081156494067747e-05, "loss": 1.2414, "num_input_tokens_seen": 756162560, "step": 11540, "train_runtime": 174869.0498, "train_tokens_per_second": 4324.165 }, { "epoch": 3.049914168757428, "grad_norm": 0.47265625, "learning_rate": 7.043922667419173e-05, "loss": 1.2012, "num_input_tokens_seen": 756817920, "step": 11550, "train_runtime": 175072.7246, "train_tokens_per_second": 4322.877 }, { "epoch": 3.0525551300673444, "grad_norm": 0.46875, "learning_rate": 7.006770930739263e-05, "loss": 1.2022, "num_input_tokens_seen": 757473280, "step": 11560, "train_runtime": 175275.7965, "train_tokens_per_second": 4321.608 }, { "epoch": 3.0551960913772613, "grad_norm": 0.466796875, "learning_rate": 6.96970145387496e-05, "loss": 1.1891, "num_input_tokens_seen": 758128640, "step": 11570, "train_runtime": 175477.3259, "train_tokens_per_second": 4320.379 }, { "epoch": 3.0578370526871783, "grad_norm": 0.484375, "learning_rate": 6.932714406297136e-05, "loss": 1.2574, "num_input_tokens_seen": 758784000, "step": 11580, "train_runtime": 175679.6596, "train_tokens_per_second": 4319.134 }, { "epoch": 3.060478013997095, "grad_norm": 0.44921875, "learning_rate": 6.89580995709985e-05, "loss": 1.2055, "num_input_tokens_seen": 759439360, "step": 11590, "train_runtime": 175881.5726, "train_tokens_per_second": 4317.902 }, { "epoch": 3.063118975307012, "grad_norm": 0.443359375, "learning_rate": 6.858988274999492e-05, "loss": 1.2026, "num_input_tokens_seen": 760094720, "step": 11600, "train_runtime": 176083.1278, "train_tokens_per_second": 4316.681 }, { "epoch": 3.0657599366169284, "grad_norm": 0.462890625, "learning_rate": 6.822249528334115e-05, "loss": 1.2139, "num_input_tokens_seen": 760750080, "step": 11610, "train_runtime": 176285.2639, "train_tokens_per_second": 4315.449 }, { "epoch": 3.0684008979268453, "grad_norm": 0.474609375, "learning_rate": 6.785593885062588e-05, "loss": 1.1767, "num_input_tokens_seen": 761405440, "step": 11620, "train_runtime": 176488.0127, "train_tokens_per_second": 4314.205 }, { "epoch": 3.0710418592367623, "grad_norm": 0.8125, "learning_rate": 6.749021512763856e-05, "loss": 1.2371, "num_input_tokens_seen": 762060800, "step": 11630, "train_runtime": 176689.3264, "train_tokens_per_second": 4312.999 }, { "epoch": 3.073682820546679, "grad_norm": 0.4609375, "learning_rate": 6.712532578636199e-05, "loss": 1.2089, "num_input_tokens_seen": 762716160, "step": 11640, "train_runtime": 176892.066, "train_tokens_per_second": 4311.76 }, { "epoch": 3.076323781856596, "grad_norm": 0.47265625, "learning_rate": 6.676127249496396e-05, "loss": 1.19, "num_input_tokens_seen": 763371520, "step": 11650, "train_runtime": 177095.7753, "train_tokens_per_second": 4310.501 }, { "epoch": 3.078964743166513, "grad_norm": 0.44921875, "learning_rate": 6.639805691779057e-05, "loss": 1.2058, "num_input_tokens_seen": 764026880, "step": 11660, "train_runtime": 177299.3681, "train_tokens_per_second": 4309.248 }, { "epoch": 3.0816057044764293, "grad_norm": 0.52734375, "learning_rate": 6.603568071535782e-05, "loss": 1.2047, "num_input_tokens_seen": 764682240, "step": 11670, "train_runtime": 177500.3616, "train_tokens_per_second": 4308.06 }, { "epoch": 3.0842466657863463, "grad_norm": 0.455078125, "learning_rate": 6.567414554434442e-05, "loss": 1.1677, "num_input_tokens_seen": 765337600, "step": 11680, "train_runtime": 177703.4506, "train_tokens_per_second": 4306.825 }, { "epoch": 3.086887627096263, "grad_norm": 0.478515625, "learning_rate": 6.531345305758405e-05, "loss": 1.2848, "num_input_tokens_seen": 765992960, "step": 11690, "train_runtime": 177906.3446, "train_tokens_per_second": 4305.597 }, { "epoch": 3.08952858840618, "grad_norm": 0.4609375, "learning_rate": 6.495360490405816e-05, "loss": 1.2277, "num_input_tokens_seen": 766648320, "step": 11700, "train_runtime": 178109.5755, "train_tokens_per_second": 4304.363 }, { "epoch": 3.092169549716097, "grad_norm": 0.4375, "learning_rate": 6.459460272888781e-05, "loss": 1.1662, "num_input_tokens_seen": 767303680, "step": 11710, "train_runtime": 178311.4916, "train_tokens_per_second": 4303.164 }, { "epoch": 3.0948105110260133, "grad_norm": 0.498046875, "learning_rate": 6.423644817332666e-05, "loss": 1.1641, "num_input_tokens_seen": 767959040, "step": 11720, "train_runtime": 178513.9369, "train_tokens_per_second": 4301.956 }, { "epoch": 3.0974514723359303, "grad_norm": 0.4765625, "learning_rate": 6.387914287475344e-05, "loss": 1.1757, "num_input_tokens_seen": 768614400, "step": 11730, "train_runtime": 178718.9456, "train_tokens_per_second": 4300.688 }, { "epoch": 3.1000924336458473, "grad_norm": 0.478515625, "learning_rate": 6.352268846666387e-05, "loss": 1.2153, "num_input_tokens_seen": 769269760, "step": 11740, "train_runtime": 178919.3015, "train_tokens_per_second": 4299.535 }, { "epoch": 3.102733394955764, "grad_norm": 0.458984375, "learning_rate": 6.316708657866427e-05, "loss": 1.1833, "num_input_tokens_seen": 769925120, "step": 11750, "train_runtime": 179121.932, "train_tokens_per_second": 4298.33 }, { "epoch": 3.105374356265681, "grad_norm": 0.466796875, "learning_rate": 6.281233883646282e-05, "loss": 1.1648, "num_input_tokens_seen": 770580480, "step": 11760, "train_runtime": 179322.9721, "train_tokens_per_second": 4297.165 }, { "epoch": 3.1080153175755973, "grad_norm": 0.462890625, "learning_rate": 6.24584468618634e-05, "loss": 1.2042, "num_input_tokens_seen": 771235840, "step": 11770, "train_runtime": 179526.3532, "train_tokens_per_second": 4295.948 }, { "epoch": 3.1106562788855143, "grad_norm": 0.46875, "learning_rate": 6.210541227275715e-05, "loss": 1.1574, "num_input_tokens_seen": 771891200, "step": 11780, "train_runtime": 179730.1261, "train_tokens_per_second": 4294.724 }, { "epoch": 3.1132972401954313, "grad_norm": 0.486328125, "learning_rate": 6.175323668311564e-05, "loss": 1.1767, "num_input_tokens_seen": 772546560, "step": 11790, "train_runtime": 179934.1076, "train_tokens_per_second": 4293.497 }, { "epoch": 3.115938201505348, "grad_norm": 0.46875, "learning_rate": 6.140192170298347e-05, "loss": 1.1836, "num_input_tokens_seen": 773201920, "step": 11800, "train_runtime": 180138.7366, "train_tokens_per_second": 4292.258 }, { "epoch": 3.118579162815265, "grad_norm": 0.462890625, "learning_rate": 6.105146893847061e-05, "loss": 1.242, "num_input_tokens_seen": 773857280, "step": 11810, "train_runtime": 180342.6868, "train_tokens_per_second": 4291.038 }, { "epoch": 3.1212201241251813, "grad_norm": 0.515625, "learning_rate": 6.070187999174523e-05, "loss": 1.2258, "num_input_tokens_seen": 774512640, "step": 11820, "train_runtime": 180546.2469, "train_tokens_per_second": 4289.83 }, { "epoch": 3.1238610854350983, "grad_norm": 0.458984375, "learning_rate": 6.0353156461026375e-05, "loss": 1.2256, "num_input_tokens_seen": 775168000, "step": 11830, "train_runtime": 180747.8866, "train_tokens_per_second": 4288.67 }, { "epoch": 3.1265020467450153, "grad_norm": 0.46484375, "learning_rate": 6.000529994057693e-05, "loss": 1.1685, "num_input_tokens_seen": 775823360, "step": 11840, "train_runtime": 180951.0813, "train_tokens_per_second": 4287.476 }, { "epoch": 3.129143008054932, "grad_norm": 0.48828125, "learning_rate": 5.9658312020695546e-05, "loss": 1.2126, "num_input_tokens_seen": 776478720, "step": 11850, "train_runtime": 181155.4595, "train_tokens_per_second": 4286.256 }, { "epoch": 3.131783969364849, "grad_norm": 0.490234375, "learning_rate": 5.931219428771051e-05, "loss": 1.1811, "num_input_tokens_seen": 777134080, "step": 11860, "train_runtime": 181359.9392, "train_tokens_per_second": 4285.037 }, { "epoch": 3.134424930674766, "grad_norm": 0.451171875, "learning_rate": 5.8966948323971174e-05, "loss": 1.1746, "num_input_tokens_seen": 777789440, "step": 11870, "train_runtime": 181564.7329, "train_tokens_per_second": 4283.813 }, { "epoch": 3.1370658919846823, "grad_norm": 0.5078125, "learning_rate": 5.8622575707842044e-05, "loss": 1.1834, "num_input_tokens_seen": 778444800, "step": 11880, "train_runtime": 181768.6004, "train_tokens_per_second": 4282.614 }, { "epoch": 3.1397068532945993, "grad_norm": 0.478515625, "learning_rate": 5.8279078013694614e-05, "loss": 1.1937, "num_input_tokens_seen": 779100160, "step": 11890, "train_runtime": 181971.0491, "train_tokens_per_second": 4281.451 }, { "epoch": 3.142347814604516, "grad_norm": 0.46875, "learning_rate": 5.793645681190041e-05, "loss": 1.152, "num_input_tokens_seen": 779755520, "step": 11900, "train_runtime": 182172.5015, "train_tokens_per_second": 4280.314 }, { "epoch": 3.144988775914433, "grad_norm": 0.466796875, "learning_rate": 5.759471366882421e-05, "loss": 1.1854, "num_input_tokens_seen": 780410880, "step": 11910, "train_runtime": 182375.3746, "train_tokens_per_second": 4279.146 }, { "epoch": 3.14762973722435, "grad_norm": 0.4453125, "learning_rate": 5.72538501468163e-05, "loss": 1.2418, "num_input_tokens_seen": 781066240, "step": 11920, "train_runtime": 182577.484, "train_tokens_per_second": 4277.999 }, { "epoch": 3.1502706985342663, "grad_norm": 0.52734375, "learning_rate": 5.6913867804205654e-05, "loss": 1.2272, "num_input_tokens_seen": 781721600, "step": 11930, "train_runtime": 182777.1398, "train_tokens_per_second": 4276.911 }, { "epoch": 3.1529116598441833, "grad_norm": 0.44921875, "learning_rate": 5.65747681952927e-05, "loss": 1.2006, "num_input_tokens_seen": 782376960, "step": 11940, "train_runtime": 182981.0444, "train_tokens_per_second": 4275.727 }, { "epoch": 3.1555526211541003, "grad_norm": 0.478515625, "learning_rate": 5.623655287034255e-05, "loss": 1.2469, "num_input_tokens_seen": 783032320, "step": 11950, "train_runtime": 183183.1595, "train_tokens_per_second": 4274.587 }, { "epoch": 3.158193582464017, "grad_norm": 0.51171875, "learning_rate": 5.5899223375577124e-05, "loss": 1.1927, "num_input_tokens_seen": 783687680, "step": 11960, "train_runtime": 183384.8108, "train_tokens_per_second": 4273.46 }, { "epoch": 3.160834543773934, "grad_norm": 0.46875, "learning_rate": 5.5562781253169045e-05, "loss": 1.2217, "num_input_tokens_seen": 784343040, "step": 11970, "train_runtime": 183588.1507, "train_tokens_per_second": 4272.297 }, { "epoch": 3.1634755050838503, "grad_norm": 0.439453125, "learning_rate": 5.5227228041233925e-05, "loss": 1.2154, "num_input_tokens_seen": 784998400, "step": 11980, "train_runtime": 183791.6363, "train_tokens_per_second": 4271.132 }, { "epoch": 3.1661164663937673, "grad_norm": 0.458984375, "learning_rate": 5.489256527382344e-05, "loss": 1.2052, "num_input_tokens_seen": 785653760, "step": 11990, "train_runtime": 183993.7186, "train_tokens_per_second": 4270.003 }, { "epoch": 3.1687574277036843, "grad_norm": 0.494140625, "learning_rate": 5.45587944809188e-05, "loss": 1.2221, "num_input_tokens_seen": 786309120, "step": 12000, "train_runtime": 184196.8061, "train_tokens_per_second": 4268.853 }, { "epoch": 3.171398389013601, "grad_norm": 0.447265625, "learning_rate": 5.422591718842276e-05, "loss": 1.1988, "num_input_tokens_seen": 786964480, "step": 12010, "train_runtime": 184403.0738, "train_tokens_per_second": 4267.632 }, { "epoch": 3.174039350323518, "grad_norm": 0.48046875, "learning_rate": 5.3893934918153807e-05, "loss": 1.208, "num_input_tokens_seen": 787619840, "step": 12020, "train_runtime": 184604.1963, "train_tokens_per_second": 4266.533 }, { "epoch": 3.176680311633435, "grad_norm": 0.470703125, "learning_rate": 5.356284918783841e-05, "loss": 1.175, "num_input_tokens_seen": 788275200, "step": 12030, "train_runtime": 184803.1735, "train_tokens_per_second": 4265.485 }, { "epoch": 3.1793212729433513, "grad_norm": 0.46484375, "learning_rate": 5.3232661511104284e-05, "loss": 1.2011, "num_input_tokens_seen": 788930560, "step": 12040, "train_runtime": 185004.3045, "train_tokens_per_second": 4264.39 }, { "epoch": 3.1819622342532683, "grad_norm": 0.48046875, "learning_rate": 5.2903373397473475e-05, "loss": 1.2469, "num_input_tokens_seen": 789585920, "step": 12050, "train_runtime": 185207.5872, "train_tokens_per_second": 4263.248 }, { "epoch": 3.184603195563185, "grad_norm": 0.470703125, "learning_rate": 5.2574986352355744e-05, "loss": 1.2128, "num_input_tokens_seen": 790241280, "step": 12060, "train_runtime": 185410.2119, "train_tokens_per_second": 4262.124 }, { "epoch": 3.187244156873102, "grad_norm": 0.50390625, "learning_rate": 5.224750187704119e-05, "loss": 1.2556, "num_input_tokens_seen": 790896640, "step": 12070, "train_runtime": 185612.4609, "train_tokens_per_second": 4261.01 }, { "epoch": 3.189885118183019, "grad_norm": 0.474609375, "learning_rate": 5.1920921468693596e-05, "loss": 1.1623, "num_input_tokens_seen": 791552000, "step": 12080, "train_runtime": 185816.1209, "train_tokens_per_second": 4259.867 }, { "epoch": 3.1925260794929353, "grad_norm": 0.474609375, "learning_rate": 5.15952466203439e-05, "loss": 1.1934, "num_input_tokens_seen": 792207360, "step": 12090, "train_runtime": 186019.6856, "train_tokens_per_second": 4258.729 }, { "epoch": 3.1951670408028523, "grad_norm": 0.462890625, "learning_rate": 5.1270478820882624e-05, "loss": 1.2061, "num_input_tokens_seen": 792862720, "step": 12100, "train_runtime": 186222.1638, "train_tokens_per_second": 4257.617 }, { "epoch": 3.197808002112769, "grad_norm": 0.486328125, "learning_rate": 5.0946619555054087e-05, "loss": 1.2467, "num_input_tokens_seen": 793518080, "step": 12110, "train_runtime": 186424.8323, "train_tokens_per_second": 4256.504 }, { "epoch": 3.200448963422686, "grad_norm": 0.490234375, "learning_rate": 5.062367030344847e-05, "loss": 1.2435, "num_input_tokens_seen": 794173440, "step": 12120, "train_runtime": 186626.2296, "train_tokens_per_second": 4255.422 }, { "epoch": 3.203089924732603, "grad_norm": 0.49609375, "learning_rate": 5.0301632542496116e-05, "loss": 1.157, "num_input_tokens_seen": 794828800, "step": 12130, "train_runtime": 186828.4998, "train_tokens_per_second": 4254.323 }, { "epoch": 3.2057308860425193, "grad_norm": 0.478515625, "learning_rate": 4.998050774446003e-05, "loss": 1.2036, "num_input_tokens_seen": 795484160, "step": 12140, "train_runtime": 187032.7024, "train_tokens_per_second": 4253.182 }, { "epoch": 3.2083718473524363, "grad_norm": 0.490234375, "learning_rate": 4.9660297377429467e-05, "loss": 1.1836, "num_input_tokens_seen": 796139520, "step": 12150, "train_runtime": 187236.0655, "train_tokens_per_second": 4252.063 }, { "epoch": 3.2110128086623533, "grad_norm": 0.54296875, "learning_rate": 4.9341002905313266e-05, "loss": 1.237, "num_input_tokens_seen": 796794880, "step": 12160, "train_runtime": 187440.5407, "train_tokens_per_second": 4250.921 }, { "epoch": 3.21365376997227, "grad_norm": 0.490234375, "learning_rate": 4.902262578783298e-05, "loss": 1.1889, "num_input_tokens_seen": 797450240, "step": 12170, "train_runtime": 187637.9299, "train_tokens_per_second": 4249.942 }, { "epoch": 3.216294731282187, "grad_norm": 0.48828125, "learning_rate": 4.870516748051623e-05, "loss": 1.2064, "num_input_tokens_seen": 798105600, "step": 12180, "train_runtime": 187836.5977, "train_tokens_per_second": 4248.936 }, { "epoch": 3.2189356925921033, "grad_norm": 0.451171875, "learning_rate": 4.8388629434690165e-05, "loss": 1.1945, "num_input_tokens_seen": 798760960, "step": 12190, "train_runtime": 188035.2611, "train_tokens_per_second": 4247.932 }, { "epoch": 3.2215766539020203, "grad_norm": 0.51953125, "learning_rate": 4.807301309747491e-05, "loss": 1.1948, "num_input_tokens_seen": 799416320, "step": 12200, "train_runtime": 188234.517, "train_tokens_per_second": 4246.917 }, { "epoch": 3.2242176152119373, "grad_norm": 0.4921875, "learning_rate": 4.77583199117764e-05, "loss": 1.2201, "num_input_tokens_seen": 800071680, "step": 12210, "train_runtime": 188434.3452, "train_tokens_per_second": 4245.891 }, { "epoch": 3.226858576521854, "grad_norm": 0.46484375, "learning_rate": 4.7444551316280695e-05, "loss": 1.2017, "num_input_tokens_seen": 800727040, "step": 12220, "train_runtime": 188636.5671, "train_tokens_per_second": 4244.813 }, { "epoch": 3.229499537831771, "grad_norm": 0.47265625, "learning_rate": 4.7131708745446534e-05, "loss": 1.1923, "num_input_tokens_seen": 801382400, "step": 12230, "train_runtime": 188837.9683, "train_tokens_per_second": 4243.757 }, { "epoch": 3.2321404991416878, "grad_norm": 0.447265625, "learning_rate": 4.6819793629499256e-05, "loss": 1.1951, "num_input_tokens_seen": 802037760, "step": 12240, "train_runtime": 189039.7027, "train_tokens_per_second": 4242.695 }, { "epoch": 3.2347814604516043, "grad_norm": 0.48046875, "learning_rate": 4.65088073944242e-05, "loss": 1.1761, "num_input_tokens_seen": 802693120, "step": 12250, "train_runtime": 189240.384, "train_tokens_per_second": 4241.659 }, { "epoch": 3.2374224217615213, "grad_norm": 0.490234375, "learning_rate": 4.619875146195995e-05, "loss": 1.1761, "num_input_tokens_seen": 803348480, "step": 12260, "train_runtime": 189439.481, "train_tokens_per_second": 4240.66 }, { "epoch": 3.240063383071438, "grad_norm": 0.48046875, "learning_rate": 4.5889627249592335e-05, "loss": 1.2408, "num_input_tokens_seen": 804003840, "step": 12270, "train_runtime": 189638.4453, "train_tokens_per_second": 4239.667 }, { "epoch": 3.242704344381355, "grad_norm": 0.462890625, "learning_rate": 4.5581436170547355e-05, "loss": 1.2056, "num_input_tokens_seen": 804659200, "step": 12280, "train_runtime": 189837.7164, "train_tokens_per_second": 4238.669 }, { "epoch": 3.2453453056912718, "grad_norm": 0.4765625, "learning_rate": 4.527417963378508e-05, "loss": 1.2204, "num_input_tokens_seen": 805314560, "step": 12290, "train_runtime": 190038.789, "train_tokens_per_second": 4237.633 }, { "epoch": 3.2479862670011883, "grad_norm": 0.458984375, "learning_rate": 4.4967859043993056e-05, "loss": 1.1911, "num_input_tokens_seen": 805969920, "step": 12300, "train_runtime": 190240.2099, "train_tokens_per_second": 4236.591 }, { "epoch": 3.2506272283111053, "grad_norm": 0.51953125, "learning_rate": 4.4662475801580224e-05, "loss": 1.2564, "num_input_tokens_seen": 806625280, "step": 12310, "train_runtime": 190442.6886, "train_tokens_per_second": 4235.528 }, { "epoch": 3.2532681896210223, "grad_norm": 0.462890625, "learning_rate": 4.435803130266977e-05, "loss": 1.216, "num_input_tokens_seen": 807280640, "step": 12320, "train_runtime": 190642.2658, "train_tokens_per_second": 4234.531 }, { "epoch": 3.255909150930939, "grad_norm": 0.4765625, "learning_rate": 4.40545269390937e-05, "loss": 1.199, "num_input_tokens_seen": 807936000, "step": 12330, "train_runtime": 190843.6231, "train_tokens_per_second": 4233.497 }, { "epoch": 3.258550112240856, "grad_norm": 0.4765625, "learning_rate": 4.375196409838575e-05, "loss": 1.192, "num_input_tokens_seen": 808591360, "step": 12340, "train_runtime": 191046.6976, "train_tokens_per_second": 4232.428 }, { "epoch": 3.2611910735507723, "grad_norm": 0.47265625, "learning_rate": 4.345034416377519e-05, "loss": 1.2505, "num_input_tokens_seen": 809246720, "step": 12350, "train_runtime": 191249.443, "train_tokens_per_second": 4231.368 }, { "epoch": 3.2638320348606893, "grad_norm": 0.5078125, "learning_rate": 4.314966851418098e-05, "loss": 1.1946, "num_input_tokens_seen": 809902080, "step": 12360, "train_runtime": 191453.933, "train_tokens_per_second": 4230.271 }, { "epoch": 3.2664729961706063, "grad_norm": 0.498046875, "learning_rate": 4.284993852420455e-05, "loss": 1.2053, "num_input_tokens_seen": 810557440, "step": 12370, "train_runtime": 191654.5173, "train_tokens_per_second": 4229.263 }, { "epoch": 3.269113957480523, "grad_norm": 0.478515625, "learning_rate": 4.25511555641245e-05, "loss": 1.2578, "num_input_tokens_seen": 811212800, "step": 12380, "train_runtime": 191851.853, "train_tokens_per_second": 4228.329 }, { "epoch": 3.27175491879044, "grad_norm": 0.51953125, "learning_rate": 4.225332099988971e-05, "loss": 1.2183, "num_input_tokens_seen": 811868160, "step": 12390, "train_runtime": 192052.9899, "train_tokens_per_second": 4227.313 }, { "epoch": 3.2743958801003563, "grad_norm": 0.5, "learning_rate": 4.1956436193113235e-05, "loss": 1.16, "num_input_tokens_seen": 812523520, "step": 12400, "train_runtime": 192252.7378, "train_tokens_per_second": 4226.33 }, { "epoch": 3.2770368414102733, "grad_norm": 0.46484375, "learning_rate": 4.166050250106609e-05, "loss": 1.2374, "num_input_tokens_seen": 813178880, "step": 12410, "train_runtime": 192451.3782, "train_tokens_per_second": 4225.373 }, { "epoch": 3.2796778027201903, "grad_norm": 0.48046875, "learning_rate": 4.1365521276671284e-05, "loss": 1.2071, "num_input_tokens_seen": 813834240, "step": 12420, "train_runtime": 192649.8482, "train_tokens_per_second": 4224.422 }, { "epoch": 3.282318764030107, "grad_norm": 0.51953125, "learning_rate": 4.1071493868497205e-05, "loss": 1.1917, "num_input_tokens_seen": 814489600, "step": 12430, "train_runtime": 192850.3521, "train_tokens_per_second": 4223.428 }, { "epoch": 3.284959725340024, "grad_norm": 0.453125, "learning_rate": 4.077842162075174e-05, "loss": 1.2022, "num_input_tokens_seen": 815144960, "step": 12440, "train_runtime": 193052.5646, "train_tokens_per_second": 4222.399 }, { "epoch": 3.2876006866499408, "grad_norm": 0.515625, "learning_rate": 4.048630587327603e-05, "loss": 1.2061, "num_input_tokens_seen": 815800320, "step": 12450, "train_runtime": 193253.9883, "train_tokens_per_second": 4221.389 }, { "epoch": 3.2902416479598573, "grad_norm": 0.458984375, "learning_rate": 4.0195147961538364e-05, "loss": 1.2372, "num_input_tokens_seen": 816455680, "step": 12460, "train_runtime": 193452.2257, "train_tokens_per_second": 4220.451 }, { "epoch": 3.2928826092697743, "grad_norm": 0.466796875, "learning_rate": 3.990494921662832e-05, "loss": 1.1781, "num_input_tokens_seen": 817111040, "step": 12470, "train_runtime": 193650.1728, "train_tokens_per_second": 4219.521 }, { "epoch": 3.2955235705796913, "grad_norm": 0.462890625, "learning_rate": 3.961571096524996e-05, "loss": 1.2021, "num_input_tokens_seen": 817766400, "step": 12480, "train_runtime": 193850.9784, "train_tokens_per_second": 4218.531 }, { "epoch": 3.298164531889608, "grad_norm": 0.5, "learning_rate": 3.932743452971674e-05, "loss": 1.162, "num_input_tokens_seen": 818421760, "step": 12490, "train_runtime": 194051.3656, "train_tokens_per_second": 4217.552 }, { "epoch": 3.3008054931995248, "grad_norm": 0.482421875, "learning_rate": 3.904012122794473e-05, "loss": 1.2186, "num_input_tokens_seen": 819077120, "step": 12500, "train_runtime": 194250.6449, "train_tokens_per_second": 4216.599 }, { "epoch": 3.3034464545094413, "grad_norm": 0.4609375, "learning_rate": 3.875377237344677e-05, "loss": 1.2087, "num_input_tokens_seen": 819732480, "step": 12510, "train_runtime": 194460.5412, "train_tokens_per_second": 4215.418 }, { "epoch": 3.3060874158193583, "grad_norm": 0.48828125, "learning_rate": 3.846838927532678e-05, "loss": 1.1453, "num_input_tokens_seen": 820387840, "step": 12520, "train_runtime": 194659.8251, "train_tokens_per_second": 4214.469 }, { "epoch": 3.308728377129275, "grad_norm": 0.494140625, "learning_rate": 3.81839732382733e-05, "loss": 1.2011, "num_input_tokens_seen": 821043200, "step": 12530, "train_runtime": 194859.1382, "train_tokens_per_second": 4213.522 }, { "epoch": 3.311369338439192, "grad_norm": 0.455078125, "learning_rate": 3.79005255625538e-05, "loss": 1.1627, "num_input_tokens_seen": 821698560, "step": 12540, "train_runtime": 195058.5844, "train_tokens_per_second": 4212.573 }, { "epoch": 3.3140102997491088, "grad_norm": 0.46484375, "learning_rate": 3.7618047544008686e-05, "loss": 1.1898, "num_input_tokens_seen": 822353920, "step": 12550, "train_runtime": 195256.4063, "train_tokens_per_second": 4211.662 }, { "epoch": 3.3166512610590253, "grad_norm": 0.49609375, "learning_rate": 3.73365404740455e-05, "loss": 1.2063, "num_input_tokens_seen": 823009280, "step": 12560, "train_runtime": 195456.346, "train_tokens_per_second": 4210.706 }, { "epoch": 3.3192922223689423, "grad_norm": 0.451171875, "learning_rate": 3.70560056396326e-05, "loss": 1.245, "num_input_tokens_seen": 823664640, "step": 12570, "train_runtime": 195655.7369, "train_tokens_per_second": 4209.765 }, { "epoch": 3.3219331836788593, "grad_norm": 0.466796875, "learning_rate": 3.677644432329391e-05, "loss": 1.226, "num_input_tokens_seen": 824320000, "step": 12580, "train_runtime": 195853.919, "train_tokens_per_second": 4208.851 }, { "epoch": 3.324574144988776, "grad_norm": 0.5, "learning_rate": 3.649785780310247e-05, "loss": 1.2331, "num_input_tokens_seen": 824975360, "step": 12590, "train_runtime": 196052.8684, "train_tokens_per_second": 4207.923 }, { "epoch": 3.3272151062986928, "grad_norm": 0.484375, "learning_rate": 3.62202473526749e-05, "loss": 1.1965, "num_input_tokens_seen": 825630720, "step": 12600, "train_runtime": 196252.9747, "train_tokens_per_second": 4206.972 }, { "epoch": 3.3298560676086097, "grad_norm": 0.474609375, "learning_rate": 3.5943614241165544e-05, "loss": 1.2003, "num_input_tokens_seen": 826286080, "step": 12610, "train_runtime": 196452.4579, "train_tokens_per_second": 4206.036 }, { "epoch": 3.3324970289185263, "grad_norm": 0.48828125, "learning_rate": 3.5667959733260565e-05, "loss": 1.2092, "num_input_tokens_seen": 826941440, "step": 12620, "train_runtime": 196651.6064, "train_tokens_per_second": 4205.109 }, { "epoch": 3.3351379902284433, "grad_norm": 0.4765625, "learning_rate": 3.539328508917239e-05, "loss": 1.208, "num_input_tokens_seen": 827596800, "step": 12630, "train_runtime": 196851.5946, "train_tokens_per_second": 4204.166 }, { "epoch": 3.33777895153836, "grad_norm": 0.478515625, "learning_rate": 3.511959156463362e-05, "loss": 1.2347, "num_input_tokens_seen": 828252160, "step": 12640, "train_runtime": 197049.8646, "train_tokens_per_second": 4203.262 }, { "epoch": 3.3404199128482768, "grad_norm": 0.494140625, "learning_rate": 3.484688041089157e-05, "loss": 1.2062, "num_input_tokens_seen": 828907520, "step": 12650, "train_runtime": 197250.6507, "train_tokens_per_second": 4202.306 }, { "epoch": 3.3430608741581938, "grad_norm": 0.484375, "learning_rate": 3.4575152874702284e-05, "loss": 1.2187, "num_input_tokens_seen": 829562880, "step": 12660, "train_runtime": 197450.127, "train_tokens_per_second": 4201.379 }, { "epoch": 3.3457018354681103, "grad_norm": 0.49609375, "learning_rate": 3.4304410198325335e-05, "loss": 1.2662, "num_input_tokens_seen": 830218240, "step": 12670, "train_runtime": 197649.6302, "train_tokens_per_second": 4200.454 }, { "epoch": 3.3483427967780273, "grad_norm": 0.53125, "learning_rate": 3.403465361951732e-05, "loss": 1.2491, "num_input_tokens_seen": 830873600, "step": 12680, "train_runtime": 197848.3821, "train_tokens_per_second": 4199.547 }, { "epoch": 3.350983758087944, "grad_norm": 0.462890625, "learning_rate": 3.3765884371527114e-05, "loss": 1.2065, "num_input_tokens_seen": 831528960, "step": 12690, "train_runtime": 198048.1863, "train_tokens_per_second": 4198.619 }, { "epoch": 3.3536247193978608, "grad_norm": 0.50390625, "learning_rate": 3.349810368308962e-05, "loss": 1.1948, "num_input_tokens_seen": 832184320, "step": 12700, "train_runtime": 198248.7732, "train_tokens_per_second": 4197.677 }, { "epoch": 3.3562656807077778, "grad_norm": 0.484375, "learning_rate": 3.323131277842023e-05, "loss": 1.2187, "num_input_tokens_seen": 832839680, "step": 12710, "train_runtime": 198450.305, "train_tokens_per_second": 4196.717 }, { "epoch": 3.3589066420176943, "grad_norm": 0.48828125, "learning_rate": 3.296551287720964e-05, "loss": 1.2602, "num_input_tokens_seen": 833495040, "step": 12720, "train_runtime": 198649.2565, "train_tokens_per_second": 4195.813 }, { "epoch": 3.3615476033276113, "grad_norm": 0.46875, "learning_rate": 3.270070519461754e-05, "loss": 1.1583, "num_input_tokens_seen": 834150400, "step": 12730, "train_runtime": 198848.4163, "train_tokens_per_second": 4194.906 }, { "epoch": 3.3641885646375282, "grad_norm": 0.47265625, "learning_rate": 3.2436890941267924e-05, "loss": 1.1927, "num_input_tokens_seen": 834805760, "step": 12740, "train_runtime": 199048.4254, "train_tokens_per_second": 4193.983 }, { "epoch": 3.366829525947445, "grad_norm": 0.46484375, "learning_rate": 3.217407132324279e-05, "loss": 1.1686, "num_input_tokens_seen": 835461120, "step": 12750, "train_runtime": 199249.215, "train_tokens_per_second": 4193.046 }, { "epoch": 3.3694704872573618, "grad_norm": 0.470703125, "learning_rate": 3.191224754207714e-05, "loss": 1.2457, "num_input_tokens_seen": 836116480, "step": 12760, "train_runtime": 199450.6316, "train_tokens_per_second": 4192.097 }, { "epoch": 3.3721114485672787, "grad_norm": 0.484375, "learning_rate": 3.165142079475314e-05, "loss": 1.2059, "num_input_tokens_seen": 836771840, "step": 12770, "train_runtime": 199649.8184, "train_tokens_per_second": 4191.198 }, { "epoch": 3.3747524098771953, "grad_norm": 0.46875, "learning_rate": 3.1391592273695005e-05, "loss": 1.2168, "num_input_tokens_seen": 837427200, "step": 12780, "train_runtime": 199848.8563, "train_tokens_per_second": 4190.303 }, { "epoch": 3.3773933711871122, "grad_norm": 0.474609375, "learning_rate": 3.113276316676322e-05, "loss": 1.271, "num_input_tokens_seen": 838082560, "step": 12790, "train_runtime": 200048.4541, "train_tokens_per_second": 4189.398 }, { "epoch": 3.380034332497029, "grad_norm": 0.51953125, "learning_rate": 3.087493465724922e-05, "loss": 1.1732, "num_input_tokens_seen": 838737920, "step": 12800, "train_runtime": 200248.2489, "train_tokens_per_second": 4188.491 }, { "epoch": 3.3826752938069458, "grad_norm": 0.474609375, "learning_rate": 3.061810792387007e-05, "loss": 1.1944, "num_input_tokens_seen": 839393280, "step": 12810, "train_runtime": 200450.0953, "train_tokens_per_second": 4187.542 }, { "epoch": 3.3853162551168623, "grad_norm": 0.48046875, "learning_rate": 3.036228414076292e-05, "loss": 1.1843, "num_input_tokens_seen": 840048640, "step": 12820, "train_runtime": 200649.5406, "train_tokens_per_second": 4186.646 }, { "epoch": 3.3879572164267793, "grad_norm": 0.486328125, "learning_rate": 3.0107464477479944e-05, "loss": 1.2258, "num_input_tokens_seen": 840704000, "step": 12830, "train_runtime": 200849.6394, "train_tokens_per_second": 4185.738 }, { "epoch": 3.3905981777366963, "grad_norm": 0.486328125, "learning_rate": 2.985365009898236e-05, "loss": 1.2559, "num_input_tokens_seen": 841359360, "step": 12840, "train_runtime": 201048.8138, "train_tokens_per_second": 4184.851 }, { "epoch": 3.393239139046613, "grad_norm": 0.47265625, "learning_rate": 2.9600842165635993e-05, "loss": 1.1931, "num_input_tokens_seen": 842014720, "step": 12850, "train_runtime": 201247.0508, "train_tokens_per_second": 4183.985 }, { "epoch": 3.3958801003565298, "grad_norm": 0.48828125, "learning_rate": 2.9349041833205136e-05, "loss": 1.1732, "num_input_tokens_seen": 842670080, "step": 12860, "train_runtime": 201444.9723, "train_tokens_per_second": 4183.128 }, { "epoch": 3.3985210616664467, "grad_norm": 0.470703125, "learning_rate": 2.9098250252847736e-05, "loss": 1.1667, "num_input_tokens_seen": 843325440, "step": 12870, "train_runtime": 201644.4698, "train_tokens_per_second": 4182.239 }, { "epoch": 3.4011620229763633, "grad_norm": 0.49609375, "learning_rate": 2.8848468571110148e-05, "loss": 1.2126, "num_input_tokens_seen": 843980800, "step": 12880, "train_runtime": 201843.5639, "train_tokens_per_second": 4181.361 }, { "epoch": 3.4038029842862803, "grad_norm": 0.490234375, "learning_rate": 2.859969792992159e-05, "loss": 1.2616, "num_input_tokens_seen": 844636160, "step": 12890, "train_runtime": 202042.1976, "train_tokens_per_second": 4180.494 }, { "epoch": 3.4064439455961972, "grad_norm": 0.46875, "learning_rate": 2.8351939466589148e-05, "loss": 1.1774, "num_input_tokens_seen": 845291520, "step": 12900, "train_runtime": 202241.7893, "train_tokens_per_second": 4179.609 }, { "epoch": 3.4090849069061138, "grad_norm": 0.51953125, "learning_rate": 2.8105194313792547e-05, "loss": 1.1754, "num_input_tokens_seen": 845946880, "step": 12910, "train_runtime": 202442.689, "train_tokens_per_second": 4178.698 }, { "epoch": 3.4117258682160307, "grad_norm": 0.48828125, "learning_rate": 2.7859463599578914e-05, "loss": 1.2352, "num_input_tokens_seen": 846602240, "step": 12920, "train_runtime": 202643.8979, "train_tokens_per_second": 4177.783 }, { "epoch": 3.4143668295259473, "grad_norm": 0.5, "learning_rate": 2.7614748447357645e-05, "loss": 1.2543, "num_input_tokens_seen": 847257600, "step": 12930, "train_runtime": 202843.0942, "train_tokens_per_second": 4176.911 }, { "epoch": 3.4170077908358643, "grad_norm": 0.5, "learning_rate": 2.737104997589543e-05, "loss": 1.1776, "num_input_tokens_seen": 847912960, "step": 12940, "train_runtime": 203043.176, "train_tokens_per_second": 4176.023 }, { "epoch": 3.4196487521457812, "grad_norm": 0.546875, "learning_rate": 2.7128369299310813e-05, "loss": 1.2102, "num_input_tokens_seen": 848568320, "step": 12950, "train_runtime": 203242.0299, "train_tokens_per_second": 4175.162 }, { "epoch": 3.4222897134556978, "grad_norm": 0.48046875, "learning_rate": 2.6886707527069315e-05, "loss": 1.2282, "num_input_tokens_seen": 849223680, "step": 12960, "train_runtime": 203440.4961, "train_tokens_per_second": 4174.31 }, { "epoch": 3.4249306747656147, "grad_norm": 0.490234375, "learning_rate": 2.6646065763978405e-05, "loss": 1.2195, "num_input_tokens_seen": 849879040, "step": 12970, "train_runtime": 203639.9581, "train_tokens_per_second": 4173.439 }, { "epoch": 3.4275716360755313, "grad_norm": 0.4765625, "learning_rate": 2.6406445110182196e-05, "loss": 1.2001, "num_input_tokens_seen": 850534400, "step": 12980, "train_runtime": 203840.5648, "train_tokens_per_second": 4172.547 }, { "epoch": 3.4302125973854483, "grad_norm": 0.4921875, "learning_rate": 2.6167846661156845e-05, "loss": 1.2176, "num_input_tokens_seen": 851189760, "step": 12990, "train_runtime": 204039.4385, "train_tokens_per_second": 4171.692 }, { "epoch": 3.4328535586953652, "grad_norm": 0.5078125, "learning_rate": 2.593027150770508e-05, "loss": 1.2267, "num_input_tokens_seen": 851845120, "step": 13000, "train_runtime": 204238.7053, "train_tokens_per_second": 4170.831 }, { "epoch": 3.4354945200052818, "grad_norm": 0.46484375, "learning_rate": 2.569372073595147e-05, "loss": 1.224, "num_input_tokens_seen": 852500480, "step": 13010, "train_runtime": 204447.0868, "train_tokens_per_second": 4169.785 }, { "epoch": 3.4381354813151987, "grad_norm": 0.5078125, "learning_rate": 2.545819542733735e-05, "loss": 1.2173, "num_input_tokens_seen": 853155840, "step": 13020, "train_runtime": 204644.5237, "train_tokens_per_second": 4168.965 }, { "epoch": 3.4407764426251157, "grad_norm": 0.484375, "learning_rate": 2.522369665861618e-05, "loss": 1.1678, "num_input_tokens_seen": 853811200, "step": 13030, "train_runtime": 204845.3204, "train_tokens_per_second": 4168.078 }, { "epoch": 3.4434174039350323, "grad_norm": 0.5, "learning_rate": 2.4990225501847985e-05, "loss": 1.2247, "num_input_tokens_seen": 854466560, "step": 13040, "train_runtime": 205045.968, "train_tokens_per_second": 4167.195 }, { "epoch": 3.4460583652449492, "grad_norm": 0.48046875, "learning_rate": 2.4757783024395242e-05, "loss": 1.1834, "num_input_tokens_seen": 855121920, "step": 13050, "train_runtime": 205244.1594, "train_tokens_per_second": 4166.364 }, { "epoch": 3.448699326554866, "grad_norm": 0.4765625, "learning_rate": 2.452637028891733e-05, "loss": 1.1835, "num_input_tokens_seen": 855777280, "step": 13060, "train_runtime": 205443.9479, "train_tokens_per_second": 4165.503 }, { "epoch": 3.4513402878647828, "grad_norm": 0.4765625, "learning_rate": 2.4295988353365994e-05, "loss": 1.1729, "num_input_tokens_seen": 856432640, "step": 13070, "train_runtime": 205644.5652, "train_tokens_per_second": 4164.626 }, { "epoch": 3.4539812491746997, "grad_norm": 0.5546875, "learning_rate": 2.4066638270980712e-05, "loss": 1.2091, "num_input_tokens_seen": 857088000, "step": 13080, "train_runtime": 205842.0822, "train_tokens_per_second": 4163.813 }, { "epoch": 3.4566222104846163, "grad_norm": 0.48046875, "learning_rate": 2.3838321090283168e-05, "loss": 1.2263, "num_input_tokens_seen": 857743360, "step": 13090, "train_runtime": 206041.7868, "train_tokens_per_second": 4162.958 }, { "epoch": 3.4592631717945332, "grad_norm": 0.494140625, "learning_rate": 2.3611037855073346e-05, "loss": 1.2314, "num_input_tokens_seen": 858398720, "step": 13100, "train_runtime": 206242.0558, "train_tokens_per_second": 4162.094 }, { "epoch": 3.4619041331044498, "grad_norm": 0.46484375, "learning_rate": 2.338478960442414e-05, "loss": 1.1976, "num_input_tokens_seen": 859054080, "step": 13110, "train_runtime": 206440.9257, "train_tokens_per_second": 4161.259 }, { "epoch": 3.4645450944143668, "grad_norm": 0.498046875, "learning_rate": 2.3159577372676765e-05, "loss": 1.211, "num_input_tokens_seen": 859709440, "step": 13120, "train_runtime": 206641.8579, "train_tokens_per_second": 4160.384 }, { "epoch": 3.4671860557242837, "grad_norm": 0.4765625, "learning_rate": 2.2935402189436126e-05, "loss": 1.2251, "num_input_tokens_seen": 860364800, "step": 13130, "train_runtime": 206843.7926, "train_tokens_per_second": 4159.491 }, { "epoch": 3.4698270170342003, "grad_norm": 0.490234375, "learning_rate": 2.2712265079566084e-05, "loss": 1.1866, "num_input_tokens_seen": 861020160, "step": 13140, "train_runtime": 207043.9691, "train_tokens_per_second": 4158.634 }, { "epoch": 3.4724679783441172, "grad_norm": 0.50390625, "learning_rate": 2.249016706318463e-05, "loss": 1.1895, "num_input_tokens_seen": 861675520, "step": 13150, "train_runtime": 207242.3309, "train_tokens_per_second": 4157.816 }, { "epoch": 3.4751089396540342, "grad_norm": 0.48046875, "learning_rate": 2.2269109155659385e-05, "loss": 1.2169, "num_input_tokens_seen": 862330880, "step": 13160, "train_runtime": 207442.3254, "train_tokens_per_second": 4156.967 }, { "epoch": 3.4777499009639508, "grad_norm": 0.470703125, "learning_rate": 2.2049092367602857e-05, "loss": 1.1999, "num_input_tokens_seen": 862986240, "step": 13170, "train_runtime": 207643.5395, "train_tokens_per_second": 4156.095 }, { "epoch": 3.4803908622738677, "grad_norm": 0.48828125, "learning_rate": 2.183011770486784e-05, "loss": 1.1669, "num_input_tokens_seen": 863641600, "step": 13180, "train_runtime": 207840.7608, "train_tokens_per_second": 4155.304 }, { "epoch": 3.4830318235837847, "grad_norm": 0.48828125, "learning_rate": 2.161218616854302e-05, "loss": 1.1572, "num_input_tokens_seen": 864296960, "step": 13190, "train_runtime": 208041.9979, "train_tokens_per_second": 4154.435 }, { "epoch": 3.4856727848937012, "grad_norm": 0.466796875, "learning_rate": 2.139529875494789e-05, "loss": 1.2137, "num_input_tokens_seen": 864952320, "step": 13200, "train_runtime": 208241.7729, "train_tokens_per_second": 4153.597 }, { "epoch": 3.4883137462036182, "grad_norm": 0.4921875, "learning_rate": 2.117945645562877e-05, "loss": 1.1582, "num_input_tokens_seen": 865607680, "step": 13210, "train_runtime": 208443.0606, "train_tokens_per_second": 4152.73 }, { "epoch": 3.4909547075135348, "grad_norm": 0.5625, "learning_rate": 2.096466025735397e-05, "loss": 1.2028, "num_input_tokens_seen": 866263040, "step": 13220, "train_runtime": 208642.2744, "train_tokens_per_second": 4151.906 }, { "epoch": 3.4935956688234517, "grad_norm": 0.4765625, "learning_rate": 2.0750911142109223e-05, "loss": 1.196, "num_input_tokens_seen": 866918400, "step": 13230, "train_runtime": 208842.4503, "train_tokens_per_second": 4151.064 }, { "epoch": 3.4962366301333687, "grad_norm": 0.48828125, "learning_rate": 2.0538210087093473e-05, "loss": 1.1948, "num_input_tokens_seen": 867573760, "step": 13240, "train_runtime": 209041.9236, "train_tokens_per_second": 4150.238 }, { "epoch": 3.4988775914432853, "grad_norm": 0.484375, "learning_rate": 2.032655806471409e-05, "loss": 1.2176, "num_input_tokens_seen": 868229120, "step": 13250, "train_runtime": 209240.9022, "train_tokens_per_second": 4149.424 }, { "epoch": 3.5015185527532022, "grad_norm": 0.478515625, "learning_rate": 2.0115956042582652e-05, "loss": 1.2142, "num_input_tokens_seen": 868884480, "step": 13260, "train_runtime": 209441.5027, "train_tokens_per_second": 4148.578 }, { "epoch": 3.5041595140631188, "grad_norm": 0.62109375, "learning_rate": 1.9906404983510373e-05, "loss": 1.1974, "num_input_tokens_seen": 869539840, "step": 13270, "train_runtime": 209641.0139, "train_tokens_per_second": 4147.756 }, { "epoch": 3.5068004753730357, "grad_norm": 0.453125, "learning_rate": 1.9697905845503877e-05, "loss": 1.1643, "num_input_tokens_seen": 870195200, "step": 13280, "train_runtime": 209842.1067, "train_tokens_per_second": 4146.905 }, { "epoch": 3.5094414366829527, "grad_norm": 0.50390625, "learning_rate": 1.9490459581760572e-05, "loss": 1.2387, "num_input_tokens_seen": 870850560, "step": 13290, "train_runtime": 210042.5034, "train_tokens_per_second": 4146.068 }, { "epoch": 3.5120823979928693, "grad_norm": 0.453125, "learning_rate": 1.928406714066458e-05, "loss": 1.1701, "num_input_tokens_seen": 871505920, "step": 13300, "train_runtime": 210243.3045, "train_tokens_per_second": 4145.226 }, { "epoch": 3.5147233593027862, "grad_norm": 0.478515625, "learning_rate": 1.9078729465782124e-05, "loss": 1.1755, "num_input_tokens_seen": 872161280, "step": 13310, "train_runtime": 210444.1165, "train_tokens_per_second": 4144.384 }, { "epoch": 3.517364320612703, "grad_norm": 0.490234375, "learning_rate": 1.88744474958574e-05, "loss": 1.1613, "num_input_tokens_seen": 872816640, "step": 13320, "train_runtime": 210644.2092, "train_tokens_per_second": 4143.559 }, { "epoch": 3.5200052819226197, "grad_norm": 0.48046875, "learning_rate": 1.8671222164808293e-05, "loss": 1.2078, "num_input_tokens_seen": 873472000, "step": 13330, "train_runtime": 210844.3767, "train_tokens_per_second": 4142.733 }, { "epoch": 3.5226462432325367, "grad_norm": 0.486328125, "learning_rate": 1.8469054401721862e-05, "loss": 1.204, "num_input_tokens_seen": 874127360, "step": 13340, "train_runtime": 211043.7854, "train_tokens_per_second": 4141.924 }, { "epoch": 3.5252872045424537, "grad_norm": 0.466796875, "learning_rate": 1.826794513085045e-05, "loss": 1.186, "num_input_tokens_seen": 874782720, "step": 13350, "train_runtime": 211244.06, "train_tokens_per_second": 4141.1 }, { "epoch": 3.5279281658523702, "grad_norm": 0.51953125, "learning_rate": 1.8067895271607237e-05, "loss": 1.2133, "num_input_tokens_seen": 875438080, "step": 13360, "train_runtime": 211443.4451, "train_tokens_per_second": 4140.294 }, { "epoch": 3.530569127162287, "grad_norm": 0.486328125, "learning_rate": 1.7868905738562008e-05, "loss": 1.1679, "num_input_tokens_seen": 876093440, "step": 13370, "train_runtime": 211643.4306, "train_tokens_per_second": 4139.479 }, { "epoch": 3.5332100884722037, "grad_norm": 0.486328125, "learning_rate": 1.7670977441437086e-05, "loss": 1.1936, "num_input_tokens_seen": 876748800, "step": 13380, "train_runtime": 211842.8057, "train_tokens_per_second": 4138.676 }, { "epoch": 3.5358510497821207, "grad_norm": 0.51171875, "learning_rate": 1.747411128510315e-05, "loss": 1.2133, "num_input_tokens_seen": 877404160, "step": 13390, "train_runtime": 212042.2914, "train_tokens_per_second": 4137.873 }, { "epoch": 3.5384920110920373, "grad_norm": 0.47265625, "learning_rate": 1.7278308169575097e-05, "loss": 1.2262, "num_input_tokens_seen": 878059520, "step": 13400, "train_runtime": 212242.0738, "train_tokens_per_second": 4137.066 }, { "epoch": 3.5411329724019542, "grad_norm": 0.478515625, "learning_rate": 1.7083568990007903e-05, "loss": 1.2296, "num_input_tokens_seen": 878714880, "step": 13410, "train_runtime": 212442.3742, "train_tokens_per_second": 4136.251 }, { "epoch": 3.543773933711871, "grad_norm": 0.474609375, "learning_rate": 1.6889894636692436e-05, "loss": 1.2331, "num_input_tokens_seen": 879370240, "step": 13420, "train_runtime": 212641.3248, "train_tokens_per_second": 4135.463 }, { "epoch": 3.5464148950217877, "grad_norm": 0.48828125, "learning_rate": 1.66972859950516e-05, "loss": 1.2616, "num_input_tokens_seen": 880025600, "step": 13430, "train_runtime": 212839.0824, "train_tokens_per_second": 4134.699 }, { "epoch": 3.5490558563317047, "grad_norm": 0.462890625, "learning_rate": 1.6505743945636254e-05, "loss": 1.1874, "num_input_tokens_seen": 880680960, "step": 13440, "train_runtime": 213038.6519, "train_tokens_per_second": 4133.902 }, { "epoch": 3.5516968176416217, "grad_norm": 0.46484375, "learning_rate": 1.631526936412081e-05, "loss": 1.2255, "num_input_tokens_seen": 881336320, "step": 13450, "train_runtime": 213238.8451, "train_tokens_per_second": 4133.095 }, { "epoch": 3.5543377789515382, "grad_norm": 0.474609375, "learning_rate": 1.6125863121299878e-05, "loss": 1.2345, "num_input_tokens_seen": 881991680, "step": 13460, "train_runtime": 213436.9801, "train_tokens_per_second": 4132.328 }, { "epoch": 3.556978740261455, "grad_norm": 0.46484375, "learning_rate": 1.5937526083083685e-05, "loss": 1.1988, "num_input_tokens_seen": 882647040, "step": 13470, "train_runtime": 213636.4625, "train_tokens_per_second": 4131.537 }, { "epoch": 3.559619701571372, "grad_norm": 0.470703125, "learning_rate": 1.5750259110494464e-05, "loss": 1.1976, "num_input_tokens_seen": 883302400, "step": 13480, "train_runtime": 213835.9394, "train_tokens_per_second": 4130.748 }, { "epoch": 3.5622606628812887, "grad_norm": 0.48828125, "learning_rate": 1.5564063059662376e-05, "loss": 1.1442, "num_input_tokens_seen": 883957760, "step": 13490, "train_runtime": 214033.1806, "train_tokens_per_second": 4130.003 }, { "epoch": 3.5649016241912057, "grad_norm": 0.490234375, "learning_rate": 1.5378938781821727e-05, "loss": 1.2078, "num_input_tokens_seen": 884613120, "step": 13500, "train_runtime": 214232.5465, "train_tokens_per_second": 4129.219 }, { "epoch": 3.5675425855011222, "grad_norm": 0.474609375, "learning_rate": 1.5194887123306911e-05, "loss": 1.2222, "num_input_tokens_seen": 885268480, "step": 13510, "train_runtime": 214439.8773, "train_tokens_per_second": 4128.283 }, { "epoch": 3.570183546811039, "grad_norm": 0.462890625, "learning_rate": 1.5011908925548656e-05, "loss": 1.2242, "num_input_tokens_seen": 885923840, "step": 13520, "train_runtime": 214640.0038, "train_tokens_per_second": 4127.487 }, { "epoch": 3.5728245081209558, "grad_norm": 0.462890625, "learning_rate": 1.4830005025070065e-05, "loss": 1.2099, "num_input_tokens_seen": 886579200, "step": 13530, "train_runtime": 214841.1359, "train_tokens_per_second": 4126.673 }, { "epoch": 3.5754654694308727, "grad_norm": 0.462890625, "learning_rate": 1.4649176253482944e-05, "loss": 1.2126, "num_input_tokens_seen": 887234560, "step": 13540, "train_runtime": 215040.6109, "train_tokens_per_second": 4125.893 }, { "epoch": 3.5781064307407897, "grad_norm": 0.494140625, "learning_rate": 1.4469423437483974e-05, "loss": 1.226, "num_input_tokens_seen": 887889920, "step": 13550, "train_runtime": 215240.4623, "train_tokens_per_second": 4125.107 }, { "epoch": 3.5807473920507062, "grad_norm": 0.48046875, "learning_rate": 1.429074739885064e-05, "loss": 1.2343, "num_input_tokens_seen": 888545280, "step": 13560, "train_runtime": 215437.507, "train_tokens_per_second": 4124.376 }, { "epoch": 3.5833883533606232, "grad_norm": 0.466796875, "learning_rate": 1.4113148954438048e-05, "loss": 1.2473, "num_input_tokens_seen": 889200640, "step": 13570, "train_runtime": 215636.2558, "train_tokens_per_second": 4123.614 }, { "epoch": 3.58602931467054, "grad_norm": 0.44921875, "learning_rate": 1.3936628916174588e-05, "loss": 1.2331, "num_input_tokens_seen": 889856000, "step": 13580, "train_runtime": 215837.184, "train_tokens_per_second": 4122.811 }, { "epoch": 3.5886702759804567, "grad_norm": 0.462890625, "learning_rate": 1.3761188091058614e-05, "loss": 1.1826, "num_input_tokens_seen": 890511360, "step": 13590, "train_runtime": 216037.0606, "train_tokens_per_second": 4122.031 }, { "epoch": 3.5913112372903737, "grad_norm": 0.47265625, "learning_rate": 1.3586827281154624e-05, "loss": 1.2182, "num_input_tokens_seen": 891166720, "step": 13600, "train_runtime": 216236.5386, "train_tokens_per_second": 4121.259 }, { "epoch": 3.5939521986002907, "grad_norm": 0.5, "learning_rate": 1.3413547283589566e-05, "loss": 1.21, "num_input_tokens_seen": 891822080, "step": 13610, "train_runtime": 216435.3099, "train_tokens_per_second": 4120.502 }, { "epoch": 3.5965931599102072, "grad_norm": 0.48046875, "learning_rate": 1.324134889054926e-05, "loss": 1.2063, "num_input_tokens_seen": 892477440, "step": 13620, "train_runtime": 216633.5407, "train_tokens_per_second": 4119.757 }, { "epoch": 3.599234121220124, "grad_norm": 0.447265625, "learning_rate": 1.3070232889274697e-05, "loss": 1.167, "num_input_tokens_seen": 893132800, "step": 13630, "train_runtime": 216831.2699, "train_tokens_per_second": 4119.022 }, { "epoch": 3.601875082530041, "grad_norm": 0.494140625, "learning_rate": 1.2900200062058554e-05, "loss": 1.2579, "num_input_tokens_seen": 893788160, "step": 13640, "train_runtime": 217031.5927, "train_tokens_per_second": 4118.24 }, { "epoch": 3.6045160438399577, "grad_norm": 0.48046875, "learning_rate": 1.2731251186241466e-05, "loss": 1.1622, "num_input_tokens_seen": 894443520, "step": 13650, "train_runtime": 217231.4601, "train_tokens_per_second": 4117.468 }, { "epoch": 3.6071570051498747, "grad_norm": 0.470703125, "learning_rate": 1.2563387034208673e-05, "loss": 1.1766, "num_input_tokens_seen": 895098880, "step": 13660, "train_runtime": 217431.878, "train_tokens_per_second": 4116.687 }, { "epoch": 3.6097979664597912, "grad_norm": 0.466796875, "learning_rate": 1.239660837338627e-05, "loss": 1.2258, "num_input_tokens_seen": 895754240, "step": 13670, "train_runtime": 217632.1859, "train_tokens_per_second": 4115.909 }, { "epoch": 3.612438927769708, "grad_norm": 0.4765625, "learning_rate": 1.2230915966237821e-05, "loss": 1.2101, "num_input_tokens_seen": 896409600, "step": 13680, "train_runtime": 217832.5619, "train_tokens_per_second": 4115.131 }, { "epoch": 3.6150798890796247, "grad_norm": 0.453125, "learning_rate": 1.2066310570260975e-05, "loss": 1.1702, "num_input_tokens_seen": 897064960, "step": 13690, "train_runtime": 218032.3563, "train_tokens_per_second": 4114.366 }, { "epoch": 3.6177208503895417, "grad_norm": 0.458984375, "learning_rate": 1.1902792937983603e-05, "loss": 1.1979, "num_input_tokens_seen": 897720320, "step": 13700, "train_runtime": 218233.3223, "train_tokens_per_second": 4113.58 }, { "epoch": 3.6203618116994587, "grad_norm": 0.453125, "learning_rate": 1.1740363816960974e-05, "loss": 1.1481, "num_input_tokens_seen": 898375680, "step": 13710, "train_runtime": 218431.4103, "train_tokens_per_second": 4112.85 }, { "epoch": 3.6230027730093752, "grad_norm": 0.5, "learning_rate": 1.1579023949771755e-05, "loss": 1.2161, "num_input_tokens_seen": 899031040, "step": 13720, "train_runtime": 218629.5375, "train_tokens_per_second": 4112.121 }, { "epoch": 3.625643734319292, "grad_norm": 0.4765625, "learning_rate": 1.1418774074014954e-05, "loss": 1.2183, "num_input_tokens_seen": 899686400, "step": 13730, "train_runtime": 218828.3947, "train_tokens_per_second": 4111.379 }, { "epoch": 3.628284695629209, "grad_norm": 0.484375, "learning_rate": 1.1259614922306483e-05, "loss": 1.219, "num_input_tokens_seen": 900341760, "step": 13740, "train_runtime": 219025.7375, "train_tokens_per_second": 4110.666 }, { "epoch": 3.6309256569391257, "grad_norm": 0.44921875, "learning_rate": 1.110154722227566e-05, "loss": 1.1889, "num_input_tokens_seen": 900997120, "step": 13750, "train_runtime": 219224.4013, "train_tokens_per_second": 4109.931 }, { "epoch": 3.6335666182490427, "grad_norm": 0.48828125, "learning_rate": 1.0944571696562156e-05, "loss": 1.2194, "num_input_tokens_seen": 901652480, "step": 13760, "train_runtime": 219422.6586, "train_tokens_per_second": 4109.204 }, { "epoch": 3.6362075795589597, "grad_norm": 0.46484375, "learning_rate": 1.078868906281244e-05, "loss": 1.1816, "num_input_tokens_seen": 902307840, "step": 13770, "train_runtime": 219620.9956, "train_tokens_per_second": 4108.477 }, { "epoch": 3.638848540868876, "grad_norm": 0.482421875, "learning_rate": 1.0633900033676646e-05, "loss": 1.2197, "num_input_tokens_seen": 902963200, "step": 13780, "train_runtime": 219820.9998, "train_tokens_per_second": 4107.72 }, { "epoch": 3.641489502178793, "grad_norm": 0.48046875, "learning_rate": 1.0480205316805214e-05, "loss": 1.2363, "num_input_tokens_seen": 903618560, "step": 13790, "train_runtime": 220021.0937, "train_tokens_per_second": 4106.963 }, { "epoch": 3.6441304634887097, "grad_norm": 0.48828125, "learning_rate": 1.0327605614845803e-05, "loss": 1.2226, "num_input_tokens_seen": 904273920, "step": 13800, "train_runtime": 220221.0633, "train_tokens_per_second": 4106.21 }, { "epoch": 3.6467714247986267, "grad_norm": 0.51953125, "learning_rate": 1.0176101625439777e-05, "loss": 1.2025, "num_input_tokens_seen": 904929280, "step": 13810, "train_runtime": 220418.9013, "train_tokens_per_second": 4105.498 }, { "epoch": 3.6494123861085432, "grad_norm": 0.46484375, "learning_rate": 1.0025694041219501e-05, "loss": 1.2232, "num_input_tokens_seen": 905584640, "step": 13820, "train_runtime": 220617.1805, "train_tokens_per_second": 4104.778 }, { "epoch": 3.65205334741846, "grad_norm": 0.48046875, "learning_rate": 9.876383549804662e-06, "loss": 1.2338, "num_input_tokens_seen": 906240000, "step": 13830, "train_runtime": 220816.7237, "train_tokens_per_second": 4104.037 }, { "epoch": 3.654694308728377, "grad_norm": 0.478515625, "learning_rate": 9.72817083379951e-06, "loss": 1.1752, "num_input_tokens_seen": 906895360, "step": 13840, "train_runtime": 221016.4261, "train_tokens_per_second": 4103.294 }, { "epoch": 3.6573352700382937, "grad_norm": 0.466796875, "learning_rate": 9.581056570789449e-06, "loss": 1.2227, "num_input_tokens_seen": 907550720, "step": 13850, "train_runtime": 221217.2326, "train_tokens_per_second": 4102.532 }, { "epoch": 3.6599762313482107, "grad_norm": 0.4921875, "learning_rate": 9.435041433338204e-06, "loss": 1.2033, "num_input_tokens_seen": 908206080, "step": 13860, "train_runtime": 221415.9105, "train_tokens_per_second": 4101.81 }, { "epoch": 3.6626171926581277, "grad_norm": 0.5, "learning_rate": 9.290126088984523e-06, "loss": 1.1826, "num_input_tokens_seen": 908861440, "step": 13870, "train_runtime": 221615.2648, "train_tokens_per_second": 4101.078 }, { "epoch": 3.665258153968044, "grad_norm": 0.4921875, "learning_rate": 9.146311200239316e-06, "loss": 1.2178, "num_input_tokens_seen": 909516800, "step": 13880, "train_runtime": 221815.2625, "train_tokens_per_second": 4100.335 }, { "epoch": 3.667899115277961, "grad_norm": 0.490234375, "learning_rate": 9.003597424582427e-06, "loss": 1.2385, "num_input_tokens_seen": 910172160, "step": 13890, "train_runtime": 222014.2274, "train_tokens_per_second": 4099.612 }, { "epoch": 3.670540076587878, "grad_norm": 0.470703125, "learning_rate": 8.861985414459733e-06, "loss": 1.1916, "num_input_tokens_seen": 910827520, "step": 13900, "train_runtime": 222212.6787, "train_tokens_per_second": 4098.9 }, { "epoch": 3.6731810378977947, "grad_norm": 0.48828125, "learning_rate": 8.721475817280306e-06, "loss": 1.1773, "num_input_tokens_seen": 911482880, "step": 13910, "train_runtime": 222409.6941, "train_tokens_per_second": 4098.216 }, { "epoch": 3.6758219992077117, "grad_norm": 0.470703125, "learning_rate": 8.582069275413107e-06, "loss": 1.178, "num_input_tokens_seen": 912138240, "step": 13920, "train_runtime": 222608.8701, "train_tokens_per_second": 4097.493 }, { "epoch": 3.6784629605176287, "grad_norm": 0.51171875, "learning_rate": 8.443766426184384e-06, "loss": 1.2265, "num_input_tokens_seen": 912793600, "step": 13930, "train_runtime": 222809.1608, "train_tokens_per_second": 4096.751 }, { "epoch": 3.681103921827545, "grad_norm": 0.484375, "learning_rate": 8.30656790187459e-06, "loss": 1.2005, "num_input_tokens_seen": 913448960, "step": 13940, "train_runtime": 223009.6557, "train_tokens_per_second": 4096.006 }, { "epoch": 3.683744883137462, "grad_norm": 0.44921875, "learning_rate": 8.170474329715489e-06, "loss": 1.1958, "num_input_tokens_seen": 914104320, "step": 13950, "train_runtime": 223208.3043, "train_tokens_per_second": 4095.297 }, { "epoch": 3.6863858444473787, "grad_norm": 0.466796875, "learning_rate": 8.03548633188736e-06, "loss": 1.2196, "num_input_tokens_seen": 914759680, "step": 13960, "train_runtime": 223405.5391, "train_tokens_per_second": 4094.615 }, { "epoch": 3.6890268057572957, "grad_norm": 0.49609375, "learning_rate": 7.901604525516137e-06, "loss": 1.1927, "num_input_tokens_seen": 915415040, "step": 13970, "train_runtime": 223604.7082, "train_tokens_per_second": 4093.899 }, { "epoch": 3.6916677670672122, "grad_norm": 0.5, "learning_rate": 7.768829522670523e-06, "loss": 1.2416, "num_input_tokens_seen": 916070400, "step": 13980, "train_runtime": 223803.4688, "train_tokens_per_second": 4093.191 }, { "epoch": 3.694308728377129, "grad_norm": 0.46484375, "learning_rate": 7.637161930359238e-06, "loss": 1.1839, "num_input_tokens_seen": 916725760, "step": 13990, "train_runtime": 224004.0002, "train_tokens_per_second": 4092.453 }, { "epoch": 3.696949689687046, "grad_norm": 0.49609375, "learning_rate": 7.506602350528302e-06, "loss": 1.2306, "num_input_tokens_seen": 917381120, "step": 14000, "train_runtime": 224203.0768, "train_tokens_per_second": 4091.742 }, { "epoch": 3.6995906509969627, "grad_norm": 0.486328125, "learning_rate": 7.377151380058095e-06, "loss": 1.1985, "num_input_tokens_seen": 918036480, "step": 14010, "train_runtime": 224415.321, "train_tokens_per_second": 4090.792 }, { "epoch": 3.7022316123068797, "grad_norm": 0.4765625, "learning_rate": 7.248809610760965e-06, "loss": 1.2116, "num_input_tokens_seen": 918691840, "step": 14020, "train_runtime": 224616.2308, "train_tokens_per_second": 4090.051 }, { "epoch": 3.7048725736167967, "grad_norm": 0.486328125, "learning_rate": 7.121577629378096e-06, "loss": 1.1776, "num_input_tokens_seen": 919347200, "step": 14030, "train_runtime": 224817.62, "train_tokens_per_second": 4089.302 }, { "epoch": 3.707513534926713, "grad_norm": 0.474609375, "learning_rate": 6.995456017577173e-06, "loss": 1.1871, "num_input_tokens_seen": 920002560, "step": 14040, "train_runtime": 225018.0887, "train_tokens_per_second": 4088.572 }, { "epoch": 3.71015449623663, "grad_norm": 0.48828125, "learning_rate": 6.870445351949611e-06, "loss": 1.2087, "num_input_tokens_seen": 920657920, "step": 14050, "train_runtime": 225218.7085, "train_tokens_per_second": 4087.839 }, { "epoch": 3.712795457546547, "grad_norm": 0.482421875, "learning_rate": 6.746546204007748e-06, "loss": 1.1506, "num_input_tokens_seen": 921313280, "step": 14060, "train_runtime": 225419.3873, "train_tokens_per_second": 4087.108 }, { "epoch": 3.7154364188564637, "grad_norm": 0.49609375, "learning_rate": 6.6237591401825945e-06, "loss": 1.1702, "num_input_tokens_seen": 921968640, "step": 14070, "train_runtime": 225621.2595, "train_tokens_per_second": 4086.355 }, { "epoch": 3.7180773801663807, "grad_norm": 0.4765625, "learning_rate": 6.502084721820872e-06, "loss": 1.2142, "num_input_tokens_seen": 922624000, "step": 14080, "train_runtime": 225824.7316, "train_tokens_per_second": 4085.576 }, { "epoch": 3.720718341476297, "grad_norm": 0.494140625, "learning_rate": 6.3815235051827015e-06, "loss": 1.1986, "num_input_tokens_seen": 923279360, "step": 14090, "train_runtime": 226027.682, "train_tokens_per_second": 4084.807 }, { "epoch": 3.723359302786214, "grad_norm": 0.48046875, "learning_rate": 6.262076041438913e-06, "loss": 1.1982, "num_input_tokens_seen": 923934720, "step": 14100, "train_runtime": 226231.419, "train_tokens_per_second": 4084.025 }, { "epoch": 3.7260002640961307, "grad_norm": 0.462890625, "learning_rate": 6.143742876668579e-06, "loss": 1.2041, "num_input_tokens_seen": 924590080, "step": 14110, "train_runtime": 226433.1677, "train_tokens_per_second": 4083.28 }, { "epoch": 3.7286412254060477, "grad_norm": 0.46875, "learning_rate": 6.026524551856622e-06, "loss": 1.2097, "num_input_tokens_seen": 925245440, "step": 14120, "train_runtime": 226634.7238, "train_tokens_per_second": 4082.541 }, { "epoch": 3.7312821867159647, "grad_norm": 0.4765625, "learning_rate": 5.910421602891153e-06, "loss": 1.1972, "num_input_tokens_seen": 925900800, "step": 14130, "train_runtime": 226835.6514, "train_tokens_per_second": 4081.813 }, { "epoch": 3.733923148025881, "grad_norm": 0.498046875, "learning_rate": 5.795434560561086e-06, "loss": 1.1631, "num_input_tokens_seen": 926556160, "step": 14140, "train_runtime": 227036.2349, "train_tokens_per_second": 4081.094 }, { "epoch": 3.736564109335798, "grad_norm": 0.453125, "learning_rate": 5.681563950553748e-06, "loss": 1.1805, "num_input_tokens_seen": 927211520, "step": 14150, "train_runtime": 227237.6319, "train_tokens_per_second": 4080.361 }, { "epoch": 3.739205070645715, "grad_norm": 0.5078125, "learning_rate": 5.5688102934525755e-06, "loss": 1.1834, "num_input_tokens_seen": 927866880, "step": 14160, "train_runtime": 227439.4682, "train_tokens_per_second": 4079.621 }, { "epoch": 3.7418460319556317, "grad_norm": 0.490234375, "learning_rate": 5.457174104734452e-06, "loss": 1.2132, "num_input_tokens_seen": 928522240, "step": 14170, "train_runtime": 227640.8196, "train_tokens_per_second": 4078.892 }, { "epoch": 3.7444869932655487, "grad_norm": 0.478515625, "learning_rate": 5.346655894767627e-06, "loss": 1.2065, "num_input_tokens_seen": 929177600, "step": 14180, "train_runtime": 227843.4445, "train_tokens_per_second": 4078.141 }, { "epoch": 3.7471279545754657, "grad_norm": 0.478515625, "learning_rate": 5.23725616880924e-06, "loss": 1.2355, "num_input_tokens_seen": 929832960, "step": 14190, "train_runtime": 228044.6956, "train_tokens_per_second": 4077.415 }, { "epoch": 3.749768915885382, "grad_norm": 0.458984375, "learning_rate": 5.128975427003052e-06, "loss": 1.2436, "num_input_tokens_seen": 930488320, "step": 14200, "train_runtime": 228245.4699, "train_tokens_per_second": 4076.7 }, { "epoch": 3.752409877195299, "grad_norm": 0.466796875, "learning_rate": 5.021814164377164e-06, "loss": 1.2149, "num_input_tokens_seen": 931143680, "step": 14210, "train_runtime": 228448.5168, "train_tokens_per_second": 4075.945 }, { "epoch": 3.755050838505216, "grad_norm": 0.53515625, "learning_rate": 4.9157728708417175e-06, "loss": 1.187, "num_input_tokens_seen": 931799040, "step": 14220, "train_runtime": 228650.9477, "train_tokens_per_second": 4075.203 }, { "epoch": 3.7576917998151327, "grad_norm": 0.484375, "learning_rate": 4.810852031186724e-06, "loss": 1.1855, "num_input_tokens_seen": 932454400, "step": 14230, "train_runtime": 228852.2679, "train_tokens_per_second": 4074.482 }, { "epoch": 3.7603327611250497, "grad_norm": 0.515625, "learning_rate": 4.7070521250797415e-06, "loss": 1.2241, "num_input_tokens_seen": 933109760, "step": 14240, "train_runtime": 229055.8793, "train_tokens_per_second": 4073.721 }, { "epoch": 3.762973722434966, "grad_norm": 0.484375, "learning_rate": 4.6043736270638405e-06, "loss": 1.2204, "num_input_tokens_seen": 933765120, "step": 14250, "train_runtime": 229257.8004, "train_tokens_per_second": 4072.992 }, { "epoch": 3.765614683744883, "grad_norm": 0.486328125, "learning_rate": 4.502817006555221e-06, "loss": 1.2021, "num_input_tokens_seen": 934420480, "step": 14260, "train_runtime": 229459.3204, "train_tokens_per_second": 4072.271 }, { "epoch": 3.7682556450547997, "grad_norm": 0.494140625, "learning_rate": 4.402382727841298e-06, "loss": 1.2272, "num_input_tokens_seen": 935075840, "step": 14270, "train_runtime": 229660.3344, "train_tokens_per_second": 4071.56 }, { "epoch": 3.7708966063647167, "grad_norm": 0.4765625, "learning_rate": 4.303071250078339e-06, "loss": 1.2002, "num_input_tokens_seen": 935731200, "step": 14280, "train_runtime": 229863.3416, "train_tokens_per_second": 4070.815 }, { "epoch": 3.7735375676746337, "grad_norm": 0.462890625, "learning_rate": 4.204883027289663e-06, "loss": 1.2122, "num_input_tokens_seen": 936386560, "step": 14290, "train_runtime": 230065.2321, "train_tokens_per_second": 4070.092 }, { "epoch": 3.77617852898455, "grad_norm": 0.47265625, "learning_rate": 4.107818508363226e-06, "loss": 1.1655, "num_input_tokens_seen": 937041920, "step": 14300, "train_runtime": 230267.3337, "train_tokens_per_second": 4069.365 }, { "epoch": 3.778819490294467, "grad_norm": 0.482421875, "learning_rate": 4.0118781370498406e-06, "loss": 1.2199, "num_input_tokens_seen": 937697280, "step": 14310, "train_runtime": 230468.4884, "train_tokens_per_second": 4068.657 }, { "epoch": 3.781460451604384, "grad_norm": 0.466796875, "learning_rate": 3.917062351961015e-06, "loss": 1.2092, "num_input_tokens_seen": 938352640, "step": 14320, "train_runtime": 230670.1688, "train_tokens_per_second": 4067.941 }, { "epoch": 3.7841014129143007, "grad_norm": 0.51953125, "learning_rate": 3.823371586566926e-06, "loss": 1.2375, "num_input_tokens_seen": 939008000, "step": 14330, "train_runtime": 230872.2466, "train_tokens_per_second": 4067.219 }, { "epoch": 3.7867423742242177, "grad_norm": 0.4921875, "learning_rate": 3.7308062691945864e-06, "loss": 1.2465, "num_input_tokens_seen": 939663360, "step": 14340, "train_runtime": 231072.487, "train_tokens_per_second": 4066.531 }, { "epoch": 3.7893833355341346, "grad_norm": 0.494140625, "learning_rate": 3.639366823025708e-06, "loss": 1.2263, "num_input_tokens_seen": 940318720, "step": 14350, "train_runtime": 231272.5489, "train_tokens_per_second": 4065.847 }, { "epoch": 3.792024296844051, "grad_norm": 0.453125, "learning_rate": 3.54905366609487e-06, "loss": 1.1946, "num_input_tokens_seen": 940974080, "step": 14360, "train_runtime": 231474.7511, "train_tokens_per_second": 4065.126 }, { "epoch": 3.794665258153968, "grad_norm": 0.474609375, "learning_rate": 3.459867211287576e-06, "loss": 1.2057, "num_input_tokens_seen": 941629440, "step": 14370, "train_runtime": 231676.2195, "train_tokens_per_second": 4064.42 }, { "epoch": 3.7973062194638847, "grad_norm": 0.46875, "learning_rate": 3.3718078663384223e-06, "loss": 1.2121, "num_input_tokens_seen": 942284800, "step": 14380, "train_runtime": 231877.9644, "train_tokens_per_second": 4063.71 }, { "epoch": 3.7999471807738017, "grad_norm": 0.484375, "learning_rate": 3.284876033829126e-06, "loss": 1.1891, "num_input_tokens_seen": 942940160, "step": 14390, "train_runtime": 232080.1367, "train_tokens_per_second": 4062.994 }, { "epoch": 3.802588142083718, "grad_norm": 0.453125, "learning_rate": 3.1990721111867514e-06, "loss": 1.1746, "num_input_tokens_seen": 943595520, "step": 14400, "train_runtime": 232285.0573, "train_tokens_per_second": 4062.231 }, { "epoch": 3.805229103393635, "grad_norm": 0.47265625, "learning_rate": 3.114396490681959e-06, "loss": 1.1686, "num_input_tokens_seen": 944250880, "step": 14410, "train_runtime": 232486.4408, "train_tokens_per_second": 4061.531 }, { "epoch": 3.807870064703552, "grad_norm": 0.4609375, "learning_rate": 3.0308495594270348e-06, "loss": 1.175, "num_input_tokens_seen": 944906240, "step": 14420, "train_runtime": 232689.4941, "train_tokens_per_second": 4060.803 }, { "epoch": 3.8105110260134687, "grad_norm": 0.455078125, "learning_rate": 2.948431699374282e-06, "loss": 1.2303, "num_input_tokens_seen": 945561600, "step": 14430, "train_runtime": 232892.6589, "train_tokens_per_second": 4060.075 }, { "epoch": 3.8131519873233857, "grad_norm": 0.47265625, "learning_rate": 2.8671432873142167e-06, "loss": 1.2429, "num_input_tokens_seen": 946216960, "step": 14440, "train_runtime": 233096.2378, "train_tokens_per_second": 4059.34 }, { "epoch": 3.8157929486333027, "grad_norm": 0.462890625, "learning_rate": 2.7869846948738453e-06, "loss": 1.2247, "num_input_tokens_seen": 946872320, "step": 14450, "train_runtime": 233299.6228, "train_tokens_per_second": 4058.611 }, { "epoch": 3.818433909943219, "grad_norm": 0.482421875, "learning_rate": 2.707956288514973e-06, "loss": 1.2455, "num_input_tokens_seen": 947527680, "step": 14460, "train_runtime": 233502.6646, "train_tokens_per_second": 4057.888 }, { "epoch": 3.821074871253136, "grad_norm": 0.486328125, "learning_rate": 2.6300584295324838e-06, "loss": 1.1743, "num_input_tokens_seen": 948183040, "step": 14470, "train_runtime": 233707.2456, "train_tokens_per_second": 4057.14 }, { "epoch": 3.823715832563053, "grad_norm": 0.478515625, "learning_rate": 2.5532914740527824e-06, "loss": 1.1732, "num_input_tokens_seen": 948838400, "step": 14480, "train_runtime": 233910.9349, "train_tokens_per_second": 4056.409 }, { "epoch": 3.8263567938729697, "grad_norm": 0.46875, "learning_rate": 2.477655773032078e-06, "loss": 1.1558, "num_input_tokens_seen": 949493760, "step": 14490, "train_runtime": 234114.5913, "train_tokens_per_second": 4055.68 }, { "epoch": 3.8289977551828867, "grad_norm": 0.46875, "learning_rate": 2.4031516722548275e-06, "loss": 1.2516, "num_input_tokens_seen": 950149120, "step": 14500, "train_runtime": 234316.7652, "train_tokens_per_second": 4054.977 }, { "epoch": 3.8316387164928036, "grad_norm": 0.484375, "learning_rate": 2.3297795123320974e-06, "loss": 1.2092, "num_input_tokens_seen": 950804480, "step": 14510, "train_runtime": 234525.197, "train_tokens_per_second": 4054.168 }, { "epoch": 3.83427967780272, "grad_norm": 0.478515625, "learning_rate": 2.2575396287001504e-06, "loss": 1.2242, "num_input_tokens_seen": 951459840, "step": 14520, "train_runtime": 234726.1867, "train_tokens_per_second": 4053.488 }, { "epoch": 3.836920639112637, "grad_norm": 0.484375, "learning_rate": 2.1864323516186945e-06, "loss": 1.1876, "num_input_tokens_seen": 952115200, "step": 14530, "train_runtime": 234926.1034, "train_tokens_per_second": 4052.828 }, { "epoch": 3.8395616004225537, "grad_norm": 0.462890625, "learning_rate": 2.1164580061695526e-06, "loss": 1.2072, "num_input_tokens_seen": 952770560, "step": 14540, "train_runtime": 235125.3789, "train_tokens_per_second": 4052.181 }, { "epoch": 3.8422025617324707, "grad_norm": 0.44921875, "learning_rate": 2.047616912255107e-06, "loss": 1.2411, "num_input_tokens_seen": 953425920, "step": 14550, "train_runtime": 235324.7506, "train_tokens_per_second": 4051.533 }, { "epoch": 3.844843523042387, "grad_norm": 0.5078125, "learning_rate": 1.9799093845968288e-06, "loss": 1.2211, "num_input_tokens_seen": 954081280, "step": 14560, "train_runtime": 235524.5496, "train_tokens_per_second": 4050.878 }, { "epoch": 3.847484484352304, "grad_norm": 0.5390625, "learning_rate": 1.9133357327338897e-06, "loss": 1.2291, "num_input_tokens_seen": 954736640, "step": 14570, "train_runtime": 235726.1599, "train_tokens_per_second": 4050.194 }, { "epoch": 3.850125445662221, "grad_norm": 0.466796875, "learning_rate": 1.8478962610216644e-06, "loss": 1.1795, "num_input_tokens_seen": 955392000, "step": 14580, "train_runtime": 235926.1679, "train_tokens_per_second": 4049.538 }, { "epoch": 3.8527664069721377, "grad_norm": 0.48046875, "learning_rate": 1.7835912686303967e-06, "loss": 1.2069, "num_input_tokens_seen": 956047360, "step": 14590, "train_runtime": 236128.0729, "train_tokens_per_second": 4048.851 }, { "epoch": 3.8554073682820547, "grad_norm": 0.52734375, "learning_rate": 1.720421049543841e-06, "loss": 1.2321, "num_input_tokens_seen": 956702720, "step": 14600, "train_runtime": 236329.5962, "train_tokens_per_second": 4048.171 }, { "epoch": 3.8580483295919716, "grad_norm": 0.4765625, "learning_rate": 1.6583858925578732e-06, "loss": 1.1925, "num_input_tokens_seen": 957358080, "step": 14610, "train_runtime": 236530.4196, "train_tokens_per_second": 4047.505 }, { "epoch": 3.860689290901888, "grad_norm": 0.46875, "learning_rate": 1.5974860812792146e-06, "loss": 1.2331, "num_input_tokens_seen": 958013440, "step": 14620, "train_runtime": 236731.028, "train_tokens_per_second": 4046.844 }, { "epoch": 3.863330252211805, "grad_norm": 0.53125, "learning_rate": 1.5377218941241277e-06, "loss": 1.2274, "num_input_tokens_seen": 958668800, "step": 14630, "train_runtime": 236932.0351, "train_tokens_per_second": 4046.176 }, { "epoch": 3.865971213521722, "grad_norm": 0.46875, "learning_rate": 1.4790936043170832e-06, "loss": 1.1883, "num_input_tokens_seen": 959324160, "step": 14640, "train_runtime": 237133.0352, "train_tokens_per_second": 4045.51 }, { "epoch": 3.8686121748316387, "grad_norm": 0.4765625, "learning_rate": 1.4216014798896227e-06, "loss": 1.18, "num_input_tokens_seen": 959979520, "step": 14650, "train_runtime": 237333.5434, "train_tokens_per_second": 4044.854 }, { "epoch": 3.8712531361415556, "grad_norm": 0.486328125, "learning_rate": 1.3652457836789977e-06, "loss": 1.2159, "num_input_tokens_seen": 960634880, "step": 14660, "train_runtime": 237534.255, "train_tokens_per_second": 4044.195 }, { "epoch": 3.873894097451472, "grad_norm": 0.484375, "learning_rate": 1.3100267733270887e-06, "loss": 1.2189, "num_input_tokens_seen": 961290240, "step": 14670, "train_runtime": 237736.0042, "train_tokens_per_second": 4043.52 }, { "epoch": 3.876535058761389, "grad_norm": 0.470703125, "learning_rate": 1.2559447012791824e-06, "loss": 1.2065, "num_input_tokens_seen": 961945600, "step": 14680, "train_runtime": 237936.4718, "train_tokens_per_second": 4042.867 }, { "epoch": 3.8791760200713057, "grad_norm": 0.49609375, "learning_rate": 1.2029998147827793e-06, "loss": 1.2207, "num_input_tokens_seen": 962600960, "step": 14690, "train_runtime": 238135.9488, "train_tokens_per_second": 4042.233 }, { "epoch": 3.8818169813812227, "grad_norm": 0.451171875, "learning_rate": 1.1511923558865657e-06, "loss": 1.2172, "num_input_tokens_seen": 963256320, "step": 14700, "train_runtime": 238334.3027, "train_tokens_per_second": 4041.618 }, { "epoch": 3.8844579426911396, "grad_norm": 0.474609375, "learning_rate": 1.10052256143911e-06, "loss": 1.1744, "num_input_tokens_seen": 963911680, "step": 14710, "train_runtime": 238533.0939, "train_tokens_per_second": 4040.998 }, { "epoch": 3.887098904001056, "grad_norm": 0.478515625, "learning_rate": 1.0509906630880583e-06, "loss": 1.2104, "num_input_tokens_seen": 964567040, "step": 14720, "train_runtime": 238732.2492, "train_tokens_per_second": 4040.372 }, { "epoch": 3.889739865310973, "grad_norm": 0.48046875, "learning_rate": 1.0025968872788282e-06, "loss": 1.2209, "num_input_tokens_seen": 965222400, "step": 14730, "train_runtime": 238930.6916, "train_tokens_per_second": 4039.759 }, { "epoch": 3.89238082662089, "grad_norm": 0.462890625, "learning_rate": 9.55341455253722e-07, "loss": 1.1996, "num_input_tokens_seen": 965877760, "step": 14740, "train_runtime": 239127.4331, "train_tokens_per_second": 4039.176 }, { "epoch": 3.8950217879308067, "grad_norm": 0.443359375, "learning_rate": 9.092245830508438e-07, "loss": 1.1773, "num_input_tokens_seen": 966533120, "step": 14750, "train_runtime": 239326.275, "train_tokens_per_second": 4038.558 }, { "epoch": 3.8976627492407236, "grad_norm": 0.51171875, "learning_rate": 8.642464815031004e-07, "loss": 1.2295, "num_input_tokens_seen": 967188480, "step": 14760, "train_runtime": 239521.3552, "train_tokens_per_second": 4038.005 }, { "epoch": 3.9003037105506406, "grad_norm": 0.494140625, "learning_rate": 8.204073562373404e-07, "loss": 1.1858, "num_input_tokens_seen": 967843840, "step": 14770, "train_runtime": 239721.737, "train_tokens_per_second": 4037.364 }, { "epoch": 3.902944671860557, "grad_norm": 0.51953125, "learning_rate": 7.777074076733004e-07, "loss": 1.1794, "num_input_tokens_seen": 968499200, "step": 14780, "train_runtime": 239920.1058, "train_tokens_per_second": 4036.757 }, { "epoch": 3.905585633170474, "grad_norm": 0.4765625, "learning_rate": 7.361468310227159e-07, "loss": 1.2027, "num_input_tokens_seen": 969154560, "step": 14790, "train_runtime": 240118.6572, "train_tokens_per_second": 4036.149 }, { "epoch": 3.908226594480391, "grad_norm": 0.5078125, "learning_rate": 6.957258162885171e-07, "loss": 1.2326, "num_input_tokens_seen": 969809920, "step": 14800, "train_runtime": 240318.3329, "train_tokens_per_second": 4035.522 }, { "epoch": 3.9108675557903076, "grad_norm": 0.470703125, "learning_rate": 6.564445482638015e-07, "loss": 1.1767, "num_input_tokens_seen": 970465280, "step": 14810, "train_runtime": 240516.6053, "train_tokens_per_second": 4034.92 }, { "epoch": 3.9135085171002246, "grad_norm": 0.47265625, "learning_rate": 6.183032065311123e-07, "loss": 1.2003, "num_input_tokens_seen": 971120640, "step": 14820, "train_runtime": 240716.4572, "train_tokens_per_second": 4034.293 }, { "epoch": 3.916149478410141, "grad_norm": 0.4765625, "learning_rate": 5.813019654615781e-07, "loss": 1.2024, "num_input_tokens_seen": 971776000, "step": 14830, "train_runtime": 240914.4151, "train_tokens_per_second": 4033.698 }, { "epoch": 3.918790439720058, "grad_norm": 0.474609375, "learning_rate": 5.454409942141636e-07, "loss": 1.202, "num_input_tokens_seen": 972431360, "step": 14840, "train_runtime": 241113.589, "train_tokens_per_second": 4033.084 }, { "epoch": 3.9214314010299747, "grad_norm": 0.474609375, "learning_rate": 5.107204567347812e-07, "loss": 1.2099, "num_input_tokens_seen": 973086720, "step": 14850, "train_runtime": 241313.8553, "train_tokens_per_second": 4032.453 }, { "epoch": 3.9240723623398917, "grad_norm": 0.494140625, "learning_rate": 4.771405117556526e-07, "loss": 1.2373, "num_input_tokens_seen": 973742080, "step": 14860, "train_runtime": 241512.7412, "train_tokens_per_second": 4031.846 }, { "epoch": 3.9267133236498086, "grad_norm": 0.49609375, "learning_rate": 4.447013127945043e-07, "loss": 1.2142, "num_input_tokens_seen": 974397440, "step": 14870, "train_runtime": 241712.1245, "train_tokens_per_second": 4031.231 }, { "epoch": 3.929354284959725, "grad_norm": 0.50390625, "learning_rate": 4.134030081539564e-07, "loss": 1.2312, "num_input_tokens_seen": 975052800, "step": 14880, "train_runtime": 241912.1289, "train_tokens_per_second": 4030.607 }, { "epoch": 3.931995246269642, "grad_norm": 0.474609375, "learning_rate": 3.832457409207457e-07, "loss": 1.2093, "num_input_tokens_seen": 975708160, "step": 14890, "train_runtime": 242113.1223, "train_tokens_per_second": 4029.968 }, { "epoch": 3.934636207579559, "grad_norm": 0.482421875, "learning_rate": 3.5422964896517087e-07, "loss": 1.1968, "num_input_tokens_seen": 976363520, "step": 14900, "train_runtime": 242311.4676, "train_tokens_per_second": 4029.374 }, { "epoch": 3.9372771688894757, "grad_norm": 0.51171875, "learning_rate": 3.2635486494031475e-07, "loss": 1.193, "num_input_tokens_seen": 977018880, "step": 14910, "train_runtime": 242511.3522, "train_tokens_per_second": 4028.755 }, { "epoch": 3.9399181301993926, "grad_norm": 0.484375, "learning_rate": 2.996215162816285e-07, "loss": 1.1974, "num_input_tokens_seen": 977674240, "step": 14920, "train_runtime": 242711.749, "train_tokens_per_second": 4028.129 }, { "epoch": 3.9425590915093096, "grad_norm": 0.470703125, "learning_rate": 2.7402972520623736e-07, "loss": 1.221, "num_input_tokens_seen": 978329600, "step": 14930, "train_runtime": 242909.5916, "train_tokens_per_second": 4027.546 }, { "epoch": 3.945200052819226, "grad_norm": 0.52734375, "learning_rate": 2.495796087123303e-07, "loss": 1.2264, "num_input_tokens_seen": 978984960, "step": 14940, "train_runtime": 243108.5553, "train_tokens_per_second": 4026.946 }, { "epoch": 3.947841014129143, "grad_norm": 0.478515625, "learning_rate": 2.2627127857874352e-07, "loss": 1.1979, "num_input_tokens_seen": 979640320, "step": 14950, "train_runtime": 243307.219, "train_tokens_per_second": 4026.351 }, { "epoch": 3.9504819754390597, "grad_norm": 0.49609375, "learning_rate": 2.0410484136443309e-07, "loss": 1.2009, "num_input_tokens_seen": 980295680, "step": 14960, "train_runtime": 243505.9871, "train_tokens_per_second": 4025.756 }, { "epoch": 3.9531229367489766, "grad_norm": 0.466796875, "learning_rate": 1.8308039840783663e-07, "loss": 1.2191, "num_input_tokens_seen": 980951040, "step": 14970, "train_runtime": 243704.8421, "train_tokens_per_second": 4025.16 }, { "epoch": 3.955763898058893, "grad_norm": 0.478515625, "learning_rate": 1.6319804582667907e-07, "loss": 1.1971, "num_input_tokens_seen": 981606400, "step": 14980, "train_runtime": 243904.6275, "train_tokens_per_second": 4024.55 }, { "epoch": 3.95840485936881, "grad_norm": 0.484375, "learning_rate": 1.444578745172509e-07, "loss": 1.1879, "num_input_tokens_seen": 982261760, "step": 14990, "train_runtime": 244103.8987, "train_tokens_per_second": 4023.949 }, { "epoch": 3.961045820678727, "grad_norm": 0.474609375, "learning_rate": 1.268599701541584e-07, "loss": 1.1779, "num_input_tokens_seen": 982917120, "step": 15000, "train_runtime": 244302.6509, "train_tokens_per_second": 4023.358 }, { "epoch": 3.9636867819886437, "grad_norm": 0.50390625, "learning_rate": 1.1040441318996286e-07, "loss": 1.2384, "num_input_tokens_seen": 983572480, "step": 15010, "train_runtime": 244515.7267, "train_tokens_per_second": 4022.533 }, { "epoch": 3.9663277432985606, "grad_norm": 0.48828125, "learning_rate": 9.509127885462542e-08, "loss": 1.2202, "num_input_tokens_seen": 984227840, "step": 15020, "train_runtime": 244719.9245, "train_tokens_per_second": 4021.854 }, { "epoch": 3.9689687046084776, "grad_norm": 0.5, "learning_rate": 8.09206371553961e-08, "loss": 1.21, "num_input_tokens_seen": 984883200, "step": 15030, "train_runtime": 244922.6385, "train_tokens_per_second": 4021.201 }, { "epoch": 3.971609665918394, "grad_norm": 0.455078125, "learning_rate": 6.789255287631412e-08, "loss": 1.1314, "num_input_tokens_seen": 985538560, "step": 15040, "train_runtime": 245125.8276, "train_tokens_per_second": 4020.541 }, { "epoch": 3.974250627228311, "grad_norm": 0.48828125, "learning_rate": 5.6007085578013705e-08, "loss": 1.2161, "num_input_tokens_seen": 986193920, "step": 15050, "train_runtime": 245326.2235, "train_tokens_per_second": 4019.929 }, { "epoch": 3.976891588538228, "grad_norm": 0.48046875, "learning_rate": 4.5264289597363174e-08, "loss": 1.1896, "num_input_tokens_seen": 986849280, "step": 15060, "train_runtime": 245528.3684, "train_tokens_per_second": 4019.288 }, { "epoch": 3.9795325498481446, "grad_norm": 0.49609375, "learning_rate": 3.566421404732623e-08, "loss": 1.167, "num_input_tokens_seen": 987504640, "step": 15070, "train_runtime": 245728.9485, "train_tokens_per_second": 4018.674 }, { "epoch": 3.9821735111580616, "grad_norm": 0.51171875, "learning_rate": 2.7206902816628854e-08, "loss": 1.2484, "num_input_tokens_seen": 988160000, "step": 15080, "train_runtime": 245928.8507, "train_tokens_per_second": 4018.073 }, { "epoch": 3.9848144724679786, "grad_norm": 0.484375, "learning_rate": 1.989239456970382e-08, "loss": 1.1808, "num_input_tokens_seen": 988815360, "step": 15090, "train_runtime": 246132.5042, "train_tokens_per_second": 4017.411 }, { "epoch": 3.987455433777895, "grad_norm": 0.47265625, "learning_rate": 1.3720722746302095e-08, "loss": 1.2086, "num_input_tokens_seen": 989470720, "step": 15100, "train_runtime": 246332.5248, "train_tokens_per_second": 4016.809 }, { "epoch": 3.990096395087812, "grad_norm": 0.490234375, "learning_rate": 8.691915561520602e-09, "loss": 1.2707, "num_input_tokens_seen": 990126080, "step": 15110, "train_runtime": 246535.5849, "train_tokens_per_second": 4016.159 }, { "epoch": 3.9927373563977286, "grad_norm": 0.46484375, "learning_rate": 4.805996005635693e-09, "loss": 1.2222, "num_input_tokens_seen": 990781440, "step": 15120, "train_runtime": 246736.6315, "train_tokens_per_second": 4015.543 }, { "epoch": 3.9953783177076456, "grad_norm": 0.478515625, "learning_rate": 2.0629818439366065e-09, "loss": 1.1705, "num_input_tokens_seen": 991436800, "step": 15130, "train_runtime": 246936.1816, "train_tokens_per_second": 4014.952 }, { "epoch": 3.998019279017562, "grad_norm": 0.474609375, "learning_rate": 4.6288561664220575e-10, "loss": 1.1888, "num_input_tokens_seen": 992092160, "step": 15140, "train_runtime": 247137.4018, "train_tokens_per_second": 4014.334 }, { "epoch": 4.0, "num_input_tokens_seen": 992575488, "step": 15148, "total_flos": 2.1583206982169395e+19, "train_loss": 2.0365376835735676, "train_runtime": 247315.1676, "train_samples_per_second": 1.96, "train_steps_per_second": 0.061, "train_tokens_per_second": 4013.403 } ], "logging_steps": 10, "max_steps": 15148, "num_input_tokens_seen": 992575488, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1583206982169395e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }