diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15195 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 15148, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026409613099168095, + "grad_norm": 344.0, + "learning_rate": 0.0, + "loss": 13.395, + "num_input_tokens_seen": 65536, + "step": 1, + "train_runtime": 39.9594, + "train_tokens_per_second": 1640.065 + }, + { + "epoch": 0.00264096130991681, + "grad_norm": 31.5, + "learning_rate": 9.89010989010989e-06, + "loss": 12.1214, + "num_input_tokens_seen": 655360, + "step": 10, + "train_runtime": 183.0716, + "train_tokens_per_second": 3579.802 + }, + { + "epoch": 0.00528192261983362, + "grad_norm": 3.703125, + "learning_rate": 2.087912087912088e-05, + "loss": 10.1795, + "num_input_tokens_seen": 1310720, + "step": 20, + "train_runtime": 442.7935, + "train_tokens_per_second": 2960.116 + }, + { + "epoch": 0.00792288392975043, + "grad_norm": 4.40625, + "learning_rate": 3.1868131868131866e-05, + "loss": 9.794, + "num_input_tokens_seen": 1966080, + "step": 30, + "train_runtime": 710.3312, + "train_tokens_per_second": 2767.836 + }, + { + "epoch": 0.01056384523966724, + "grad_norm": 15.375, + "learning_rate": 4.2857142857142856e-05, + "loss": 9.2328, + "num_input_tokens_seen": 2621440, + "step": 40, + "train_runtime": 978.4124, + "train_tokens_per_second": 2679.279 + }, + { + "epoch": 0.013204806549584048, + "grad_norm": 20.0, + "learning_rate": 5.3846153846153853e-05, + "loss": 7.5562, + "num_input_tokens_seen": 3276800, + "step": 50, + "train_runtime": 1245.024, + "train_tokens_per_second": 2631.917 + }, + { + "epoch": 0.01584576785950086, + "grad_norm": 15.25, + "learning_rate": 6.483516483516484e-05, + "loss": 6.6608, + "num_input_tokens_seen": 3932160, + "step": 60, + "train_runtime": 1512.3636, + "train_tokens_per_second": 2600.01 + }, + { + "epoch": 0.018486729169417668, + "grad_norm": 6.90625, + "learning_rate": 7.582417582417581e-05, + "loss": 6.3654, + "num_input_tokens_seen": 4587520, + "step": 70, + "train_runtime": 1781.6818, + "train_tokens_per_second": 2574.826 + }, + { + "epoch": 0.02112769047933448, + "grad_norm": 6.8125, + "learning_rate": 8.681318681318681e-05, + "loss": 5.9489, + "num_input_tokens_seen": 5242880, + "step": 80, + "train_runtime": 2051.9937, + "train_tokens_per_second": 2555.018 + }, + { + "epoch": 0.023768651789251288, + "grad_norm": 6.0625, + "learning_rate": 9.780219780219781e-05, + "loss": 5.5165, + "num_input_tokens_seen": 5898240, + "step": 90, + "train_runtime": 2374.9912, + "train_tokens_per_second": 2483.479 + }, + { + "epoch": 0.026409613099168096, + "grad_norm": 4.96875, + "learning_rate": 0.00010879120879120879, + "loss": 5.1193, + "num_input_tokens_seen": 6553600, + "step": 100, + "train_runtime": 2729.9406, + "train_tokens_per_second": 2400.638 + }, + { + "epoch": 0.029050574409084907, + "grad_norm": 4.6875, + "learning_rate": 0.00011978021978021979, + "loss": 4.9257, + "num_input_tokens_seen": 7208960, + "step": 110, + "train_runtime": 3085.8271, + "train_tokens_per_second": 2336.152 + }, + { + "epoch": 0.03169153571900172, + "grad_norm": 5.84375, + "learning_rate": 0.00013076923076923077, + "loss": 4.6357, + "num_input_tokens_seen": 7864320, + "step": 120, + "train_runtime": 3441.9154, + "train_tokens_per_second": 2284.867 + }, + { + "epoch": 0.034332497028918524, + "grad_norm": 4.71875, + "learning_rate": 0.00014175824175824176, + "loss": 4.6891, + "num_input_tokens_seen": 8519680, + "step": 130, + "train_runtime": 3798.0993, + "train_tokens_per_second": 2243.143 + }, + { + "epoch": 0.036973458338835336, + "grad_norm": 2.75, + "learning_rate": 0.00015274725274725277, + "loss": 4.5238, + "num_input_tokens_seen": 9175040, + "step": 140, + "train_runtime": 4158.3319, + "train_tokens_per_second": 2206.423 + }, + { + "epoch": 0.03961441964875215, + "grad_norm": 3.375, + "learning_rate": 0.00016373626373626375, + "loss": 4.1358, + "num_input_tokens_seen": 9830400, + "step": 150, + "train_runtime": 4518.0275, + "train_tokens_per_second": 2175.817 + }, + { + "epoch": 0.04225538095866896, + "grad_norm": 2.921875, + "learning_rate": 0.0001747252747252747, + "loss": 4.2246, + "num_input_tokens_seen": 10485760, + "step": 160, + "train_runtime": 4877.0489, + "train_tokens_per_second": 2150.022 + }, + { + "epoch": 0.044896342268585764, + "grad_norm": 3.140625, + "learning_rate": 0.00018571428571428572, + "loss": 3.9947, + "num_input_tokens_seen": 11141120, + "step": 170, + "train_runtime": 5236.3642, + "train_tokens_per_second": 2127.644 + }, + { + "epoch": 0.047537303578502575, + "grad_norm": 2.359375, + "learning_rate": 0.0001967032967032967, + "loss": 3.9596, + "num_input_tokens_seen": 11796480, + "step": 180, + "train_runtime": 5592.6933, + "train_tokens_per_second": 2109.266 + }, + { + "epoch": 0.05017826488841939, + "grad_norm": 5.03125, + "learning_rate": 0.0002076923076923077, + "loss": 3.9512, + "num_input_tokens_seen": 12451840, + "step": 190, + "train_runtime": 5949.0933, + "train_tokens_per_second": 2093.065 + }, + { + "epoch": 0.05281922619833619, + "grad_norm": 2.125, + "learning_rate": 0.0002186813186813187, + "loss": 3.8762, + "num_input_tokens_seen": 13107200, + "step": 200, + "train_runtime": 6307.5561, + "train_tokens_per_second": 2078.016 + }, + { + "epoch": 0.055460187508253, + "grad_norm": 2.109375, + "learning_rate": 0.00022967032967032965, + "loss": 3.7407, + "num_input_tokens_seen": 13762560, + "step": 210, + "train_runtime": 6669.6459, + "train_tokens_per_second": 2063.462 + }, + { + "epoch": 0.058101148818169815, + "grad_norm": 1.9375, + "learning_rate": 0.00024065934065934066, + "loss": 3.8132, + "num_input_tokens_seen": 14417920, + "step": 220, + "train_runtime": 7028.6859, + "train_tokens_per_second": 2051.297 + }, + { + "epoch": 0.06074211012808663, + "grad_norm": 2.15625, + "learning_rate": 0.00025164835164835165, + "loss": 3.7891, + "num_input_tokens_seen": 15073280, + "step": 230, + "train_runtime": 7387.9861, + "train_tokens_per_second": 2040.242 + }, + { + "epoch": 0.06338307143800344, + "grad_norm": 1.5078125, + "learning_rate": 0.00026263736263736266, + "loss": 3.6183, + "num_input_tokens_seen": 15728640, + "step": 240, + "train_runtime": 7747.3492, + "train_tokens_per_second": 2030.196 + }, + { + "epoch": 0.06602403274792025, + "grad_norm": 1.9453125, + "learning_rate": 0.00027362637362637367, + "loss": 3.7294, + "num_input_tokens_seen": 16384000, + "step": 250, + "train_runtime": 8107.0182, + "train_tokens_per_second": 2020.965 + }, + { + "epoch": 0.06866499405783705, + "grad_norm": 1.6640625, + "learning_rate": 0.0002846153846153846, + "loss": 3.5328, + "num_input_tokens_seen": 17039360, + "step": 260, + "train_runtime": 8467.8519, + "train_tokens_per_second": 2012.241 + }, + { + "epoch": 0.07130595536775386, + "grad_norm": 1.8515625, + "learning_rate": 0.00029560439560439563, + "loss": 3.6193, + "num_input_tokens_seen": 17694720, + "step": 270, + "train_runtime": 8827.9374, + "train_tokens_per_second": 2004.4 + }, + { + "epoch": 0.07394691667767067, + "grad_norm": 1.4921875, + "learning_rate": 0.00030659340659340665, + "loss": 3.5792, + "num_input_tokens_seen": 18350080, + "step": 280, + "train_runtime": 9190.5559, + "train_tokens_per_second": 1996.624 + }, + { + "epoch": 0.07658787798758748, + "grad_norm": 1.453125, + "learning_rate": 0.00031758241758241755, + "loss": 3.5012, + "num_input_tokens_seen": 19005440, + "step": 290, + "train_runtime": 9551.7396, + "train_tokens_per_second": 1989.736 + }, + { + "epoch": 0.0792288392975043, + "grad_norm": 1.1171875, + "learning_rate": 0.00032857142857142856, + "loss": 3.5406, + "num_input_tokens_seen": 19660800, + "step": 300, + "train_runtime": 9913.3256, + "train_tokens_per_second": 1983.27 + }, + { + "epoch": 0.0818698006074211, + "grad_norm": 1.390625, + "learning_rate": 0.00033956043956043957, + "loss": 3.5479, + "num_input_tokens_seen": 20316160, + "step": 310, + "train_runtime": 10272.1454, + "train_tokens_per_second": 1977.791 + }, + { + "epoch": 0.08451076191733792, + "grad_norm": 1.546875, + "learning_rate": 0.0003505494505494505, + "loss": 3.4769, + "num_input_tokens_seen": 20971520, + "step": 320, + "train_runtime": 10630.524, + "train_tokens_per_second": 1972.764 + }, + { + "epoch": 0.08715172322725472, + "grad_norm": 1.5390625, + "learning_rate": 0.00036153846153846154, + "loss": 3.5208, + "num_input_tokens_seen": 21626880, + "step": 330, + "train_runtime": 10988.8532, + "train_tokens_per_second": 1968.074 + }, + { + "epoch": 0.08979268453717153, + "grad_norm": 1.390625, + "learning_rate": 0.00037252747252747255, + "loss": 3.5306, + "num_input_tokens_seen": 22282240, + "step": 340, + "train_runtime": 11348.4705, + "train_tokens_per_second": 1963.458 + }, + { + "epoch": 0.09243364584708834, + "grad_norm": 1.3359375, + "learning_rate": 0.00038351648351648356, + "loss": 3.5036, + "num_input_tokens_seen": 22937600, + "step": 350, + "train_runtime": 11707.3514, + "train_tokens_per_second": 1959.248 + }, + { + "epoch": 0.09507460715700515, + "grad_norm": 1.125, + "learning_rate": 0.0003945054945054945, + "loss": 3.4787, + "num_input_tokens_seen": 23592960, + "step": 360, + "train_runtime": 12066.3216, + "train_tokens_per_second": 1955.274 + }, + { + "epoch": 0.09771556846692196, + "grad_norm": 1.3203125, + "learning_rate": 0.0004054945054945055, + "loss": 3.3933, + "num_input_tokens_seen": 24248320, + "step": 370, + "train_runtime": 12425.7846, + "train_tokens_per_second": 1951.452 + }, + { + "epoch": 0.10035652977683877, + "grad_norm": 1.2734375, + "learning_rate": 0.00041648351648351654, + "loss": 3.4365, + "num_input_tokens_seen": 24903680, + "step": 380, + "train_runtime": 12785.724, + "train_tokens_per_second": 1947.772 + }, + { + "epoch": 0.10299749108675559, + "grad_norm": 1.25, + "learning_rate": 0.00042747252747252744, + "loss": 3.3713, + "num_input_tokens_seen": 25559040, + "step": 390, + "train_runtime": 13145.3532, + "train_tokens_per_second": 1944.34 + }, + { + "epoch": 0.10563845239667238, + "grad_norm": 1.2265625, + "learning_rate": 0.00043846153846153845, + "loss": 3.4548, + "num_input_tokens_seen": 26214400, + "step": 400, + "train_runtime": 13504.0465, + "train_tokens_per_second": 1941.226 + }, + { + "epoch": 0.1082794137065892, + "grad_norm": 0.9453125, + "learning_rate": 0.00044945054945054946, + "loss": 3.5148, + "num_input_tokens_seen": 26869760, + "step": 410, + "train_runtime": 13863.849, + "train_tokens_per_second": 1938.117 + }, + { + "epoch": 0.110920375016506, + "grad_norm": 0.859375, + "learning_rate": 0.0004604395604395604, + "loss": 3.4389, + "num_input_tokens_seen": 27525120, + "step": 420, + "train_runtime": 14223.2273, + "train_tokens_per_second": 1935.223 + }, + { + "epoch": 0.11356133632642282, + "grad_norm": 0.89453125, + "learning_rate": 0.0004714285714285714, + "loss": 3.4073, + "num_input_tokens_seen": 28180480, + "step": 430, + "train_runtime": 14584.6221, + "train_tokens_per_second": 1932.205 + }, + { + "epoch": 0.11620229763633963, + "grad_norm": 1.0, + "learning_rate": 0.00048241758241758244, + "loss": 3.3578, + "num_input_tokens_seen": 28835840, + "step": 440, + "train_runtime": 14947.6491, + "train_tokens_per_second": 1929.122 + }, + { + "epoch": 0.11884325894625644, + "grad_norm": 0.95703125, + "learning_rate": 0.0004934065934065934, + "loss": 3.3008, + "num_input_tokens_seen": 29491200, + "step": 450, + "train_runtime": 15308.7424, + "train_tokens_per_second": 1926.429 + }, + { + "epoch": 0.12148422025617325, + "grad_norm": 1.0390625, + "learning_rate": 0.0004999999085657815, + "loss": 3.3657, + "num_input_tokens_seen": 30146560, + "step": 460, + "train_runtime": 15670.4393, + "train_tokens_per_second": 1923.785 + }, + { + "epoch": 0.12412518156609005, + "grad_norm": 0.69140625, + "learning_rate": 0.0004999988799315912, + "loss": 3.3419, + "num_input_tokens_seen": 30801920, + "step": 470, + "train_runtime": 16031.0698, + "train_tokens_per_second": 1921.389 + }, + { + "epoch": 0.12676614287600688, + "grad_norm": 0.734375, + "learning_rate": 0.0004999967083751558, + "loss": 3.2552, + "num_input_tokens_seen": 31457280, + "step": 480, + "train_runtime": 16392.9295, + "train_tokens_per_second": 1918.954 + }, + { + "epoch": 0.1294071041859237, + "grad_norm": 0.74609375, + "learning_rate": 0.0004999933939064028, + "loss": 3.2604, + "num_input_tokens_seen": 32112640, + "step": 490, + "train_runtime": 16754.6432, + "train_tokens_per_second": 1916.641 + }, + { + "epoch": 0.1320480654958405, + "grad_norm": 0.7734375, + "learning_rate": 0.0004999889365404853, + "loss": 3.3463, + "num_input_tokens_seen": 32768000, + "step": 500, + "train_runtime": 17119.8942, + "train_tokens_per_second": 1914.031 + }, + { + "epoch": 0.13468902680575728, + "grad_norm": 0.875, + "learning_rate": 0.0004999833362977808, + "loss": 3.2953, + "num_input_tokens_seen": 33423360, + "step": 510, + "train_runtime": 17482.3757, + "train_tokens_per_second": 1911.832 + }, + { + "epoch": 0.1373299881156741, + "grad_norm": 0.61328125, + "learning_rate": 0.0004999765932038922, + "loss": 3.2767, + "num_input_tokens_seen": 34078720, + "step": 520, + "train_runtime": 17845.7721, + "train_tokens_per_second": 1909.624 + }, + { + "epoch": 0.1399709494255909, + "grad_norm": 0.5859375, + "learning_rate": 0.0004999687072896469, + "loss": 3.2203, + "num_input_tokens_seen": 34734080, + "step": 530, + "train_runtime": 18209.6586, + "train_tokens_per_second": 1907.454 + }, + { + "epoch": 0.14261191073550772, + "grad_norm": 0.734375, + "learning_rate": 0.0004999596785910971, + "loss": 3.2896, + "num_input_tokens_seen": 35389440, + "step": 540, + "train_runtime": 18573.9267, + "train_tokens_per_second": 1905.329 + }, + { + "epoch": 0.14525287204542453, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999495071495191, + "loss": 3.2258, + "num_input_tokens_seen": 36044800, + "step": 550, + "train_runtime": 18927.2647, + "train_tokens_per_second": 1904.385 + }, + { + "epoch": 0.14789383335534134, + "grad_norm": 0.65625, + "learning_rate": 0.0004999381930114139, + "loss": 3.1621, + "num_input_tokens_seen": 36700160, + "step": 560, + "train_runtime": 19241.9441, + "train_tokens_per_second": 1907.3 + }, + { + "epoch": 0.15053479466525815, + "grad_norm": 0.625, + "learning_rate": 0.0004999257362285067, + "loss": 3.1957, + "num_input_tokens_seen": 37355520, + "step": 570, + "train_runtime": 19556.4805, + "train_tokens_per_second": 1910.135 + }, + { + "epoch": 0.15317575597517497, + "grad_norm": 0.56640625, + "learning_rate": 0.000499912136857746, + "loss": 3.245, + "num_input_tokens_seen": 38010880, + "step": 580, + "train_runtime": 19869.9929, + "train_tokens_per_second": 1912.979 + }, + { + "epoch": 0.15581671728509178, + "grad_norm": 0.59765625, + "learning_rate": 0.0004998973949613041, + "loss": 3.2056, + "num_input_tokens_seen": 38666240, + "step": 590, + "train_runtime": 20184.3191, + "train_tokens_per_second": 1915.657 + }, + { + "epoch": 0.1584576785950086, + "grad_norm": 0.5625, + "learning_rate": 0.0004998815106065771, + "loss": 3.0587, + "num_input_tokens_seen": 39321600, + "step": 600, + "train_runtime": 20498.9508, + "train_tokens_per_second": 1918.225 + }, + { + "epoch": 0.1610986399049254, + "grad_norm": 0.53125, + "learning_rate": 0.0004998644838661833, + "loss": 3.1037, + "num_input_tokens_seen": 39976960, + "step": 610, + "train_runtime": 20812.9691, + "train_tokens_per_second": 1920.772 + }, + { + "epoch": 0.1637396012148422, + "grad_norm": 0.56640625, + "learning_rate": 0.0004998463148179641, + "loss": 3.1465, + "num_input_tokens_seen": 40632320, + "step": 620, + "train_runtime": 21126.4959, + "train_tokens_per_second": 1923.287 + }, + { + "epoch": 0.16638056252475902, + "grad_norm": 0.546875, + "learning_rate": 0.0004998270035449831, + "loss": 3.1053, + "num_input_tokens_seen": 41287680, + "step": 630, + "train_runtime": 21440.6055, + "train_tokens_per_second": 1925.677 + }, + { + "epoch": 0.16902152383467584, + "grad_norm": 0.59375, + "learning_rate": 0.0004998065501355258, + "loss": 3.1986, + "num_input_tokens_seen": 41943040, + "step": 640, + "train_runtime": 21754.8128, + "train_tokens_per_second": 1927.989 + }, + { + "epoch": 0.17166248514459262, + "grad_norm": 0.58984375, + "learning_rate": 0.0004997849546830994, + "loss": 3.1209, + "num_input_tokens_seen": 42598400, + "step": 650, + "train_runtime": 22068.5213, + "train_tokens_per_second": 1930.279 + }, + { + "epoch": 0.17430344645450943, + "grad_norm": 0.53125, + "learning_rate": 0.0004997622172864317, + "loss": 3.136, + "num_input_tokens_seen": 43253760, + "step": 660, + "train_runtime": 22382.6292, + "train_tokens_per_second": 1932.47 + }, + { + "epoch": 0.17694440776442624, + "grad_norm": 0.53515625, + "learning_rate": 0.000499738338049472, + "loss": 3.0647, + "num_input_tokens_seen": 43909120, + "step": 670, + "train_runtime": 22696.8961, + "train_tokens_per_second": 1934.587 + }, + { + "epoch": 0.17958536907434305, + "grad_norm": 0.5390625, + "learning_rate": 0.0004997133170813886, + "loss": 3.1302, + "num_input_tokens_seen": 44564480, + "step": 680, + "train_runtime": 23012.1109, + "train_tokens_per_second": 1936.566 + }, + { + "epoch": 0.18222633038425987, + "grad_norm": 0.484375, + "learning_rate": 0.0004996871544965707, + "loss": 3.0429, + "num_input_tokens_seen": 45219840, + "step": 690, + "train_runtime": 23327.9941, + "train_tokens_per_second": 1938.437 + }, + { + "epoch": 0.18486729169417668, + "grad_norm": 0.50390625, + "learning_rate": 0.0004996598504146256, + "loss": 3.0483, + "num_input_tokens_seen": 45875200, + "step": 700, + "train_runtime": 23645.2585, + "train_tokens_per_second": 1940.144 + }, + { + "epoch": 0.1875082530040935, + "grad_norm": 0.51171875, + "learning_rate": 0.0004996314049603798, + "loss": 3.0258, + "num_input_tokens_seen": 46530560, + "step": 710, + "train_runtime": 23966.8706, + "train_tokens_per_second": 1941.453 + }, + { + "epoch": 0.1901492143140103, + "grad_norm": 0.490234375, + "learning_rate": 0.0004996018182638778, + "loss": 3.0474, + "num_input_tokens_seen": 47185920, + "step": 720, + "train_runtime": 24284.225, + "train_tokens_per_second": 1943.069 + }, + { + "epoch": 0.1927901756239271, + "grad_norm": 0.466796875, + "learning_rate": 0.000499571090460381, + "loss": 3.0208, + "num_input_tokens_seen": 47841280, + "step": 730, + "train_runtime": 24599.8854, + "train_tokens_per_second": 1944.777 + }, + { + "epoch": 0.19543113693384392, + "grad_norm": 0.4765625, + "learning_rate": 0.0004995392216903683, + "loss": 3.0995, + "num_input_tokens_seen": 48496640, + "step": 740, + "train_runtime": 24916.0996, + "train_tokens_per_second": 1946.398 + }, + { + "epoch": 0.19807209824376074, + "grad_norm": 0.59375, + "learning_rate": 0.0004995062120995344, + "loss": 2.9743, + "num_input_tokens_seen": 49152000, + "step": 750, + "train_runtime": 25232.3577, + "train_tokens_per_second": 1947.975 + }, + { + "epoch": 0.20071305955367755, + "grad_norm": 0.490234375, + "learning_rate": 0.0004994720618387896, + "loss": 3.04, + "num_input_tokens_seen": 49807360, + "step": 760, + "train_runtime": 25549.3423, + "train_tokens_per_second": 1949.458 + }, + { + "epoch": 0.20335402086359436, + "grad_norm": 0.57421875, + "learning_rate": 0.0004994367710642587, + "loss": 2.9516, + "num_input_tokens_seen": 50462720, + "step": 770, + "train_runtime": 25865.1856, + "train_tokens_per_second": 1950.99 + }, + { + "epoch": 0.20599498217351117, + "grad_norm": 0.51171875, + "learning_rate": 0.0004994003399372812, + "loss": 3.0103, + "num_input_tokens_seen": 51118080, + "step": 780, + "train_runtime": 26180.2975, + "train_tokens_per_second": 1952.54 + }, + { + "epoch": 0.20863594348342795, + "grad_norm": 0.490234375, + "learning_rate": 0.0004993627686244094, + "loss": 3.0131, + "num_input_tokens_seen": 51773440, + "step": 790, + "train_runtime": 26496.843, + "train_tokens_per_second": 1953.947 + }, + { + "epoch": 0.21127690479334477, + "grad_norm": 0.462890625, + "learning_rate": 0.0004993240572974086, + "loss": 3.031, + "num_input_tokens_seen": 52428800, + "step": 800, + "train_runtime": 26813.6464, + "train_tokens_per_second": 1955.303 + }, + { + "epoch": 0.21391786610326158, + "grad_norm": 0.470703125, + "learning_rate": 0.0004992842061332557, + "loss": 2.9113, + "num_input_tokens_seen": 53084160, + "step": 810, + "train_runtime": 27130.06, + "train_tokens_per_second": 1956.655 + }, + { + "epoch": 0.2165588274131784, + "grad_norm": 0.484375, + "learning_rate": 0.0004992432153141385, + "loss": 2.9847, + "num_input_tokens_seen": 53739520, + "step": 820, + "train_runtime": 27448.1176, + "train_tokens_per_second": 1957.858 + }, + { + "epoch": 0.2191997887230952, + "grad_norm": 0.455078125, + "learning_rate": 0.0004992010850274552, + "loss": 2.9724, + "num_input_tokens_seen": 54394880, + "step": 830, + "train_runtime": 27762.5366, + "train_tokens_per_second": 1959.291 + }, + { + "epoch": 0.221840750033012, + "grad_norm": 0.46484375, + "learning_rate": 0.0004991578154658133, + "loss": 3.0043, + "num_input_tokens_seen": 55050240, + "step": 840, + "train_runtime": 28069.7859, + "train_tokens_per_second": 1961.192 + }, + { + "epoch": 0.22448171134292882, + "grad_norm": 0.443359375, + "learning_rate": 0.0004991134068270287, + "loss": 2.9317, + "num_input_tokens_seen": 55705600, + "step": 850, + "train_runtime": 28319.1428, + "train_tokens_per_second": 1967.065 + }, + { + "epoch": 0.22712267265284564, + "grad_norm": 0.49609375, + "learning_rate": 0.0004990678593141246, + "loss": 2.9107, + "num_input_tokens_seen": 56360960, + "step": 860, + "train_runtime": 28526.2213, + "train_tokens_per_second": 1975.76 + }, + { + "epoch": 0.22976363396276245, + "grad_norm": 0.55859375, + "learning_rate": 0.0004990211731353312, + "loss": 2.9912, + "num_input_tokens_seen": 57016320, + "step": 870, + "train_runtime": 28732.8072, + "train_tokens_per_second": 1984.363 + }, + { + "epoch": 0.23240459527267926, + "grad_norm": 0.48046875, + "learning_rate": 0.000498973348504084, + "loss": 2.965, + "num_input_tokens_seen": 57671680, + "step": 880, + "train_runtime": 28942.9681, + "train_tokens_per_second": 1992.597 + }, + { + "epoch": 0.23504555658259607, + "grad_norm": 0.4921875, + "learning_rate": 0.0004989243856390233, + "loss": 3.0581, + "num_input_tokens_seen": 58327040, + "step": 890, + "train_runtime": 29154.1158, + "train_tokens_per_second": 2000.645 + }, + { + "epoch": 0.23768651789251288, + "grad_norm": 0.447265625, + "learning_rate": 0.0004988742847639932, + "loss": 2.9547, + "num_input_tokens_seen": 58982400, + "step": 900, + "train_runtime": 29364.6284, + "train_tokens_per_second": 2008.621 + }, + { + "epoch": 0.2403274792024297, + "grad_norm": 0.49609375, + "learning_rate": 0.0004988230461080403, + "loss": 2.9386, + "num_input_tokens_seen": 59637760, + "step": 910, + "train_runtime": 29574.7867, + "train_tokens_per_second": 2016.507 + }, + { + "epoch": 0.2429684405123465, + "grad_norm": 0.51171875, + "learning_rate": 0.0004987706699054129, + "loss": 2.9689, + "num_input_tokens_seen": 60293120, + "step": 920, + "train_runtime": 29786.007, + "train_tokens_per_second": 2024.21 + }, + { + "epoch": 0.2456094018222633, + "grad_norm": 0.50390625, + "learning_rate": 0.0004987171563955597, + "loss": 2.9408, + "num_input_tokens_seen": 60948480, + "step": 930, + "train_runtime": 29996.8903, + "train_tokens_per_second": 2031.827 + }, + { + "epoch": 0.2482503631321801, + "grad_norm": 0.5078125, + "learning_rate": 0.0004986625058231289, + "loss": 2.9308, + "num_input_tokens_seen": 61603840, + "step": 940, + "train_runtime": 30207.0952, + "train_tokens_per_second": 2039.383 + }, + { + "epoch": 0.25089132444209694, + "grad_norm": 0.455078125, + "learning_rate": 0.0004986067184379673, + "loss": 2.9415, + "num_input_tokens_seen": 62259200, + "step": 950, + "train_runtime": 30418.1436, + "train_tokens_per_second": 2046.778 + }, + { + "epoch": 0.25353228575201375, + "grad_norm": 0.447265625, + "learning_rate": 0.0004985497944951182, + "loss": 2.8731, + "num_input_tokens_seen": 62914560, + "step": 960, + "train_runtime": 30627.5422, + "train_tokens_per_second": 2054.182 + }, + { + "epoch": 0.25617324706193056, + "grad_norm": 0.431640625, + "learning_rate": 0.0004984917342548217, + "loss": 2.9454, + "num_input_tokens_seen": 63569920, + "step": 970, + "train_runtime": 30837.6788, + "train_tokens_per_second": 2061.437 + }, + { + "epoch": 0.2588142083718474, + "grad_norm": 0.431640625, + "learning_rate": 0.000498432537982512, + "loss": 2.9318, + "num_input_tokens_seen": 64225280, + "step": 980, + "train_runtime": 31048.345, + "train_tokens_per_second": 2068.557 + }, + { + "epoch": 0.2614551696817642, + "grad_norm": 0.4375, + "learning_rate": 0.0004983722059488176, + "loss": 2.885, + "num_input_tokens_seen": 64880640, + "step": 990, + "train_runtime": 31259.0861, + "train_tokens_per_second": 2075.577 + }, + { + "epoch": 0.264096130991681, + "grad_norm": 0.46484375, + "learning_rate": 0.0004983107384295588, + "loss": 2.9069, + "num_input_tokens_seen": 65536000, + "step": 1000, + "train_runtime": 31469.6335, + "train_tokens_per_second": 2082.516 + }, + { + "epoch": 0.26673709230159776, + "grad_norm": 0.435546875, + "learning_rate": 0.0004982481357057474, + "loss": 2.942, + "num_input_tokens_seen": 66191360, + "step": 1010, + "train_runtime": 31685.0895, + "train_tokens_per_second": 2089.038 + }, + { + "epoch": 0.26937805361151457, + "grad_norm": 0.416015625, + "learning_rate": 0.0004981843980635846, + "loss": 2.7781, + "num_input_tokens_seen": 66846720, + "step": 1020, + "train_runtime": 31889.6356, + "train_tokens_per_second": 2096.19 + }, + { + "epoch": 0.2720190149214314, + "grad_norm": 0.5390625, + "learning_rate": 0.0004981195257944607, + "loss": 2.9807, + "num_input_tokens_seen": 67502080, + "step": 1030, + "train_runtime": 32099.7982, + "train_tokens_per_second": 2102.882 + }, + { + "epoch": 0.2746599762313482, + "grad_norm": 0.458984375, + "learning_rate": 0.0004980535191949528, + "loss": 2.9739, + "num_input_tokens_seen": 68157440, + "step": 1040, + "train_runtime": 32310.6244, + "train_tokens_per_second": 2109.444 + }, + { + "epoch": 0.277300937541265, + "grad_norm": 0.5859375, + "learning_rate": 0.0004979863785668237, + "loss": 2.8292, + "num_input_tokens_seen": 68812800, + "step": 1050, + "train_runtime": 32520.9509, + "train_tokens_per_second": 2115.953 + }, + { + "epoch": 0.2799418988511818, + "grad_norm": 0.4765625, + "learning_rate": 0.000497918104217021, + "loss": 2.9008, + "num_input_tokens_seen": 69468160, + "step": 1060, + "train_runtime": 32730.4591, + "train_tokens_per_second": 2122.432 + }, + { + "epoch": 0.2825828601610986, + "grad_norm": 0.42578125, + "learning_rate": 0.0004978486964576752, + "loss": 2.8412, + "num_input_tokens_seen": 70123520, + "step": 1070, + "train_runtime": 32939.7476, + "train_tokens_per_second": 2128.842 + }, + { + "epoch": 0.28522382147101544, + "grad_norm": 0.41796875, + "learning_rate": 0.0004977781556060984, + "loss": 2.9267, + "num_input_tokens_seen": 70778880, + "step": 1080, + "train_runtime": 33149.2502, + "train_tokens_per_second": 2135.158 + }, + { + "epoch": 0.28786478278093225, + "grad_norm": 0.416015625, + "learning_rate": 0.0004977064819847828, + "loss": 2.8814, + "num_input_tokens_seen": 71434240, + "step": 1090, + "train_runtime": 33359.5416, + "train_tokens_per_second": 2141.344 + }, + { + "epoch": 0.29050574409084906, + "grad_norm": 0.42578125, + "learning_rate": 0.0004976336759213994, + "loss": 2.8877, + "num_input_tokens_seen": 72089600, + "step": 1100, + "train_runtime": 33569.673, + "train_tokens_per_second": 2147.462 + }, + { + "epoch": 0.2931467054007659, + "grad_norm": 0.470703125, + "learning_rate": 0.0004975597377487965, + "loss": 2.8939, + "num_input_tokens_seen": 72744960, + "step": 1110, + "train_runtime": 33779.1378, + "train_tokens_per_second": 2153.547 + }, + { + "epoch": 0.2957876667106827, + "grad_norm": 0.41015625, + "learning_rate": 0.0004974846678049977, + "loss": 2.9035, + "num_input_tokens_seen": 73400320, + "step": 1120, + "train_runtime": 33988.6643, + "train_tokens_per_second": 2159.553 + }, + { + "epoch": 0.2984286280205995, + "grad_norm": 0.423828125, + "learning_rate": 0.0004974084664332012, + "loss": 2.8358, + "num_input_tokens_seen": 74055680, + "step": 1130, + "train_runtime": 34197.6484, + "train_tokens_per_second": 2165.52 + }, + { + "epoch": 0.3010695893305163, + "grad_norm": 0.4140625, + "learning_rate": 0.0004973311339817774, + "loss": 2.8287, + "num_input_tokens_seen": 74711040, + "step": 1140, + "train_runtime": 34407.5712, + "train_tokens_per_second": 2171.355 + }, + { + "epoch": 0.3037105506404331, + "grad_norm": 0.494140625, + "learning_rate": 0.0004972526708042678, + "loss": 2.8682, + "num_input_tokens_seen": 75366400, + "step": 1150, + "train_runtime": 34617.1799, + "train_tokens_per_second": 2177.139 + }, + { + "epoch": 0.30635151195034993, + "grad_norm": 0.455078125, + "learning_rate": 0.0004971730772593834, + "loss": 2.9179, + "num_input_tokens_seen": 76021760, + "step": 1160, + "train_runtime": 34826.9259, + "train_tokens_per_second": 2182.844 + }, + { + "epoch": 0.30899247326026674, + "grad_norm": 0.45703125, + "learning_rate": 0.0004970923537110026, + "loss": 2.8583, + "num_input_tokens_seen": 76677120, + "step": 1170, + "train_runtime": 35036.9229, + "train_tokens_per_second": 2188.466 + }, + { + "epoch": 0.31163343457018355, + "grad_norm": 0.451171875, + "learning_rate": 0.00049701050052817, + "loss": 2.8784, + "num_input_tokens_seen": 77332480, + "step": 1180, + "train_runtime": 35246.7906, + "train_tokens_per_second": 2194.029 + }, + { + "epoch": 0.31427439588010037, + "grad_norm": 0.396484375, + "learning_rate": 0.0004969275180850948, + "loss": 2.8666, + "num_input_tokens_seen": 77987840, + "step": 1190, + "train_runtime": 35455.8558, + "train_tokens_per_second": 2199.576 + }, + { + "epoch": 0.3169153571900172, + "grad_norm": 0.390625, + "learning_rate": 0.0004968434067611483, + "loss": 2.7926, + "num_input_tokens_seen": 78643200, + "step": 1200, + "train_runtime": 35665.5735, + "train_tokens_per_second": 2205.017 + }, + { + "epoch": 0.319556318499934, + "grad_norm": 0.482421875, + "learning_rate": 0.0004967581669408632, + "loss": 2.8923, + "num_input_tokens_seen": 79298560, + "step": 1210, + "train_runtime": 35874.4887, + "train_tokens_per_second": 2210.444 + }, + { + "epoch": 0.3221972798098508, + "grad_norm": 0.62890625, + "learning_rate": 0.0004966717990139313, + "loss": 2.8482, + "num_input_tokens_seen": 79953920, + "step": 1220, + "train_runtime": 36083.0476, + "train_tokens_per_second": 2215.831 + }, + { + "epoch": 0.3248382411197676, + "grad_norm": 0.412109375, + "learning_rate": 0.0004965843033752015, + "loss": 2.8508, + "num_input_tokens_seen": 80609280, + "step": 1230, + "train_runtime": 36292.5082, + "train_tokens_per_second": 2221.1 + }, + { + "epoch": 0.3274792024296844, + "grad_norm": 0.41015625, + "learning_rate": 0.0004964956804246784, + "loss": 2.9238, + "num_input_tokens_seen": 81264640, + "step": 1240, + "train_runtime": 36502.8026, + "train_tokens_per_second": 2226.258 + }, + { + "epoch": 0.33012016373960124, + "grad_norm": 0.421875, + "learning_rate": 0.0004964059305675205, + "loss": 2.8232, + "num_input_tokens_seen": 81920000, + "step": 1250, + "train_runtime": 36711.7136, + "train_tokens_per_second": 2231.44 + }, + { + "epoch": 0.33276112504951805, + "grad_norm": 0.392578125, + "learning_rate": 0.0004963150542140381, + "loss": 2.8759, + "num_input_tokens_seen": 82575360, + "step": 1260, + "train_runtime": 36921.2047, + "train_tokens_per_second": 2236.529 + }, + { + "epoch": 0.33540208635943486, + "grad_norm": 0.39453125, + "learning_rate": 0.0004962230517796915, + "loss": 2.8602, + "num_input_tokens_seen": 83230720, + "step": 1270, + "train_runtime": 37131.0352, + "train_tokens_per_second": 2241.541 + }, + { + "epoch": 0.33804304766935167, + "grad_norm": 0.380859375, + "learning_rate": 0.0004961299236850889, + "loss": 2.7801, + "num_input_tokens_seen": 83886080, + "step": 1280, + "train_runtime": 37340.6942, + "train_tokens_per_second": 2246.506 + }, + { + "epoch": 0.3406840089792684, + "grad_norm": 0.40234375, + "learning_rate": 0.000496035670355985, + "loss": 2.8335, + "num_input_tokens_seen": 84541440, + "step": 1290, + "train_runtime": 37550.6167, + "train_tokens_per_second": 2251.4 + }, + { + "epoch": 0.34332497028918524, + "grad_norm": 0.41796875, + "learning_rate": 0.0004959402922232788, + "loss": 2.7464, + "num_input_tokens_seen": 85196800, + "step": 1300, + "train_runtime": 37760.9191, + "train_tokens_per_second": 2256.216 + }, + { + "epoch": 0.34596593159910205, + "grad_norm": 0.392578125, + "learning_rate": 0.0004958437897230112, + "loss": 2.7879, + "num_input_tokens_seen": 85852160, + "step": 1310, + "train_runtime": 37970.4444, + "train_tokens_per_second": 2261.026 + }, + { + "epoch": 0.34860689290901886, + "grad_norm": 0.416015625, + "learning_rate": 0.0004957461632963637, + "loss": 2.8249, + "num_input_tokens_seen": 86507520, + "step": 1320, + "train_runtime": 38180.3322, + "train_tokens_per_second": 2265.761 + }, + { + "epoch": 0.3512478542189357, + "grad_norm": 0.37890625, + "learning_rate": 0.0004956474133896558, + "loss": 2.8435, + "num_input_tokens_seen": 87162880, + "step": 1330, + "train_runtime": 38390.9946, + "train_tokens_per_second": 2270.399 + }, + { + "epoch": 0.3538888155288525, + "grad_norm": 0.365234375, + "learning_rate": 0.0004955475404543436, + "loss": 2.858, + "num_input_tokens_seen": 87818240, + "step": 1340, + "train_runtime": 38600.5942, + "train_tokens_per_second": 2275.049 + }, + { + "epoch": 0.3565297768387693, + "grad_norm": 0.39453125, + "learning_rate": 0.0004954465449470172, + "loss": 2.7555, + "num_input_tokens_seen": 88473600, + "step": 1350, + "train_runtime": 38810.0092, + "train_tokens_per_second": 2279.659 + }, + { + "epoch": 0.3591707381486861, + "grad_norm": 0.380859375, + "learning_rate": 0.0004953444273293983, + "loss": 2.785, + "num_input_tokens_seen": 89128960, + "step": 1360, + "train_runtime": 39020.2344, + "train_tokens_per_second": 2284.173 + }, + { + "epoch": 0.3618116994586029, + "grad_norm": 0.390625, + "learning_rate": 0.0004952411880683394, + "loss": 2.8254, + "num_input_tokens_seen": 89784320, + "step": 1370, + "train_runtime": 39228.1232, + "train_tokens_per_second": 2288.774 + }, + { + "epoch": 0.36445266076851973, + "grad_norm": 0.380859375, + "learning_rate": 0.0004951368276358201, + "loss": 2.7833, + "num_input_tokens_seen": 90439680, + "step": 1380, + "train_runtime": 39436.5908, + "train_tokens_per_second": 2293.294 + }, + { + "epoch": 0.36709362207843654, + "grad_norm": 0.416015625, + "learning_rate": 0.0004950313465089464, + "loss": 2.7848, + "num_input_tokens_seen": 91095040, + "step": 1390, + "train_runtime": 39645.6075, + "train_tokens_per_second": 2297.733 + }, + { + "epoch": 0.36973458338835336, + "grad_norm": 0.3828125, + "learning_rate": 0.0004949247451699468, + "loss": 2.7894, + "num_input_tokens_seen": 91750400, + "step": 1400, + "train_runtime": 39855.598, + "train_tokens_per_second": 2302.071 + }, + { + "epoch": 0.37237554469827017, + "grad_norm": 0.412109375, + "learning_rate": 0.0004948170241061721, + "loss": 2.8428, + "num_input_tokens_seen": 92405760, + "step": 1410, + "train_runtime": 40065.6311, + "train_tokens_per_second": 2306.36 + }, + { + "epoch": 0.375016506008187, + "grad_norm": 0.361328125, + "learning_rate": 0.0004947081838100916, + "loss": 2.7368, + "num_input_tokens_seen": 93061120, + "step": 1420, + "train_runtime": 40275.3038, + "train_tokens_per_second": 2310.625 + }, + { + "epoch": 0.3776574673181038, + "grad_norm": 0.40625, + "learning_rate": 0.0004945982247792913, + "loss": 2.8151, + "num_input_tokens_seen": 93716480, + "step": 1430, + "train_runtime": 40484.8888, + "train_tokens_per_second": 2314.851 + }, + { + "epoch": 0.3802984286280206, + "grad_norm": 0.373046875, + "learning_rate": 0.0004944871475164722, + "loss": 2.7474, + "num_input_tokens_seen": 94371840, + "step": 1440, + "train_runtime": 40694.6037, + "train_tokens_per_second": 2319.026 + }, + { + "epoch": 0.3829393899379374, + "grad_norm": 0.412109375, + "learning_rate": 0.0004943749525294471, + "loss": 2.8309, + "num_input_tokens_seen": 95027200, + "step": 1450, + "train_runtime": 40904.0292, + "train_tokens_per_second": 2323.175 + }, + { + "epoch": 0.3855803512478542, + "grad_norm": 0.37109375, + "learning_rate": 0.0004942616403311391, + "loss": 2.7217, + "num_input_tokens_seen": 95682560, + "step": 1460, + "train_runtime": 41113.147, + "train_tokens_per_second": 2327.298 + }, + { + "epoch": 0.38822131255777104, + "grad_norm": 0.369140625, + "learning_rate": 0.0004941472114395784, + "loss": 2.7462, + "num_input_tokens_seen": 96337920, + "step": 1470, + "train_runtime": 41322.714, + "train_tokens_per_second": 2331.355 + }, + { + "epoch": 0.39086227386768785, + "grad_norm": 0.41796875, + "learning_rate": 0.0004940316663779008, + "loss": 2.8393, + "num_input_tokens_seen": 96993280, + "step": 1480, + "train_runtime": 41531.818, + "train_tokens_per_second": 2335.397 + }, + { + "epoch": 0.39350323517760466, + "grad_norm": 0.404296875, + "learning_rate": 0.0004939150056743446, + "loss": 2.8199, + "num_input_tokens_seen": 97648640, + "step": 1490, + "train_runtime": 41740.5354, + "train_tokens_per_second": 2339.42 + }, + { + "epoch": 0.39614419648752147, + "grad_norm": 0.466796875, + "learning_rate": 0.000493797229862249, + "loss": 2.8282, + "num_input_tokens_seen": 98304000, + "step": 1500, + "train_runtime": 41949.9329, + "train_tokens_per_second": 2343.365 + }, + { + "epoch": 0.3987851577974383, + "grad_norm": 0.384765625, + "learning_rate": 0.0004936783394800504, + "loss": 2.8253, + "num_input_tokens_seen": 98959360, + "step": 1510, + "train_runtime": 42165.4456, + "train_tokens_per_second": 2346.93 + }, + { + "epoch": 0.4014261191073551, + "grad_norm": 0.369140625, + "learning_rate": 0.0004935583350712812, + "loss": 2.7516, + "num_input_tokens_seen": 99614720, + "step": 1520, + "train_runtime": 42365.9689, + "train_tokens_per_second": 2351.291 + }, + { + "epoch": 0.4040670804172719, + "grad_norm": 0.365234375, + "learning_rate": 0.0004934372171845667, + "loss": 2.7645, + "num_input_tokens_seen": 100270080, + "step": 1530, + "train_runtime": 42572.3174, + "train_tokens_per_second": 2355.288 + }, + { + "epoch": 0.4067080417271887, + "grad_norm": 0.373046875, + "learning_rate": 0.0004933149863736228, + "loss": 2.7305, + "num_input_tokens_seen": 100925440, + "step": 1540, + "train_runtime": 42778.6279, + "train_tokens_per_second": 2359.249 + }, + { + "epoch": 0.40934900303710553, + "grad_norm": 0.361328125, + "learning_rate": 0.0004931916431972531, + "loss": 2.7933, + "num_input_tokens_seen": 101580800, + "step": 1550, + "train_runtime": 42985.0685, + "train_tokens_per_second": 2363.165 + }, + { + "epoch": 0.41198996434702234, + "grad_norm": 0.38671875, + "learning_rate": 0.0004930671882193468, + "loss": 2.7548, + "num_input_tokens_seen": 102236160, + "step": 1560, + "train_runtime": 43191.7226, + "train_tokens_per_second": 2367.031 + }, + { + "epoch": 0.41463092565693915, + "grad_norm": 0.43359375, + "learning_rate": 0.000492941622008876, + "loss": 2.7387, + "num_input_tokens_seen": 102891520, + "step": 1570, + "train_runtime": 43397.9618, + "train_tokens_per_second": 2370.884 + }, + { + "epoch": 0.4172718869668559, + "grad_norm": 0.37890625, + "learning_rate": 0.000492814945139893, + "loss": 2.7517, + "num_input_tokens_seen": 103546880, + "step": 1580, + "train_runtime": 43603.6755, + "train_tokens_per_second": 2374.728 + }, + { + "epoch": 0.4199128482767727, + "grad_norm": 0.369140625, + "learning_rate": 0.0004926871581915273, + "loss": 2.6632, + "num_input_tokens_seen": 104202240, + "step": 1590, + "train_runtime": 43810.0188, + "train_tokens_per_second": 2378.503 + }, + { + "epoch": 0.42255380958668953, + "grad_norm": 0.3984375, + "learning_rate": 0.000492558261747984, + "loss": 2.7857, + "num_input_tokens_seen": 104857600, + "step": 1600, + "train_runtime": 44016.6706, + "train_tokens_per_second": 2382.225 + }, + { + "epoch": 0.42519477089660634, + "grad_norm": 0.361328125, + "learning_rate": 0.00049242825639854, + "loss": 2.715, + "num_input_tokens_seen": 105512960, + "step": 1610, + "train_runtime": 44223.1041, + "train_tokens_per_second": 2385.924 + }, + { + "epoch": 0.42783573220652316, + "grad_norm": 0.37890625, + "learning_rate": 0.0004922971427375422, + "loss": 2.7376, + "num_input_tokens_seen": 106168320, + "step": 1620, + "train_runtime": 44429.7495, + "train_tokens_per_second": 2389.577 + }, + { + "epoch": 0.43047669351643997, + "grad_norm": 0.37890625, + "learning_rate": 0.0004921649213644041, + "loss": 2.7056, + "num_input_tokens_seen": 106823680, + "step": 1630, + "train_runtime": 44636.392, + "train_tokens_per_second": 2393.197 + }, + { + "epoch": 0.4331176548263568, + "grad_norm": 0.35546875, + "learning_rate": 0.0004920315928836032, + "loss": 2.749, + "num_input_tokens_seen": 107479040, + "step": 1640, + "train_runtime": 44843.2607, + "train_tokens_per_second": 2396.771 + }, + { + "epoch": 0.4357586161362736, + "grad_norm": 0.34765625, + "learning_rate": 0.0004918971579046788, + "loss": 2.7726, + "num_input_tokens_seen": 108134400, + "step": 1650, + "train_runtime": 45049.4992, + "train_tokens_per_second": 2400.346 + }, + { + "epoch": 0.4383995774461904, + "grad_norm": 0.345703125, + "learning_rate": 0.0004917616170422286, + "loss": 2.7144, + "num_input_tokens_seen": 108789760, + "step": 1660, + "train_runtime": 45256.0425, + "train_tokens_per_second": 2403.873 + }, + { + "epoch": 0.4410405387561072, + "grad_norm": 0.39453125, + "learning_rate": 0.0004916249709159057, + "loss": 2.7613, + "num_input_tokens_seen": 109445120, + "step": 1670, + "train_runtime": 45463.2858, + "train_tokens_per_second": 2407.33 + }, + { + "epoch": 0.443681500066024, + "grad_norm": 0.44140625, + "learning_rate": 0.0004914872201504169, + "loss": 2.7181, + "num_input_tokens_seen": 110100480, + "step": 1680, + "train_runtime": 45669.7593, + "train_tokens_per_second": 2410.796 + }, + { + "epoch": 0.44632246137594084, + "grad_norm": 0.35546875, + "learning_rate": 0.0004913483653755184, + "loss": 2.7729, + "num_input_tokens_seen": 110755840, + "step": 1690, + "train_runtime": 45876.0618, + "train_tokens_per_second": 2414.24 + }, + { + "epoch": 0.44896342268585765, + "grad_norm": 0.353515625, + "learning_rate": 0.0004912084072260141, + "loss": 2.6527, + "num_input_tokens_seen": 111411200, + "step": 1700, + "train_runtime": 46082.1107, + "train_tokens_per_second": 2417.667 + }, + { + "epoch": 0.45160438399577446, + "grad_norm": 0.33984375, + "learning_rate": 0.0004910673463417519, + "loss": 2.6773, + "num_input_tokens_seen": 112066560, + "step": 1710, + "train_runtime": 46288.4468, + "train_tokens_per_second": 2421.048 + }, + { + "epoch": 0.4542453453056913, + "grad_norm": 0.412109375, + "learning_rate": 0.0004909251833676211, + "loss": 2.6628, + "num_input_tokens_seen": 112721920, + "step": 1720, + "train_runtime": 46494.9697, + "train_tokens_per_second": 2424.39 + }, + { + "epoch": 0.4568863066156081, + "grad_norm": 0.353515625, + "learning_rate": 0.0004907819189535496, + "loss": 2.7297, + "num_input_tokens_seen": 113377280, + "step": 1730, + "train_runtime": 46700.8195, + "train_tokens_per_second": 2427.736 + }, + { + "epoch": 0.4595272679255249, + "grad_norm": 0.359375, + "learning_rate": 0.0004906375537545006, + "loss": 2.7085, + "num_input_tokens_seen": 114032640, + "step": 1740, + "train_runtime": 46907.3536, + "train_tokens_per_second": 2431.018 + }, + { + "epoch": 0.4621682292354417, + "grad_norm": 0.349609375, + "learning_rate": 0.00049049208843047, + "loss": 2.6987, + "num_input_tokens_seen": 114688000, + "step": 1750, + "train_runtime": 47113.9027, + "train_tokens_per_second": 2434.271 + }, + { + "epoch": 0.4648091905453585, + "grad_norm": 0.3515625, + "learning_rate": 0.0004903455236464828, + "loss": 2.7408, + "num_input_tokens_seen": 115343360, + "step": 1760, + "train_runtime": 47320.3422, + "train_tokens_per_second": 2437.501 + }, + { + "epoch": 0.46745015185527533, + "grad_norm": 0.40625, + "learning_rate": 0.0004901978600725909, + "loss": 2.6906, + "num_input_tokens_seen": 115998720, + "step": 1770, + "train_runtime": 47526.7886, + "train_tokens_per_second": 2440.702 + }, + { + "epoch": 0.47009111316519214, + "grad_norm": 0.345703125, + "learning_rate": 0.000490049098383869, + "loss": 2.7082, + "num_input_tokens_seen": 116654080, + "step": 1780, + "train_runtime": 47733.0043, + "train_tokens_per_second": 2443.887 + }, + { + "epoch": 0.47273207447510895, + "grad_norm": 0.40234375, + "learning_rate": 0.0004898992392604124, + "loss": 2.751, + "num_input_tokens_seen": 117309440, + "step": 1790, + "train_runtime": 47939.0501, + "train_tokens_per_second": 2447.054 + }, + { + "epoch": 0.47537303578502577, + "grad_norm": 0.35546875, + "learning_rate": 0.0004897482833873334, + "loss": 2.687, + "num_input_tokens_seen": 117964800, + "step": 1800, + "train_runtime": 48145.2663, + "train_tokens_per_second": 2450.185 + }, + { + "epoch": 0.4780139970949426, + "grad_norm": 0.34375, + "learning_rate": 0.0004895962314547585, + "loss": 2.7141, + "num_input_tokens_seen": 118620160, + "step": 1810, + "train_runtime": 48351.9308, + "train_tokens_per_second": 2453.266 + }, + { + "epoch": 0.4806549584048594, + "grad_norm": 0.44921875, + "learning_rate": 0.0004894430841578249, + "loss": 2.7107, + "num_input_tokens_seen": 119275520, + "step": 1820, + "train_runtime": 48558.7752, + "train_tokens_per_second": 2456.312 + }, + { + "epoch": 0.4832959197147762, + "grad_norm": 0.390625, + "learning_rate": 0.0004892888421966776, + "loss": 2.5919, + "num_input_tokens_seen": 119930880, + "step": 1830, + "train_runtime": 48765.4151, + "train_tokens_per_second": 2459.343 + }, + { + "epoch": 0.485936881024693, + "grad_norm": 0.392578125, + "learning_rate": 0.000489133506276466, + "loss": 2.6917, + "num_input_tokens_seen": 120586240, + "step": 1840, + "train_runtime": 48972.0677, + "train_tokens_per_second": 2462.347 + }, + { + "epoch": 0.4885778423346098, + "grad_norm": 0.384765625, + "learning_rate": 0.0004889770771073407, + "loss": 2.7011, + "num_input_tokens_seen": 121241600, + "step": 1850, + "train_runtime": 49178.7023, + "train_tokens_per_second": 2465.327 + }, + { + "epoch": 0.4912188036445266, + "grad_norm": 0.365234375, + "learning_rate": 0.0004888195554044507, + "loss": 2.6365, + "num_input_tokens_seen": 121896960, + "step": 1860, + "train_runtime": 49385.3685, + "train_tokens_per_second": 2468.281 + }, + { + "epoch": 0.4938597649544434, + "grad_norm": 0.36328125, + "learning_rate": 0.0004886609418879391, + "loss": 2.7245, + "num_input_tokens_seen": 122552320, + "step": 1870, + "train_runtime": 49591.7934, + "train_tokens_per_second": 2471.222 + }, + { + "epoch": 0.4965007262643602, + "grad_norm": 0.39453125, + "learning_rate": 0.000488501237282941, + "loss": 2.7149, + "num_input_tokens_seen": 123207680, + "step": 1880, + "train_runtime": 49798.3338, + "train_tokens_per_second": 2474.133 + }, + { + "epoch": 0.499141687574277, + "grad_norm": 0.3359375, + "learning_rate": 0.0004883404423195795, + "loss": 2.6651, + "num_input_tokens_seen": 123863040, + "step": 1890, + "train_runtime": 50004.4658, + "train_tokens_per_second": 2477.04 + }, + { + "epoch": 0.5017826488841939, + "grad_norm": 0.34765625, + "learning_rate": 0.0004881785577329624, + "loss": 2.7421, + "num_input_tokens_seen": 124518400, + "step": 1900, + "train_runtime": 50211.1109, + "train_tokens_per_second": 2479.897 + }, + { + "epoch": 0.5044236101941106, + "grad_norm": 0.349609375, + "learning_rate": 0.0004880155842631789, + "loss": 2.6493, + "num_input_tokens_seen": 125173760, + "step": 1910, + "train_runtime": 50417.5524, + "train_tokens_per_second": 2482.742 + }, + { + "epoch": 0.5070645715040275, + "grad_norm": 0.34765625, + "learning_rate": 0.00048785152265529657, + "loss": 2.702, + "num_input_tokens_seen": 125829120, + "step": 1920, + "train_runtime": 50624.4207, + "train_tokens_per_second": 2485.542 + }, + { + "epoch": 0.5097055328139443, + "grad_norm": 0.328125, + "learning_rate": 0.0004876863736593572, + "loss": 2.658, + "num_input_tokens_seen": 126484480, + "step": 1930, + "train_runtime": 50831.0628, + "train_tokens_per_second": 2488.33 + }, + { + "epoch": 0.5123464941238611, + "grad_norm": 0.333984375, + "learning_rate": 0.0004875201380303742, + "loss": 2.6901, + "num_input_tokens_seen": 127139840, + "step": 1940, + "train_runtime": 51037.2011, + "train_tokens_per_second": 2491.121 + }, + { + "epoch": 0.5149874554337779, + "grad_norm": 0.3671875, + "learning_rate": 0.00048735281652832844, + "loss": 2.6371, + "num_input_tokens_seen": 127795200, + "step": 1950, + "train_runtime": 51243.725, + "train_tokens_per_second": 2493.87 + }, + { + "epoch": 0.5176284167436948, + "grad_norm": 0.376953125, + "learning_rate": 0.00048718440991816516, + "loss": 2.7281, + "num_input_tokens_seen": 128450560, + "step": 1960, + "train_runtime": 51450.1688, + "train_tokens_per_second": 2496.601 + }, + { + "epoch": 0.5202693780536115, + "grad_norm": 0.33984375, + "learning_rate": 0.0004870149189697906, + "loss": 2.6286, + "num_input_tokens_seen": 129105920, + "step": 1970, + "train_runtime": 51656.6086, + "train_tokens_per_second": 2499.311 + }, + { + "epoch": 0.5229103393635284, + "grad_norm": 0.361328125, + "learning_rate": 0.0004868443444580681, + "loss": 2.6763, + "num_input_tokens_seen": 129761280, + "step": 1980, + "train_runtime": 51863.4736, + "train_tokens_per_second": 2501.978 + }, + { + "epoch": 0.5255513006734451, + "grad_norm": 0.3515625, + "learning_rate": 0.0004866726871628147, + "loss": 2.6689, + "num_input_tokens_seen": 130416640, + "step": 1990, + "train_runtime": 52068.9916, + "train_tokens_per_second": 2504.689 + }, + { + "epoch": 0.528192261983362, + "grad_norm": 0.314453125, + "learning_rate": 0.00048649994786879777, + "loss": 2.7113, + "num_input_tokens_seen": 131072000, + "step": 2000, + "train_runtime": 52276.5627, + "train_tokens_per_second": 2507.28 + }, + { + "epoch": 0.5308332232932788, + "grad_norm": 0.359375, + "learning_rate": 0.0004863261273657311, + "loss": 2.7491, + "num_input_tokens_seen": 131727360, + "step": 2010, + "train_runtime": 52495.0703, + "train_tokens_per_second": 2509.328 + }, + { + "epoch": 0.5334741846031955, + "grad_norm": 0.337890625, + "learning_rate": 0.0004861512264482716, + "loss": 2.6846, + "num_input_tokens_seen": 132382720, + "step": 2020, + "train_runtime": 52696.2556, + "train_tokens_per_second": 2512.185 + }, + { + "epoch": 0.5361151459131124, + "grad_norm": 0.3515625, + "learning_rate": 0.0004859752459160154, + "loss": 2.7133, + "num_input_tokens_seen": 133038080, + "step": 2030, + "train_runtime": 52897.1759, + "train_tokens_per_second": 2515.032 + }, + { + "epoch": 0.5387561072230291, + "grad_norm": 0.326171875, + "learning_rate": 0.0004857981865734943, + "loss": 2.7025, + "num_input_tokens_seen": 133693440, + "step": 2040, + "train_runtime": 53098.6928, + "train_tokens_per_second": 2517.829 + }, + { + "epoch": 0.541397068532946, + "grad_norm": 0.32421875, + "learning_rate": 0.0004856200492301723, + "loss": 2.7283, + "num_input_tokens_seen": 134348800, + "step": 2050, + "train_runtime": 53300.3491, + "train_tokens_per_second": 2520.599 + }, + { + "epoch": 0.5440380298428628, + "grad_norm": 0.34375, + "learning_rate": 0.00048544083470044154, + "loss": 2.6277, + "num_input_tokens_seen": 135004160, + "step": 2060, + "train_runtime": 53506.3955, + "train_tokens_per_second": 2523.141 + }, + { + "epoch": 0.5466789911527796, + "grad_norm": 0.34375, + "learning_rate": 0.0004852605438036187, + "loss": 2.6809, + "num_input_tokens_seen": 135659520, + "step": 2070, + "train_runtime": 53713.2347, + "train_tokens_per_second": 2525.626 + }, + { + "epoch": 0.5493199524626964, + "grad_norm": 0.3828125, + "learning_rate": 0.00048507917736394154, + "loss": 2.7295, + "num_input_tokens_seen": 136314880, + "step": 2080, + "train_runtime": 53919.8743, + "train_tokens_per_second": 2528.101 + }, + { + "epoch": 0.5519609137726132, + "grad_norm": 0.3671875, + "learning_rate": 0.0004848967362105646, + "loss": 2.7353, + "num_input_tokens_seen": 136970240, + "step": 2090, + "train_runtime": 54126.737, + "train_tokens_per_second": 2530.547 + }, + { + "epoch": 0.55460187508253, + "grad_norm": 0.33203125, + "learning_rate": 0.00048471322117755577, + "loss": 2.5811, + "num_input_tokens_seen": 137625600, + "step": 2100, + "train_runtime": 54333.5849, + "train_tokens_per_second": 2532.975 + }, + { + "epoch": 0.5572428363924469, + "grad_norm": 0.318359375, + "learning_rate": 0.0004845286331038927, + "loss": 2.6682, + "num_input_tokens_seen": 138280960, + "step": 2110, + "train_runtime": 54600.3269, + "train_tokens_per_second": 2532.603 + }, + { + "epoch": 0.5598837977023636, + "grad_norm": 0.341796875, + "learning_rate": 0.0004843429728334582, + "loss": 2.6917, + "num_input_tokens_seen": 138936320, + "step": 2120, + "train_runtime": 54898.5911, + "train_tokens_per_second": 2530.781 + }, + { + "epoch": 0.5625247590122805, + "grad_norm": 0.333984375, + "learning_rate": 0.0004841562412150372, + "loss": 2.6355, + "num_input_tokens_seen": 139591680, + "step": 2130, + "train_runtime": 55206.542, + "train_tokens_per_second": 2528.535 + }, + { + "epoch": 0.5651657203221973, + "grad_norm": 0.32421875, + "learning_rate": 0.0004839684391023124, + "loss": 2.6901, + "num_input_tokens_seen": 140247040, + "step": 2140, + "train_runtime": 55513.4285, + "train_tokens_per_second": 2526.362 + }, + { + "epoch": 0.5678066816321141, + "grad_norm": 0.34375, + "learning_rate": 0.00048377956735386044, + "loss": 2.7062, + "num_input_tokens_seen": 140902400, + "step": 2150, + "train_runtime": 55825.6226, + "train_tokens_per_second": 2523.974 + }, + { + "epoch": 0.5704476429420309, + "grad_norm": 0.357421875, + "learning_rate": 0.00048358962683314803, + "loss": 2.666, + "num_input_tokens_seen": 141557760, + "step": 2160, + "train_runtime": 56107.6676, + "train_tokens_per_second": 2522.966 + }, + { + "epoch": 0.5730886042519477, + "grad_norm": 0.33984375, + "learning_rate": 0.0004833986184085283, + "loss": 2.6364, + "num_input_tokens_seen": 142213120, + "step": 2170, + "train_runtime": 56384.4245, + "train_tokens_per_second": 2522.206 + }, + { + "epoch": 0.5757295655618645, + "grad_norm": 0.326171875, + "learning_rate": 0.00048320654295323594, + "loss": 2.6361, + "num_input_tokens_seen": 142868480, + "step": 2180, + "train_runtime": 56627.8315, + "train_tokens_per_second": 2522.938 + }, + { + "epoch": 0.5783705268717814, + "grad_norm": 0.35546875, + "learning_rate": 0.0004830134013453844, + "loss": 2.6673, + "num_input_tokens_seen": 143523840, + "step": 2190, + "train_runtime": 56936.1007, + "train_tokens_per_second": 2520.788 + }, + { + "epoch": 0.5810114881816981, + "grad_norm": 0.337890625, + "learning_rate": 0.00048281919446796083, + "loss": 2.6553, + "num_input_tokens_seen": 144179200, + "step": 2200, + "train_runtime": 57250.9944, + "train_tokens_per_second": 2518.37 + }, + { + "epoch": 0.583652449491615, + "grad_norm": 0.3203125, + "learning_rate": 0.00048262392320882276, + "loss": 2.6588, + "num_input_tokens_seen": 144834560, + "step": 2210, + "train_runtime": 57548.402, + "train_tokens_per_second": 2516.743 + }, + { + "epoch": 0.5862934108015317, + "grad_norm": 0.357421875, + "learning_rate": 0.0004824275884606936, + "loss": 2.635, + "num_input_tokens_seen": 145489920, + "step": 2220, + "train_runtime": 57792.8878, + "train_tokens_per_second": 2517.436 + }, + { + "epoch": 0.5889343721114486, + "grad_norm": 0.396484375, + "learning_rate": 0.0004822301911211587, + "loss": 2.6666, + "num_input_tokens_seen": 146145280, + "step": 2230, + "train_runtime": 57950.0287, + "train_tokens_per_second": 2521.919 + }, + { + "epoch": 0.5915753334213654, + "grad_norm": 0.357421875, + "learning_rate": 0.0004820317320926615, + "loss": 2.7133, + "num_input_tokens_seen": 146800640, + "step": 2240, + "train_runtime": 58070.5268, + "train_tokens_per_second": 2527.972 + }, + { + "epoch": 0.5942162947312822, + "grad_norm": 0.33984375, + "learning_rate": 0.00048183221228249883, + "loss": 2.5816, + "num_input_tokens_seen": 147456000, + "step": 2250, + "train_runtime": 58181.0426, + "train_tokens_per_second": 2534.434 + }, + { + "epoch": 0.596857256041199, + "grad_norm": 0.3984375, + "learning_rate": 0.0004816316326028175, + "loss": 2.6768, + "num_input_tokens_seen": 148111360, + "step": 2260, + "train_runtime": 58304.3706, + "train_tokens_per_second": 2540.313 + }, + { + "epoch": 0.5994982173511159, + "grad_norm": 0.41796875, + "learning_rate": 0.0004814299939706094, + "loss": 2.6189, + "num_input_tokens_seen": 148766720, + "step": 2270, + "train_runtime": 58471.2938, + "train_tokens_per_second": 2544.269 + }, + { + "epoch": 0.6021391786610326, + "grad_norm": 0.328125, + "learning_rate": 0.0004812272973077079, + "loss": 2.6704, + "num_input_tokens_seen": 149422080, + "step": 2280, + "train_runtime": 58622.6079, + "train_tokens_per_second": 2548.881 + }, + { + "epoch": 0.6047801399709494, + "grad_norm": 0.3359375, + "learning_rate": 0.00048102354354078304, + "loss": 2.6815, + "num_input_tokens_seen": 150077440, + "step": 2290, + "train_runtime": 58738.1173, + "train_tokens_per_second": 2555.026 + }, + { + "epoch": 0.6074211012808662, + "grad_norm": 0.341796875, + "learning_rate": 0.000480818733601338, + "loss": 2.6316, + "num_input_tokens_seen": 150732800, + "step": 2300, + "train_runtime": 58855.0518, + "train_tokens_per_second": 2561.085 + }, + { + "epoch": 0.610062062590783, + "grad_norm": 0.31640625, + "learning_rate": 0.00048061286842570423, + "loss": 2.7123, + "num_input_tokens_seen": 151388160, + "step": 2310, + "train_runtime": 58974.3368, + "train_tokens_per_second": 2567.018 + }, + { + "epoch": 0.6127030239006999, + "grad_norm": 0.36328125, + "learning_rate": 0.0004804059489550376, + "loss": 2.6427, + "num_input_tokens_seen": 152043520, + "step": 2320, + "train_runtime": 59090.3855, + "train_tokens_per_second": 2573.067 + }, + { + "epoch": 0.6153439852106166, + "grad_norm": 0.330078125, + "learning_rate": 0.0004801979761353137, + "loss": 2.6757, + "num_input_tokens_seen": 152698880, + "step": 2330, + "train_runtime": 59206.2037, + "train_tokens_per_second": 2579.103 + }, + { + "epoch": 0.6179849465205335, + "grad_norm": 0.322265625, + "learning_rate": 0.000479988950917324, + "loss": 2.5902, + "num_input_tokens_seen": 153354240, + "step": 2340, + "train_runtime": 59321.7353, + "train_tokens_per_second": 2585.127 + }, + { + "epoch": 0.6206259078304502, + "grad_norm": 0.427734375, + "learning_rate": 0.0004797788742566709, + "loss": 2.704, + "num_input_tokens_seen": 154009600, + "step": 2350, + "train_runtime": 59436.8491, + "train_tokens_per_second": 2591.147 + }, + { + "epoch": 0.6232668691403671, + "grad_norm": 0.31640625, + "learning_rate": 0.00047956774711376395, + "loss": 2.5874, + "num_input_tokens_seen": 154664960, + "step": 2360, + "train_runtime": 59552.7183, + "train_tokens_per_second": 2597.11 + }, + { + "epoch": 0.6259078304502839, + "grad_norm": 0.31640625, + "learning_rate": 0.00047935557045381504, + "loss": 2.516, + "num_input_tokens_seen": 155320320, + "step": 2370, + "train_runtime": 59668.5711, + "train_tokens_per_second": 2603.051 + }, + { + "epoch": 0.6285487917602007, + "grad_norm": 0.37890625, + "learning_rate": 0.0004791423452468344, + "loss": 2.6803, + "num_input_tokens_seen": 155975680, + "step": 2380, + "train_runtime": 59784.5325, + "train_tokens_per_second": 2608.964 + }, + { + "epoch": 0.6311897530701175, + "grad_norm": 0.337890625, + "learning_rate": 0.0004789280724676255, + "loss": 2.5819, + "num_input_tokens_seen": 156631040, + "step": 2390, + "train_runtime": 59900.5071, + "train_tokens_per_second": 2614.853 + }, + { + "epoch": 0.6338307143800344, + "grad_norm": 0.330078125, + "learning_rate": 0.0004787127530957812, + "loss": 2.7263, + "num_input_tokens_seen": 157286400, + "step": 2400, + "train_runtime": 60023.3878, + "train_tokens_per_second": 2620.419 + }, + { + "epoch": 0.6364716756899511, + "grad_norm": 0.33203125, + "learning_rate": 0.00047849638811567943, + "loss": 2.6114, + "num_input_tokens_seen": 157941760, + "step": 2410, + "train_runtime": 60142.469, + "train_tokens_per_second": 2626.127 + }, + { + "epoch": 0.639112636999868, + "grad_norm": 0.32421875, + "learning_rate": 0.0004782789785164776, + "loss": 2.6163, + "num_input_tokens_seen": 158597120, + "step": 2420, + "train_runtime": 60260.8341, + "train_tokens_per_second": 2631.844 + }, + { + "epoch": 0.6417535983097847, + "grad_norm": 0.3515625, + "learning_rate": 0.00047806052529210966, + "loss": 2.5355, + "num_input_tokens_seen": 159252480, + "step": 2430, + "train_runtime": 60382.3435, + "train_tokens_per_second": 2637.401 + }, + { + "epoch": 0.6443945596197016, + "grad_norm": 0.326171875, + "learning_rate": 0.00047784102944127993, + "loss": 2.6617, + "num_input_tokens_seen": 159907840, + "step": 2440, + "train_runtime": 60503.0008, + "train_tokens_per_second": 2642.974 + }, + { + "epoch": 0.6470355209296184, + "grad_norm": 0.328125, + "learning_rate": 0.0004776204919674598, + "loss": 2.5774, + "num_input_tokens_seen": 160563200, + "step": 2450, + "train_runtime": 60626.6115, + "train_tokens_per_second": 2648.395 + }, + { + "epoch": 0.6496764822395352, + "grad_norm": 0.33203125, + "learning_rate": 0.0004773989138788826, + "loss": 2.6197, + "num_input_tokens_seen": 161218560, + "step": 2460, + "train_runtime": 60749.4747, + "train_tokens_per_second": 2653.826 + }, + { + "epoch": 0.652317443549452, + "grad_norm": 0.33203125, + "learning_rate": 0.00047717629618853886, + "loss": 2.5963, + "num_input_tokens_seen": 161873920, + "step": 2470, + "train_runtime": 60874.9324, + "train_tokens_per_second": 2659.123 + }, + { + "epoch": 0.6549584048593688, + "grad_norm": 0.34375, + "learning_rate": 0.0004769526399141721, + "loss": 2.6724, + "num_input_tokens_seen": 162529280, + "step": 2480, + "train_runtime": 60999.0162, + "train_tokens_per_second": 2664.457 + }, + { + "epoch": 0.6575993661692856, + "grad_norm": 0.34375, + "learning_rate": 0.0004767279460782737, + "loss": 2.5581, + "num_input_tokens_seen": 163184640, + "step": 2490, + "train_runtime": 61120.3538, + "train_tokens_per_second": 2669.89 + }, + { + "epoch": 0.6602403274792025, + "grad_norm": 0.314453125, + "learning_rate": 0.00047650221570807864, + "loss": 2.6499, + "num_input_tokens_seen": 163840000, + "step": 2500, + "train_runtime": 61245.0835, + "train_tokens_per_second": 2675.154 + }, + { + "epoch": 0.6628812887891192, + "grad_norm": 0.357421875, + "learning_rate": 0.0004762754498355606, + "loss": 2.6674, + "num_input_tokens_seen": 164495360, + "step": 2510, + "train_runtime": 61387.9006, + "train_tokens_per_second": 2679.606 + }, + { + "epoch": 0.6655222500990361, + "grad_norm": 0.32421875, + "learning_rate": 0.0004760476494974273, + "loss": 2.6745, + "num_input_tokens_seen": 165150720, + "step": 2520, + "train_runtime": 61511.164, + "train_tokens_per_second": 2684.89 + }, + { + "epoch": 0.6681632114089529, + "grad_norm": 0.30859375, + "learning_rate": 0.00047581881573511566, + "loss": 2.6319, + "num_input_tokens_seen": 165806080, + "step": 2530, + "train_runtime": 61635.1824, + "train_tokens_per_second": 2690.121 + }, + { + "epoch": 0.6708041727188697, + "grad_norm": 0.349609375, + "learning_rate": 0.0004755889495947872, + "loss": 2.6683, + "num_input_tokens_seen": 166461440, + "step": 2540, + "train_runtime": 61759.2578, + "train_tokens_per_second": 2695.328 + }, + { + "epoch": 0.6734451340287865, + "grad_norm": 0.318359375, + "learning_rate": 0.00047535805212732296, + "loss": 2.5637, + "num_input_tokens_seen": 167116800, + "step": 2550, + "train_runtime": 61882.2643, + "train_tokens_per_second": 2700.561 + }, + { + "epoch": 0.6760860953387033, + "grad_norm": 0.330078125, + "learning_rate": 0.00047512612438831934, + "loss": 2.6466, + "num_input_tokens_seen": 167772160, + "step": 2560, + "train_runtime": 62004.6058, + "train_tokens_per_second": 2705.802 + }, + { + "epoch": 0.6787270566486201, + "grad_norm": 0.326171875, + "learning_rate": 0.00047489316743808244, + "loss": 2.5368, + "num_input_tokens_seen": 168427520, + "step": 2570, + "train_runtime": 62126.926, + "train_tokens_per_second": 2711.023 + }, + { + "epoch": 0.6813680179585369, + "grad_norm": 0.33984375, + "learning_rate": 0.0004746591823416236, + "loss": 2.6437, + "num_input_tokens_seen": 169082880, + "step": 2580, + "train_runtime": 62250.0881, + "train_tokens_per_second": 2716.187 + }, + { + "epoch": 0.6840089792684537, + "grad_norm": 0.3359375, + "learning_rate": 0.0004744241701686551, + "loss": 2.5841, + "num_input_tokens_seen": 169738240, + "step": 2590, + "train_runtime": 62371.1278, + "train_tokens_per_second": 2721.423 + }, + { + "epoch": 0.6866499405783705, + "grad_norm": 0.34765625, + "learning_rate": 0.00047418813199358393, + "loss": 2.5932, + "num_input_tokens_seen": 170393600, + "step": 2600, + "train_runtime": 62492.6048, + "train_tokens_per_second": 2726.62 + }, + { + "epoch": 0.6892909018882873, + "grad_norm": 0.349609375, + "learning_rate": 0.0004739510688955082, + "loss": 2.6544, + "num_input_tokens_seen": 171048960, + "step": 2610, + "train_runtime": 62613.8962, + "train_tokens_per_second": 2731.805 + }, + { + "epoch": 0.6919318631982041, + "grad_norm": 0.314453125, + "learning_rate": 0.0004737129819582116, + "loss": 2.6383, + "num_input_tokens_seen": 171704320, + "step": 2620, + "train_runtime": 62734.3623, + "train_tokens_per_second": 2737.006 + }, + { + "epoch": 0.694572824508121, + "grad_norm": 0.314453125, + "learning_rate": 0.0004734738722701583, + "loss": 2.6641, + "num_input_tokens_seen": 172359680, + "step": 2630, + "train_runtime": 62851.6057, + "train_tokens_per_second": 2742.327 + }, + { + "epoch": 0.6972137858180377, + "grad_norm": 0.326171875, + "learning_rate": 0.00047323374092448836, + "loss": 2.6349, + "num_input_tokens_seen": 173015040, + "step": 2640, + "train_runtime": 62971.5987, + "train_tokens_per_second": 2747.509 + }, + { + "epoch": 0.6998547471279546, + "grad_norm": 0.330078125, + "learning_rate": 0.00047299258901901253, + "loss": 2.6536, + "num_input_tokens_seen": 173670400, + "step": 2650, + "train_runtime": 63090.7199, + "train_tokens_per_second": 2752.709 + }, + { + "epoch": 0.7024957084378713, + "grad_norm": 0.30859375, + "learning_rate": 0.0004727504176562073, + "loss": 2.5464, + "num_input_tokens_seen": 174325760, + "step": 2660, + "train_runtime": 63209.2983, + "train_tokens_per_second": 2757.913 + }, + { + "epoch": 0.7051366697477882, + "grad_norm": 0.32421875, + "learning_rate": 0.00047250722794320977, + "loss": 2.5616, + "num_input_tokens_seen": 174981120, + "step": 2670, + "train_runtime": 63328.2606, + "train_tokens_per_second": 2763.081 + }, + { + "epoch": 0.707777631057705, + "grad_norm": 0.310546875, + "learning_rate": 0.0004722630209918126, + "loss": 2.6033, + "num_input_tokens_seen": 175636480, + "step": 2680, + "train_runtime": 63447.7327, + "train_tokens_per_second": 2768.207 + }, + { + "epoch": 0.7104185923676218, + "grad_norm": 0.3203125, + "learning_rate": 0.000472017797918459, + "loss": 2.4887, + "num_input_tokens_seen": 176291840, + "step": 2690, + "train_runtime": 63567.9084, + "train_tokens_per_second": 2773.284 + }, + { + "epoch": 0.7130595536775386, + "grad_norm": 0.318359375, + "learning_rate": 0.00047177155984423776, + "loss": 2.6174, + "num_input_tokens_seen": 176947200, + "step": 2700, + "train_runtime": 63686.5563, + "train_tokens_per_second": 2778.407 + }, + { + "epoch": 0.7157005149874555, + "grad_norm": 0.353515625, + "learning_rate": 0.00047152430789487764, + "loss": 2.6112, + "num_input_tokens_seen": 177602560, + "step": 2710, + "train_runtime": 63807.847, + "train_tokens_per_second": 2783.397 + }, + { + "epoch": 0.7183414762973722, + "grad_norm": 0.314453125, + "learning_rate": 0.00047127604320074286, + "loss": 2.5791, + "num_input_tokens_seen": 178257920, + "step": 2720, + "train_runtime": 63924.7415, + "train_tokens_per_second": 2788.559 + }, + { + "epoch": 0.7209824376072891, + "grad_norm": 0.3125, + "learning_rate": 0.00047102676689682733, + "loss": 2.5559, + "num_input_tokens_seen": 178913280, + "step": 2730, + "train_runtime": 64045.3328, + "train_tokens_per_second": 2793.541 + }, + { + "epoch": 0.7236233989172058, + "grad_norm": 0.30859375, + "learning_rate": 0.00047077648012275005, + "loss": 2.5332, + "num_input_tokens_seen": 179568640, + "step": 2740, + "train_runtime": 64163.5594, + "train_tokens_per_second": 2798.608 + }, + { + "epoch": 0.7262643602271227, + "grad_norm": 0.3046875, + "learning_rate": 0.00047052518402274936, + "loss": 2.582, + "num_input_tokens_seen": 180224000, + "step": 2750, + "train_runtime": 64280.2933, + "train_tokens_per_second": 2803.721 + }, + { + "epoch": 0.7289053215370395, + "grad_norm": 0.328125, + "learning_rate": 0.0004702728797456779, + "loss": 2.6733, + "num_input_tokens_seen": 180879360, + "step": 2760, + "train_runtime": 64397.0611, + "train_tokens_per_second": 2808.814 + }, + { + "epoch": 0.7315462828469563, + "grad_norm": 0.318359375, + "learning_rate": 0.00047001956844499774, + "loss": 2.566, + "num_input_tokens_seen": 181534720, + "step": 2770, + "train_runtime": 64514.3091, + "train_tokens_per_second": 2813.868 + }, + { + "epoch": 0.7341872441568731, + "grad_norm": 0.333984375, + "learning_rate": 0.00046976525127877434, + "loss": 2.5983, + "num_input_tokens_seen": 182190080, + "step": 2780, + "train_runtime": 64637.4081, + "train_tokens_per_second": 2818.648 + }, + { + "epoch": 0.73682820546679, + "grad_norm": 0.318359375, + "learning_rate": 0.00046950992940967206, + "loss": 2.6204, + "num_input_tokens_seen": 182845440, + "step": 2790, + "train_runtime": 64760.9482, + "train_tokens_per_second": 2823.39 + }, + { + "epoch": 0.7394691667767067, + "grad_norm": 0.35546875, + "learning_rate": 0.0004692536040049482, + "loss": 2.5219, + "num_input_tokens_seen": 183500800, + "step": 2800, + "train_runtime": 64885.4696, + "train_tokens_per_second": 2828.072 + }, + { + "epoch": 0.7421101280866236, + "grad_norm": 0.337890625, + "learning_rate": 0.00046899627623644817, + "loss": 2.4908, + "num_input_tokens_seen": 184156160, + "step": 2810, + "train_runtime": 65004.5941, + "train_tokens_per_second": 2832.971 + }, + { + "epoch": 0.7447510893965403, + "grad_norm": 0.357421875, + "learning_rate": 0.0004687379472805996, + "loss": 2.6228, + "num_input_tokens_seen": 184811520, + "step": 2820, + "train_runtime": 65122.5778, + "train_tokens_per_second": 2837.902 + }, + { + "epoch": 0.7473920507064572, + "grad_norm": 0.3125, + "learning_rate": 0.0004684786183184074, + "loss": 2.6018, + "num_input_tokens_seen": 185466880, + "step": 2830, + "train_runtime": 65239.4763, + "train_tokens_per_second": 2842.863 + }, + { + "epoch": 0.750033012016374, + "grad_norm": 0.326171875, + "learning_rate": 0.0004682182905354485, + "loss": 2.5533, + "num_input_tokens_seen": 186122240, + "step": 2840, + "train_runtime": 65361.0354, + "train_tokens_per_second": 2847.602 + }, + { + "epoch": 0.7526739733262908, + "grad_norm": 0.314453125, + "learning_rate": 0.0004679569651218657, + "loss": 2.5546, + "num_input_tokens_seen": 186777600, + "step": 2850, + "train_runtime": 65479.2513, + "train_tokens_per_second": 2852.47 + }, + { + "epoch": 0.7553149346362076, + "grad_norm": 0.306640625, + "learning_rate": 0.0004676946432723628, + "loss": 2.5145, + "num_input_tokens_seen": 187432960, + "step": 2860, + "train_runtime": 65597.8854, + "train_tokens_per_second": 2857.302 + }, + { + "epoch": 0.7579558959461243, + "grad_norm": 0.314453125, + "learning_rate": 0.00046743132618619923, + "loss": 2.5676, + "num_input_tokens_seen": 188088320, + "step": 2870, + "train_runtime": 65716.9951, + "train_tokens_per_second": 2862.096 + }, + { + "epoch": 0.7605968572560412, + "grad_norm": 0.361328125, + "learning_rate": 0.00046716701506718415, + "loss": 2.5907, + "num_input_tokens_seen": 188743680, + "step": 2880, + "train_runtime": 65840.4604, + "train_tokens_per_second": 2866.682 + }, + { + "epoch": 0.763237818565958, + "grad_norm": 0.31640625, + "learning_rate": 0.0004669017111236712, + "loss": 2.5965, + "num_input_tokens_seen": 189399040, + "step": 2890, + "train_runtime": 65957.414, + "train_tokens_per_second": 2871.535 + }, + { + "epoch": 0.7658787798758748, + "grad_norm": 0.3125, + "learning_rate": 0.00046663541556855295, + "loss": 2.5296, + "num_input_tokens_seen": 190054400, + "step": 2900, + "train_runtime": 66075.1886, + "train_tokens_per_second": 2876.335 + }, + { + "epoch": 0.7685197411857916, + "grad_norm": 0.30859375, + "learning_rate": 0.0004663681296192552, + "loss": 2.5497, + "num_input_tokens_seen": 190709760, + "step": 2910, + "train_runtime": 66192.3246, + "train_tokens_per_second": 2881.146 + }, + { + "epoch": 0.7711607024957084, + "grad_norm": 0.306640625, + "learning_rate": 0.00046609985449773183, + "loss": 2.5649, + "num_input_tokens_seen": 191365120, + "step": 2920, + "train_runtime": 66311.303, + "train_tokens_per_second": 2885.86 + }, + { + "epoch": 0.7738016638056252, + "grad_norm": 0.3046875, + "learning_rate": 0.00046583059143045857, + "loss": 2.5338, + "num_input_tokens_seen": 192020480, + "step": 2930, + "train_runtime": 66429.1291, + "train_tokens_per_second": 2890.607 + }, + { + "epoch": 0.7764426251155421, + "grad_norm": 0.302734375, + "learning_rate": 0.00046556034164842814, + "loss": 2.5703, + "num_input_tokens_seen": 192675840, + "step": 2940, + "train_runtime": 66546.2716, + "train_tokens_per_second": 2895.366 + }, + { + "epoch": 0.7790835864254588, + "grad_norm": 0.333984375, + "learning_rate": 0.00046528910638714386, + "loss": 2.6228, + "num_input_tokens_seen": 193331200, + "step": 2950, + "train_runtime": 66663.6787, + "train_tokens_per_second": 2900.098 + }, + { + "epoch": 0.7817245477353757, + "grad_norm": 0.314453125, + "learning_rate": 0.0004650168868866146, + "loss": 2.4879, + "num_input_tokens_seen": 193986560, + "step": 2960, + "train_runtime": 66780.8493, + "train_tokens_per_second": 2904.823 + }, + { + "epoch": 0.7843655090452925, + "grad_norm": 0.33203125, + "learning_rate": 0.0004647436843913488, + "loss": 2.5265, + "num_input_tokens_seen": 194641920, + "step": 2970, + "train_runtime": 66899.303, + "train_tokens_per_second": 2909.476 + }, + { + "epoch": 0.7870064703552093, + "grad_norm": 0.3515625, + "learning_rate": 0.00046446950015034894, + "loss": 2.5625, + "num_input_tokens_seen": 195297280, + "step": 2980, + "train_runtime": 67018.546, + "train_tokens_per_second": 2914.078 + }, + { + "epoch": 0.7896474316651261, + "grad_norm": 0.35546875, + "learning_rate": 0.00046419433541710573, + "loss": 2.5502, + "num_input_tokens_seen": 195952640, + "step": 2990, + "train_runtime": 67137.0732, + "train_tokens_per_second": 2918.695 + }, + { + "epoch": 0.7922883929750429, + "grad_norm": 0.318359375, + "learning_rate": 0.00046391819144959225, + "loss": 2.5597, + "num_input_tokens_seen": 196608000, + "step": 3000, + "train_runtime": 67258.6768, + "train_tokens_per_second": 2923.162 + }, + { + "epoch": 0.7949293542849597, + "grad_norm": 0.31640625, + "learning_rate": 0.00046364106951025865, + "loss": 2.501, + "num_input_tokens_seen": 197263360, + "step": 3010, + "train_runtime": 67390.2208, + "train_tokens_per_second": 2927.181 + }, + { + "epoch": 0.7975703155948766, + "grad_norm": 0.3125, + "learning_rate": 0.0004633629708660258, + "loss": 2.6074, + "num_input_tokens_seen": 197918720, + "step": 3020, + "train_runtime": 67511.9884, + "train_tokens_per_second": 2931.609 + }, + { + "epoch": 0.8002112769047933, + "grad_norm": 0.30859375, + "learning_rate": 0.00046308389678828, + "loss": 2.5246, + "num_input_tokens_seen": 198574080, + "step": 3030, + "train_runtime": 67633.7618, + "train_tokens_per_second": 2936.02 + }, + { + "epoch": 0.8028522382147102, + "grad_norm": 0.333984375, + "learning_rate": 0.0004628038485528667, + "loss": 2.5615, + "num_input_tokens_seen": 199229440, + "step": 3040, + "train_runtime": 67751.2595, + "train_tokens_per_second": 2940.601 + }, + { + "epoch": 0.805493199524627, + "grad_norm": 0.296875, + "learning_rate": 0.0004625228274400853, + "loss": 2.5714, + "num_input_tokens_seen": 199884800, + "step": 3050, + "train_runtime": 67868.5083, + "train_tokens_per_second": 2945.177 + }, + { + "epoch": 0.8081341608345438, + "grad_norm": 0.369140625, + "learning_rate": 0.00046224083473468246, + "loss": 2.5129, + "num_input_tokens_seen": 200540160, + "step": 3060, + "train_runtime": 67989.2765, + "train_tokens_per_second": 2949.585 + }, + { + "epoch": 0.8107751221444606, + "grad_norm": 0.3046875, + "learning_rate": 0.0004619578717258471, + "loss": 2.5505, + "num_input_tokens_seen": 201195520, + "step": 3070, + "train_runtime": 68109.2186, + "train_tokens_per_second": 2954.013 + }, + { + "epoch": 0.8134160834543774, + "grad_norm": 0.306640625, + "learning_rate": 0.0004616739397072037, + "loss": 2.5338, + "num_input_tokens_seen": 201850880, + "step": 3080, + "train_runtime": 68229.9922, + "train_tokens_per_second": 2958.389 + }, + { + "epoch": 0.8160570447642942, + "grad_norm": 0.30078125, + "learning_rate": 0.00046138903997680706, + "loss": 2.5715, + "num_input_tokens_seen": 202506240, + "step": 3090, + "train_runtime": 68347.1664, + "train_tokens_per_second": 2962.906 + }, + { + "epoch": 0.8186980060742111, + "grad_norm": 0.3203125, + "learning_rate": 0.00046110317383713596, + "loss": 2.5556, + "num_input_tokens_seen": 203161600, + "step": 3100, + "train_runtime": 68463.2499, + "train_tokens_per_second": 2967.455 + }, + { + "epoch": 0.8213389673841278, + "grad_norm": 0.330078125, + "learning_rate": 0.0004608163425950873, + "loss": 2.5133, + "num_input_tokens_seen": 203816960, + "step": 3110, + "train_runtime": 68582.6904, + "train_tokens_per_second": 2971.843 + }, + { + "epoch": 0.8239799286940447, + "grad_norm": 0.3046875, + "learning_rate": 0.00046052854756197014, + "loss": 2.5604, + "num_input_tokens_seen": 204472320, + "step": 3120, + "train_runtime": 68702.7138, + "train_tokens_per_second": 2976.19 + }, + { + "epoch": 0.8266208900039614, + "grad_norm": 0.298828125, + "learning_rate": 0.0004602397900534999, + "loss": 2.4993, + "num_input_tokens_seen": 205127680, + "step": 3130, + "train_runtime": 68820.0311, + "train_tokens_per_second": 2980.639 + }, + { + "epoch": 0.8292618513138783, + "grad_norm": 0.322265625, + "learning_rate": 0.00045995007138979196, + "loss": 2.495, + "num_input_tokens_seen": 205783040, + "step": 3140, + "train_runtime": 68936.7593, + "train_tokens_per_second": 2985.099 + }, + { + "epoch": 0.8319028126237951, + "grad_norm": 0.3359375, + "learning_rate": 0.00045965939289535586, + "loss": 2.558, + "num_input_tokens_seen": 206438400, + "step": 3150, + "train_runtime": 69054.1993, + "train_tokens_per_second": 2989.513 + }, + { + "epoch": 0.8345437739337118, + "grad_norm": 0.36328125, + "learning_rate": 0.00045936775589908917, + "loss": 2.6297, + "num_input_tokens_seen": 207093760, + "step": 3160, + "train_runtime": 69172.7498, + "train_tokens_per_second": 2993.863 + }, + { + "epoch": 0.8371847352436287, + "grad_norm": 0.3359375, + "learning_rate": 0.0004590751617342716, + "loss": 2.5844, + "num_input_tokens_seen": 207749120, + "step": 3170, + "train_runtime": 69292.1584, + "train_tokens_per_second": 2998.162 + }, + { + "epoch": 0.8398256965535454, + "grad_norm": 0.306640625, + "learning_rate": 0.0004587816117385586, + "loss": 2.5686, + "num_input_tokens_seen": 208404480, + "step": 3180, + "train_runtime": 69410.4443, + "train_tokens_per_second": 3002.495 + }, + { + "epoch": 0.8424666578634623, + "grad_norm": 0.32421875, + "learning_rate": 0.0004584871072539755, + "loss": 2.5584, + "num_input_tokens_seen": 209059840, + "step": 3190, + "train_runtime": 69530.3644, + "train_tokens_per_second": 3006.742 + }, + { + "epoch": 0.8451076191733791, + "grad_norm": 0.3046875, + "learning_rate": 0.0004581916496269112, + "loss": 2.5357, + "num_input_tokens_seen": 209715200, + "step": 3200, + "train_runtime": 69649.7436, + "train_tokens_per_second": 3010.997 + }, + { + "epoch": 0.8477485804832959, + "grad_norm": 0.322265625, + "learning_rate": 0.00045789524020811213, + "loss": 2.6667, + "num_input_tokens_seen": 210370560, + "step": 3210, + "train_runtime": 69767.2675, + "train_tokens_per_second": 3015.319 + }, + { + "epoch": 0.8503895417932127, + "grad_norm": 0.310546875, + "learning_rate": 0.0004575978803526761, + "loss": 2.5453, + "num_input_tokens_seen": 211025920, + "step": 3220, + "train_runtime": 69887.2044, + "train_tokens_per_second": 3019.522 + }, + { + "epoch": 0.8530305031031296, + "grad_norm": 0.30859375, + "learning_rate": 0.00045729957142004587, + "loss": 2.5007, + "num_input_tokens_seen": 211681280, + "step": 3230, + "train_runtime": 70005.0651, + "train_tokens_per_second": 3023.799 + }, + { + "epoch": 0.8556714644130463, + "grad_norm": 0.294921875, + "learning_rate": 0.00045700031477400335, + "loss": 2.5585, + "num_input_tokens_seen": 212336640, + "step": 3240, + "train_runtime": 70121.88, + "train_tokens_per_second": 3028.108 + }, + { + "epoch": 0.8583124257229632, + "grad_norm": 0.318359375, + "learning_rate": 0.0004567001117826628, + "loss": 2.5284, + "num_input_tokens_seen": 212992000, + "step": 3250, + "train_runtime": 70238.6702, + "train_tokens_per_second": 3032.404 + }, + { + "epoch": 0.8609533870328799, + "grad_norm": 0.314453125, + "learning_rate": 0.00045639896381846525, + "loss": 2.564, + "num_input_tokens_seen": 213647360, + "step": 3260, + "train_runtime": 70356.4627, + "train_tokens_per_second": 3036.642 + }, + { + "epoch": 0.8635943483427968, + "grad_norm": 0.3046875, + "learning_rate": 0.0004560968722581716, + "loss": 2.5174, + "num_input_tokens_seen": 214302720, + "step": 3270, + "train_runtime": 70473.0866, + "train_tokens_per_second": 3040.916 + }, + { + "epoch": 0.8662353096527136, + "grad_norm": 0.33203125, + "learning_rate": 0.00045579383848285673, + "loss": 2.5079, + "num_input_tokens_seen": 214958080, + "step": 3280, + "train_runtime": 70590.8319, + "train_tokens_per_second": 3045.127 + }, + { + "epoch": 0.8688762709626304, + "grad_norm": 0.3046875, + "learning_rate": 0.000455489863877903, + "loss": 2.5546, + "num_input_tokens_seen": 215613440, + "step": 3290, + "train_runtime": 70707.7468, + "train_tokens_per_second": 3049.361 + }, + { + "epoch": 0.8715172322725472, + "grad_norm": 0.33203125, + "learning_rate": 0.00045518494983299397, + "loss": 2.5635, + "num_input_tokens_seen": 216268800, + "step": 3300, + "train_runtime": 70825.237, + "train_tokens_per_second": 3053.556 + }, + { + "epoch": 0.874158193582464, + "grad_norm": 0.30859375, + "learning_rate": 0.00045487909774210813, + "loss": 2.5723, + "num_input_tokens_seen": 216924160, + "step": 3310, + "train_runtime": 70944.418, + "train_tokens_per_second": 3057.664 + }, + { + "epoch": 0.8767991548923808, + "grad_norm": 0.3203125, + "learning_rate": 0.0004545723090035123, + "loss": 2.5792, + "num_input_tokens_seen": 217579520, + "step": 3320, + "train_runtime": 71062.7059, + "train_tokens_per_second": 3061.796 + }, + { + "epoch": 0.8794401162022977, + "grad_norm": 0.3125, + "learning_rate": 0.0004542645850197555, + "loss": 2.4991, + "num_input_tokens_seen": 218234880, + "step": 3330, + "train_runtime": 71179.9107, + "train_tokens_per_second": 3065.962 + }, + { + "epoch": 0.8820810775122144, + "grad_norm": 0.30859375, + "learning_rate": 0.0004539559271976624, + "loss": 2.4421, + "num_input_tokens_seen": 218890240, + "step": 3340, + "train_runtime": 71299.1948, + "train_tokens_per_second": 3070.024 + }, + { + "epoch": 0.8847220388221313, + "grad_norm": 0.341796875, + "learning_rate": 0.0004536463369483269, + "loss": 2.526, + "num_input_tokens_seen": 219545600, + "step": 3350, + "train_runtime": 71416.6115, + "train_tokens_per_second": 3074.153 + }, + { + "epoch": 0.887363000132048, + "grad_norm": 0.31640625, + "learning_rate": 0.00045333581568710556, + "loss": 2.5114, + "num_input_tokens_seen": 220200960, + "step": 3360, + "train_runtime": 71537.1781, + "train_tokens_per_second": 3078.133 + }, + { + "epoch": 0.8900039614419649, + "grad_norm": 0.30859375, + "learning_rate": 0.0004530243648336115, + "loss": 2.4122, + "num_input_tokens_seen": 220856320, + "step": 3370, + "train_runtime": 71654.2281, + "train_tokens_per_second": 3082.251 + }, + { + "epoch": 0.8926449227518817, + "grad_norm": 0.2890625, + "learning_rate": 0.00045271198581170745, + "loss": 2.5761, + "num_input_tokens_seen": 221511680, + "step": 3380, + "train_runtime": 71771.9695, + "train_tokens_per_second": 3086.326 + }, + { + "epoch": 0.8952858840617985, + "grad_norm": 0.302734375, + "learning_rate": 0.00045239868004949964, + "loss": 2.4979, + "num_input_tokens_seen": 222167040, + "step": 3390, + "train_runtime": 71888.7516, + "train_tokens_per_second": 3090.428 + }, + { + "epoch": 0.8979268453717153, + "grad_norm": 0.322265625, + "learning_rate": 0.0004520844489793309, + "loss": 2.5003, + "num_input_tokens_seen": 222822400, + "step": 3400, + "train_runtime": 72007.6371, + "train_tokens_per_second": 3094.427 + }, + { + "epoch": 0.9005678066816322, + "grad_norm": 0.30078125, + "learning_rate": 0.0004517692940377743, + "loss": 2.5321, + "num_input_tokens_seen": 223477760, + "step": 3410, + "train_runtime": 72126.9538, + "train_tokens_per_second": 3098.395 + }, + { + "epoch": 0.9032087679915489, + "grad_norm": 0.330078125, + "learning_rate": 0.00045145321666562683, + "loss": 2.5793, + "num_input_tokens_seen": 224133120, + "step": 3420, + "train_runtime": 72248.5725, + "train_tokens_per_second": 3102.25 + }, + { + "epoch": 0.9058497293014657, + "grad_norm": 0.30078125, + "learning_rate": 0.0004511362183079023, + "loss": 2.5039, + "num_input_tokens_seen": 224788480, + "step": 3430, + "train_runtime": 72367.5334, + "train_tokens_per_second": 3106.206 + }, + { + "epoch": 0.9084906906113825, + "grad_norm": 0.29296875, + "learning_rate": 0.00045081830041382524, + "loss": 2.5551, + "num_input_tokens_seen": 225443840, + "step": 3440, + "train_runtime": 72486.8809, + "train_tokens_per_second": 3110.133 + }, + { + "epoch": 0.9111316519212993, + "grad_norm": 0.294921875, + "learning_rate": 0.0004504994644368238, + "loss": 2.5077, + "num_input_tokens_seen": 226099200, + "step": 3450, + "train_runtime": 72604.9764, + "train_tokens_per_second": 3114.101 + }, + { + "epoch": 0.9137726132312162, + "grad_norm": 0.34765625, + "learning_rate": 0.00045017971183452333, + "loss": 2.5018, + "num_input_tokens_seen": 226754560, + "step": 3460, + "train_runtime": 72722.5169, + "train_tokens_per_second": 3118.079 + }, + { + "epoch": 0.9164135745411329, + "grad_norm": 0.296875, + "learning_rate": 0.00044985904406874, + "loss": 2.4891, + "num_input_tokens_seen": 227409920, + "step": 3470, + "train_runtime": 72842.3214, + "train_tokens_per_second": 3121.948 + }, + { + "epoch": 0.9190545358510498, + "grad_norm": 0.318359375, + "learning_rate": 0.0004495374626054736, + "loss": 2.4945, + "num_input_tokens_seen": 228065280, + "step": 3480, + "train_runtime": 72959.9692, + "train_tokens_per_second": 3125.896 + }, + { + "epoch": 0.9216954971609665, + "grad_norm": 0.31640625, + "learning_rate": 0.0004492149689149011, + "loss": 2.5043, + "num_input_tokens_seen": 228720640, + "step": 3490, + "train_runtime": 73079.5466, + "train_tokens_per_second": 3129.749 + }, + { + "epoch": 0.9243364584708834, + "grad_norm": 0.3046875, + "learning_rate": 0.00044889156447137007, + "loss": 2.5296, + "num_input_tokens_seen": 229376000, + "step": 3500, + "train_runtime": 73199.3203, + "train_tokens_per_second": 3133.581 + }, + { + "epoch": 0.9269774197808002, + "grad_norm": 0.296875, + "learning_rate": 0.0004485672507533916, + "loss": 2.5304, + "num_input_tokens_seen": 230031360, + "step": 3510, + "train_runtime": 73330.1712, + "train_tokens_per_second": 3136.927 + }, + { + "epoch": 0.929618381090717, + "grad_norm": 0.30078125, + "learning_rate": 0.0004482420292436338, + "loss": 2.4987, + "num_input_tokens_seen": 230686720, + "step": 3520, + "train_runtime": 73453.3714, + "train_tokens_per_second": 3140.587 + }, + { + "epoch": 0.9322593424006338, + "grad_norm": 0.341796875, + "learning_rate": 0.00044791590142891487, + "loss": 2.523, + "num_input_tokens_seen": 231342080, + "step": 3530, + "train_runtime": 73578.5986, + "train_tokens_per_second": 3144.149 + }, + { + "epoch": 0.9349003037105507, + "grad_norm": 0.3125, + "learning_rate": 0.00044758886880019646, + "loss": 2.5653, + "num_input_tokens_seen": 231997440, + "step": 3540, + "train_runtime": 73698.2351, + "train_tokens_per_second": 3147.938 + }, + { + "epoch": 0.9375412650204674, + "grad_norm": 0.29296875, + "learning_rate": 0.00044726093285257676, + "loss": 2.5118, + "num_input_tokens_seen": 232652800, + "step": 3550, + "train_runtime": 73816.1535, + "train_tokens_per_second": 3151.787 + }, + { + "epoch": 0.9401822263303843, + "grad_norm": 0.30859375, + "learning_rate": 0.00044693209508528365, + "loss": 2.5051, + "num_input_tokens_seen": 233308160, + "step": 3560, + "train_runtime": 73934.7982, + "train_tokens_per_second": 3155.593 + }, + { + "epoch": 0.942823187640301, + "grad_norm": 0.30078125, + "learning_rate": 0.00044660235700166786, + "loss": 2.5131, + "num_input_tokens_seen": 233963520, + "step": 3570, + "train_runtime": 74054.392, + "train_tokens_per_second": 3159.347 + }, + { + "epoch": 0.9454641489502179, + "grad_norm": 0.337890625, + "learning_rate": 0.000446271720109196, + "loss": 2.5192, + "num_input_tokens_seen": 234618880, + "step": 3580, + "train_runtime": 74172.2787, + "train_tokens_per_second": 3163.161 + }, + { + "epoch": 0.9481051102601347, + "grad_norm": 0.37109375, + "learning_rate": 0.000445940185919444, + "loss": 2.5561, + "num_input_tokens_seen": 235274240, + "step": 3590, + "train_runtime": 74292.5192, + "train_tokens_per_second": 3166.863 + }, + { + "epoch": 0.9507460715700515, + "grad_norm": 0.345703125, + "learning_rate": 0.0004456077559480898, + "loss": 2.5362, + "num_input_tokens_seen": 235929600, + "step": 3600, + "train_runtime": 74410.8817, + "train_tokens_per_second": 3170.633 + }, + { + "epoch": 0.9533870328799683, + "grad_norm": 0.29296875, + "learning_rate": 0.0004452744317149067, + "loss": 2.4484, + "num_input_tokens_seen": 236584960, + "step": 3610, + "train_runtime": 74528.8241, + "train_tokens_per_second": 3174.409 + }, + { + "epoch": 0.9560279941898852, + "grad_norm": 0.298828125, + "learning_rate": 0.00044494021474375626, + "loss": 2.4905, + "num_input_tokens_seen": 237240320, + "step": 3620, + "train_runtime": 74646.6587, + "train_tokens_per_second": 3178.177 + }, + { + "epoch": 0.9586689554998019, + "grad_norm": 0.3046875, + "learning_rate": 0.00044460510656258123, + "loss": 2.5725, + "num_input_tokens_seen": 237895680, + "step": 3630, + "train_runtime": 74767.2065, + "train_tokens_per_second": 3181.818 + }, + { + "epoch": 0.9613099168097188, + "grad_norm": 0.291015625, + "learning_rate": 0.000444269108703399, + "loss": 2.4727, + "num_input_tokens_seen": 238551040, + "step": 3640, + "train_runtime": 74887.4523, + "train_tokens_per_second": 3185.461 + }, + { + "epoch": 0.9639508781196355, + "grad_norm": 0.31640625, + "learning_rate": 0.0004439322227022941, + "loss": 2.6083, + "num_input_tokens_seen": 239206400, + "step": 3650, + "train_runtime": 75006.6107, + "train_tokens_per_second": 3189.138 + }, + { + "epoch": 0.9665918394295524, + "grad_norm": 0.298828125, + "learning_rate": 0.00044359445009941147, + "loss": 2.4778, + "num_input_tokens_seen": 239861760, + "step": 3660, + "train_runtime": 75125.5365, + "train_tokens_per_second": 3192.813 + }, + { + "epoch": 0.9692328007394692, + "grad_norm": 0.314453125, + "learning_rate": 0.0004432557924389493, + "loss": 2.4564, + "num_input_tokens_seen": 240517120, + "step": 3670, + "train_runtime": 75245.5579, + "train_tokens_per_second": 3196.429 + }, + { + "epoch": 0.971873762049386, + "grad_norm": 0.30078125, + "learning_rate": 0.00044291625126915204, + "loss": 2.4725, + "num_input_tokens_seen": 241172480, + "step": 3680, + "train_runtime": 75365.6311, + "train_tokens_per_second": 3200.033 + }, + { + "epoch": 0.9745147233593028, + "grad_norm": 0.296875, + "learning_rate": 0.0004425758281423032, + "loss": 2.4829, + "num_input_tokens_seen": 241827840, + "step": 3690, + "train_runtime": 75488.5475, + "train_tokens_per_second": 3203.504 + }, + { + "epoch": 0.9771556846692196, + "grad_norm": 0.29296875, + "learning_rate": 0.0004422345246147185, + "loss": 2.5562, + "num_input_tokens_seen": 242483200, + "step": 3700, + "train_runtime": 75606.4806, + "train_tokens_per_second": 3207.175 + }, + { + "epoch": 0.9797966459791364, + "grad_norm": 0.3359375, + "learning_rate": 0.0004418923422467385, + "loss": 2.5714, + "num_input_tokens_seen": 243138560, + "step": 3710, + "train_runtime": 75724.8015, + "train_tokens_per_second": 3210.818 + }, + { + "epoch": 0.9824376072890532, + "grad_norm": 0.298828125, + "learning_rate": 0.00044154928260272145, + "loss": 2.4656, + "num_input_tokens_seen": 243793920, + "step": 3720, + "train_runtime": 75843.5826, + "train_tokens_per_second": 3214.43 + }, + { + "epoch": 0.98507856859897, + "grad_norm": 0.294921875, + "learning_rate": 0.00044120534725103653, + "loss": 2.4655, + "num_input_tokens_seen": 244449280, + "step": 3730, + "train_runtime": 75964.9389, + "train_tokens_per_second": 3217.922 + }, + { + "epoch": 0.9877195299088868, + "grad_norm": 0.296875, + "learning_rate": 0.0004408605377640562, + "loss": 2.5045, + "num_input_tokens_seen": 245104640, + "step": 3740, + "train_runtime": 76084.5912, + "train_tokens_per_second": 3221.475 + }, + { + "epoch": 0.9903604912188037, + "grad_norm": 0.302734375, + "learning_rate": 0.0004405148557181492, + "loss": 2.5, + "num_input_tokens_seen": 245760000, + "step": 3750, + "train_runtime": 76204.0097, + "train_tokens_per_second": 3225.027 + }, + { + "epoch": 0.9930014525287204, + "grad_norm": 0.333984375, + "learning_rate": 0.00044016830269367346, + "loss": 2.5142, + "num_input_tokens_seen": 246415360, + "step": 3760, + "train_runtime": 76322.0106, + "train_tokens_per_second": 3228.628 + }, + { + "epoch": 0.9956424138386373, + "grad_norm": 0.30859375, + "learning_rate": 0.0004398208802749687, + "loss": 2.5142, + "num_input_tokens_seen": 247070720, + "step": 3770, + "train_runtime": 76439.2146, + "train_tokens_per_second": 3232.251 + }, + { + "epoch": 0.998283375148554, + "grad_norm": 0.345703125, + "learning_rate": 0.0004394725900503493, + "loss": 2.5362, + "num_input_tokens_seen": 247726080, + "step": 3780, + "train_runtime": 76559.5869, + "train_tokens_per_second": 3235.729 + }, + { + "epoch": 1.000792288392975, + "grad_norm": 0.3125, + "learning_rate": 0.0004391234336120969, + "loss": 2.4494, + "num_input_tokens_seen": 248340480, + "step": 3790, + "train_runtime": 76675.4597, + "train_tokens_per_second": 3238.852 + }, + { + "epoch": 1.003433249702892, + "grad_norm": 0.349609375, + "learning_rate": 0.00043877341255645335, + "loss": 2.2429, + "num_input_tokens_seen": 248995840, + "step": 3800, + "train_runtime": 76794.0486, + "train_tokens_per_second": 3242.385 + }, + { + "epoch": 1.0060742110128087, + "grad_norm": 0.3125, + "learning_rate": 0.0004384225284836133, + "loss": 2.3023, + "num_input_tokens_seen": 249651200, + "step": 3810, + "train_runtime": 76911.1715, + "train_tokens_per_second": 3245.968 + }, + { + "epoch": 1.0087151723227255, + "grad_norm": 0.33203125, + "learning_rate": 0.0004380707829977166, + "loss": 2.2617, + "num_input_tokens_seen": 250306560, + "step": 3820, + "train_runtime": 77029.4457, + "train_tokens_per_second": 3249.492 + }, + { + "epoch": 1.0113561336326422, + "grad_norm": 0.328125, + "learning_rate": 0.0004377181777068416, + "loss": 2.251, + "num_input_tokens_seen": 250961920, + "step": 3830, + "train_runtime": 77147.4148, + "train_tokens_per_second": 3253.018 + }, + { + "epoch": 1.0139970949425592, + "grad_norm": 0.37109375, + "learning_rate": 0.0004373647142229972, + "loss": 2.3025, + "num_input_tokens_seen": 251617280, + "step": 3840, + "train_runtime": 77264.2759, + "train_tokens_per_second": 3256.58 + }, + { + "epoch": 1.016638056252476, + "grad_norm": 0.349609375, + "learning_rate": 0.00043701039416211564, + "loss": 2.2215, + "num_input_tokens_seen": 252272640, + "step": 3850, + "train_runtime": 77382.3779, + "train_tokens_per_second": 3260.079 + }, + { + "epoch": 1.0192790175623927, + "grad_norm": 0.32421875, + "learning_rate": 0.00043665521914404545, + "loss": 2.2709, + "num_input_tokens_seen": 252928000, + "step": 3860, + "train_runtime": 77505.1764, + "train_tokens_per_second": 3263.369 + }, + { + "epoch": 1.0219199788723095, + "grad_norm": 0.337890625, + "learning_rate": 0.0004362991907925435, + "loss": 2.1754, + "num_input_tokens_seen": 253583360, + "step": 3870, + "train_runtime": 77624.7271, + "train_tokens_per_second": 3266.786 + }, + { + "epoch": 1.0245609401822264, + "grad_norm": 0.3125, + "learning_rate": 0.000435942310735268, + "loss": 2.3135, + "num_input_tokens_seen": 254238720, + "step": 3880, + "train_runtime": 77750.1998, + "train_tokens_per_second": 3269.943 + }, + { + "epoch": 1.0272019014921432, + "grad_norm": 0.30078125, + "learning_rate": 0.000435584580603771, + "loss": 2.2239, + "num_input_tokens_seen": 254894080, + "step": 3890, + "train_runtime": 77869.7265, + "train_tokens_per_second": 3273.34 + }, + { + "epoch": 1.02984286280206, + "grad_norm": 0.314453125, + "learning_rate": 0.00043522600203349055, + "loss": 2.2953, + "num_input_tokens_seen": 255549440, + "step": 3900, + "train_runtime": 77989.4549, + "train_tokens_per_second": 3276.718 + }, + { + "epoch": 1.0324838241119767, + "grad_norm": 0.314453125, + "learning_rate": 0.00043486657666374384, + "loss": 2.318, + "num_input_tokens_seen": 256204800, + "step": 3910, + "train_runtime": 78108.7097, + "train_tokens_per_second": 3280.105 + }, + { + "epoch": 1.0351247854218935, + "grad_norm": 0.318359375, + "learning_rate": 0.00043450630613771916, + "loss": 2.2676, + "num_input_tokens_seen": 256860160, + "step": 3920, + "train_runtime": 78226.6613, + "train_tokens_per_second": 3283.537 + }, + { + "epoch": 1.0377657467318104, + "grad_norm": 0.353515625, + "learning_rate": 0.0004341451921024687, + "loss": 2.2883, + "num_input_tokens_seen": 257515520, + "step": 3930, + "train_runtime": 78344.3741, + "train_tokens_per_second": 3286.969 + }, + { + "epoch": 1.0404067080417272, + "grad_norm": 0.31640625, + "learning_rate": 0.0004337832362089009, + "loss": 2.2167, + "num_input_tokens_seen": 258170880, + "step": 3940, + "train_runtime": 78463.8699, + "train_tokens_per_second": 3290.315 + }, + { + "epoch": 1.043047669351644, + "grad_norm": 0.314453125, + "learning_rate": 0.00043342044011177293, + "loss": 2.2106, + "num_input_tokens_seen": 258826240, + "step": 3950, + "train_runtime": 78585.3237, + "train_tokens_per_second": 3293.57 + }, + { + "epoch": 1.0456886306615607, + "grad_norm": 0.333984375, + "learning_rate": 0.00043305680546968316, + "loss": 2.3089, + "num_input_tokens_seen": 259481600, + "step": 3960, + "train_runtime": 78702.6123, + "train_tokens_per_second": 3296.988 + }, + { + "epoch": 1.0483295919714777, + "grad_norm": 0.3203125, + "learning_rate": 0.0004326923339450636, + "loss": 2.3229, + "num_input_tokens_seen": 260136960, + "step": 3970, + "train_runtime": 78820.7623, + "train_tokens_per_second": 3300.361 + }, + { + "epoch": 1.0509705532813944, + "grad_norm": 0.33203125, + "learning_rate": 0.00043232702720417206, + "loss": 2.2907, + "num_input_tokens_seen": 260792320, + "step": 3980, + "train_runtime": 78940.7675, + "train_tokens_per_second": 3303.646 + }, + { + "epoch": 1.0536115145913112, + "grad_norm": 0.314453125, + "learning_rate": 0.0004319608869170848, + "loss": 2.2815, + "num_input_tokens_seen": 261447680, + "step": 3990, + "train_runtime": 79060.2279, + "train_tokens_per_second": 3306.943 + }, + { + "epoch": 1.056252475901228, + "grad_norm": 0.328125, + "learning_rate": 0.00043159391475768895, + "loss": 2.1302, + "num_input_tokens_seen": 262103040, + "step": 4000, + "train_runtime": 79180.0707, + "train_tokens_per_second": 3310.215 + }, + { + "epoch": 1.058893437211145, + "grad_norm": 0.318359375, + "learning_rate": 0.0004312261124036746, + "loss": 2.3044, + "num_input_tokens_seen": 262758400, + "step": 4010, + "train_runtime": 79313.4214, + "train_tokens_per_second": 3312.912 + }, + { + "epoch": 1.0615343985210617, + "grad_norm": 0.33203125, + "learning_rate": 0.0004308574815365271, + "loss": 2.342, + "num_input_tokens_seen": 263413760, + "step": 4020, + "train_runtime": 79435.9458, + "train_tokens_per_second": 3316.052 + }, + { + "epoch": 1.0641753598309784, + "grad_norm": 0.34765625, + "learning_rate": 0.0004304880238415199, + "loss": 2.3127, + "num_input_tokens_seen": 264069120, + "step": 4030, + "train_runtime": 79561.0668, + "train_tokens_per_second": 3319.075 + }, + { + "epoch": 1.0668163211408952, + "grad_norm": 0.33203125, + "learning_rate": 0.000430117741007706, + "loss": 2.2446, + "num_input_tokens_seen": 264724480, + "step": 4040, + "train_runtime": 79685.8835, + "train_tokens_per_second": 3322.1 + }, + { + "epoch": 1.0694572824508122, + "grad_norm": 0.3046875, + "learning_rate": 0.0004297466347279111, + "loss": 2.2921, + "num_input_tokens_seen": 265379840, + "step": 4050, + "train_runtime": 79812.4232, + "train_tokens_per_second": 3325.044 + }, + { + "epoch": 1.072098243760729, + "grad_norm": 0.322265625, + "learning_rate": 0.0004293747066987252, + "loss": 2.3192, + "num_input_tokens_seen": 266035200, + "step": 4060, + "train_runtime": 79935.0637, + "train_tokens_per_second": 3328.141 + }, + { + "epoch": 1.0747392050706457, + "grad_norm": 0.333984375, + "learning_rate": 0.0004290019586204953, + "loss": 2.2752, + "num_input_tokens_seen": 266690560, + "step": 4070, + "train_runtime": 80056.5141, + "train_tokens_per_second": 3331.279 + }, + { + "epoch": 1.0773801663805624, + "grad_norm": 0.322265625, + "learning_rate": 0.0004286283921973172, + "loss": 2.2939, + "num_input_tokens_seen": 267345920, + "step": 4080, + "train_runtime": 80180.0022, + "train_tokens_per_second": 3334.322 + }, + { + "epoch": 1.0800211276904794, + "grad_norm": 0.3125, + "learning_rate": 0.0004282540091370281, + "loss": 2.2694, + "num_input_tokens_seen": 268001280, + "step": 4090, + "train_runtime": 80304.7403, + "train_tokens_per_second": 3337.303 + }, + { + "epoch": 1.0826620890003962, + "grad_norm": 0.306640625, + "learning_rate": 0.00042787881115119864, + "loss": 2.2958, + "num_input_tokens_seen": 268656640, + "step": 4100, + "train_runtime": 80428.3833, + "train_tokens_per_second": 3340.321 + }, + { + "epoch": 1.085303050310313, + "grad_norm": 0.345703125, + "learning_rate": 0.0004275027999551249, + "loss": 2.3079, + "num_input_tokens_seen": 269312000, + "step": 4110, + "train_runtime": 80554.4198, + "train_tokens_per_second": 3343.231 + }, + { + "epoch": 1.0879440116202297, + "grad_norm": 0.31640625, + "learning_rate": 0.00042712597726782085, + "loss": 2.2273, + "num_input_tokens_seen": 269967360, + "step": 4120, + "train_runtime": 80679.6787, + "train_tokens_per_second": 3346.163 + }, + { + "epoch": 1.0905849729301467, + "grad_norm": 0.341796875, + "learning_rate": 0.0004267483448120104, + "loss": 2.3295, + "num_input_tokens_seen": 270622720, + "step": 4130, + "train_runtime": 80804.7272, + "train_tokens_per_second": 3349.095 + }, + { + "epoch": 1.0932259342400634, + "grad_norm": 0.34765625, + "learning_rate": 0.00042636990431411937, + "loss": 2.2563, + "num_input_tokens_seen": 271278080, + "step": 4140, + "train_runtime": 80929.6656, + "train_tokens_per_second": 3352.023 + }, + { + "epoch": 1.0958668955499802, + "grad_norm": 0.31640625, + "learning_rate": 0.00042599065750426767, + "loss": 2.2812, + "num_input_tokens_seen": 271933440, + "step": 4150, + "train_runtime": 81054.998, + "train_tokens_per_second": 3354.925 + }, + { + "epoch": 1.098507856859897, + "grad_norm": 0.3046875, + "learning_rate": 0.00042561060611626177, + "loss": 2.3289, + "num_input_tokens_seen": 272588800, + "step": 4160, + "train_runtime": 81178.5522, + "train_tokens_per_second": 3357.892 + }, + { + "epoch": 1.101148818169814, + "grad_norm": 0.3046875, + "learning_rate": 0.000425229751887586, + "loss": 2.1831, + "num_input_tokens_seen": 273244160, + "step": 4170, + "train_runtime": 81300.0149, + "train_tokens_per_second": 3360.936 + }, + { + "epoch": 1.1037897794797307, + "grad_norm": 0.296875, + "learning_rate": 0.00042484809655939527, + "loss": 2.2654, + "num_input_tokens_seen": 273899520, + "step": 4180, + "train_runtime": 81418.7735, + "train_tokens_per_second": 3364.083 + }, + { + "epoch": 1.1064307407896474, + "grad_norm": 0.298828125, + "learning_rate": 0.0004244656418765069, + "loss": 2.2485, + "num_input_tokens_seen": 274554880, + "step": 4190, + "train_runtime": 81540.3091, + "train_tokens_per_second": 3367.106 + }, + { + "epoch": 1.1090717020995642, + "grad_norm": 0.328125, + "learning_rate": 0.00042408238958739267, + "loss": 2.2779, + "num_input_tokens_seen": 275210240, + "step": 4200, + "train_runtime": 81662.2475, + "train_tokens_per_second": 3370.104 + }, + { + "epoch": 1.111712663409481, + "grad_norm": 0.3125, + "learning_rate": 0.0004236983414441705, + "loss": 2.2195, + "num_input_tokens_seen": 275865600, + "step": 4210, + "train_runtime": 81786.6331, + "train_tokens_per_second": 3372.991 + }, + { + "epoch": 1.114353624719398, + "grad_norm": 0.34375, + "learning_rate": 0.000423313499202597, + "loss": 2.2841, + "num_input_tokens_seen": 276520960, + "step": 4220, + "train_runtime": 81906.7637, + "train_tokens_per_second": 3376.045 + }, + { + "epoch": 1.1169945860293147, + "grad_norm": 0.314453125, + "learning_rate": 0.00042292786462205914, + "loss": 2.2479, + "num_input_tokens_seen": 277176320, + "step": 4230, + "train_runtime": 82028.3019, + "train_tokens_per_second": 3379.033 + }, + { + "epoch": 1.1196355473392314, + "grad_norm": 0.310546875, + "learning_rate": 0.00042254143946556606, + "loss": 2.2651, + "num_input_tokens_seen": 277831680, + "step": 4240, + "train_runtime": 82151.0352, + "train_tokens_per_second": 3381.962 + }, + { + "epoch": 1.1222765086491484, + "grad_norm": 0.330078125, + "learning_rate": 0.00042215422549974144, + "loss": 2.2704, + "num_input_tokens_seen": 278487040, + "step": 4250, + "train_runtime": 82270.3576, + "train_tokens_per_second": 3385.023 + }, + { + "epoch": 1.1249174699590652, + "grad_norm": 0.318359375, + "learning_rate": 0.000421766224494815, + "loss": 2.3028, + "num_input_tokens_seen": 279142400, + "step": 4260, + "train_runtime": 82390.9274, + "train_tokens_per_second": 3388.024 + }, + { + "epoch": 1.127558431268982, + "grad_norm": 0.314453125, + "learning_rate": 0.0004213774382246146, + "loss": 2.3083, + "num_input_tokens_seen": 279797760, + "step": 4270, + "train_runtime": 82510.8106, + "train_tokens_per_second": 3391.044 + }, + { + "epoch": 1.1301993925788987, + "grad_norm": 0.30078125, + "learning_rate": 0.000420987868466558, + "loss": 2.3101, + "num_input_tokens_seen": 280453120, + "step": 4280, + "train_runtime": 82633.193, + "train_tokens_per_second": 3393.952 + }, + { + "epoch": 1.1328403538888154, + "grad_norm": 0.31640625, + "learning_rate": 0.00042059751700164515, + "loss": 2.3364, + "num_input_tokens_seen": 281108480, + "step": 4290, + "train_runtime": 82755.5866, + "train_tokens_per_second": 3396.852 + }, + { + "epoch": 1.1354813151987324, + "grad_norm": 0.33203125, + "learning_rate": 0.0004202063856144494, + "loss": 2.3146, + "num_input_tokens_seen": 281763840, + "step": 4300, + "train_runtime": 82876.6686, + "train_tokens_per_second": 3399.797 + }, + { + "epoch": 1.1381222765086492, + "grad_norm": 0.30859375, + "learning_rate": 0.00041981447609310983, + "loss": 2.2821, + "num_input_tokens_seen": 282419200, + "step": 4310, + "train_runtime": 83002.1117, + "train_tokens_per_second": 3402.554 + }, + { + "epoch": 1.140763237818566, + "grad_norm": 0.31640625, + "learning_rate": 0.0004194217902293229, + "loss": 2.2739, + "num_input_tokens_seen": 283074560, + "step": 4320, + "train_runtime": 83121.4284, + "train_tokens_per_second": 3405.555 + }, + { + "epoch": 1.1434041991284827, + "grad_norm": 0.306640625, + "learning_rate": 0.0004190283298183344, + "loss": 2.2125, + "num_input_tokens_seen": 283729920, + "step": 4330, + "train_runtime": 83243.228, + "train_tokens_per_second": 3408.444 + }, + { + "epoch": 1.1460451604383997, + "grad_norm": 0.302734375, + "learning_rate": 0.0004186340966589309, + "loss": 2.249, + "num_input_tokens_seen": 284385280, + "step": 4340, + "train_runtime": 83363.4964, + "train_tokens_per_second": 3411.389 + }, + { + "epoch": 1.1486861217483164, + "grad_norm": 0.310546875, + "learning_rate": 0.00041823909255343187, + "loss": 2.2658, + "num_input_tokens_seen": 285040640, + "step": 4350, + "train_runtime": 83485.0706, + "train_tokens_per_second": 3414.271 + }, + { + "epoch": 1.1513270830582332, + "grad_norm": 0.298828125, + "learning_rate": 0.00041784331930768125, + "loss": 2.2801, + "num_input_tokens_seen": 285696000, + "step": 4360, + "train_runtime": 83606.0455, + "train_tokens_per_second": 3417.169 + }, + { + "epoch": 1.15396804436815, + "grad_norm": 0.3046875, + "learning_rate": 0.0004174467787310396, + "loss": 2.3529, + "num_input_tokens_seen": 286351360, + "step": 4370, + "train_runtime": 83728.7506, + "train_tokens_per_second": 3419.988 + }, + { + "epoch": 1.156609005678067, + "grad_norm": 0.337890625, + "learning_rate": 0.00041704947263637493, + "loss": 2.3014, + "num_input_tokens_seen": 287006720, + "step": 4380, + "train_runtime": 83848.4792, + "train_tokens_per_second": 3422.921 + }, + { + "epoch": 1.1592499669879837, + "grad_norm": 0.29296875, + "learning_rate": 0.0004166514028400554, + "loss": 2.2544, + "num_input_tokens_seen": 287662080, + "step": 4390, + "train_runtime": 83974.8578, + "train_tokens_per_second": 3425.574 + }, + { + "epoch": 1.1618909282979004, + "grad_norm": 0.33203125, + "learning_rate": 0.0004162525711619405, + "loss": 2.2893, + "num_input_tokens_seen": 288317440, + "step": 4400, + "train_runtime": 84093.0934, + "train_tokens_per_second": 3428.551 + }, + { + "epoch": 1.1645318896078172, + "grad_norm": 0.33203125, + "learning_rate": 0.0004158529794253727, + "loss": 2.2578, + "num_input_tokens_seen": 288972800, + "step": 4410, + "train_runtime": 84210.3873, + "train_tokens_per_second": 3431.558 + }, + { + "epoch": 1.1671728509177342, + "grad_norm": 0.310546875, + "learning_rate": 0.00041545262945716946, + "loss": 2.3109, + "num_input_tokens_seen": 289628160, + "step": 4420, + "train_runtime": 84327.684, + "train_tokens_per_second": 3434.556 + }, + { + "epoch": 1.169813812227651, + "grad_norm": 0.322265625, + "learning_rate": 0.00041505152308761434, + "loss": 2.2651, + "num_input_tokens_seen": 290283520, + "step": 4430, + "train_runtime": 84449.3471, + "train_tokens_per_second": 3437.368 + }, + { + "epoch": 1.1724547735375677, + "grad_norm": 0.333984375, + "learning_rate": 0.00041464966215044917, + "loss": 2.2561, + "num_input_tokens_seen": 290938880, + "step": 4440, + "train_runtime": 84570.0712, + "train_tokens_per_second": 3440.211 + }, + { + "epoch": 1.1750957348474844, + "grad_norm": 0.306640625, + "learning_rate": 0.00041424704848286553, + "loss": 2.2728, + "num_input_tokens_seen": 291594240, + "step": 4450, + "train_runtime": 84688.7888, + "train_tokens_per_second": 3443.127 + }, + { + "epoch": 1.1777366961574014, + "grad_norm": 0.3125, + "learning_rate": 0.000413843683925496, + "loss": 2.3104, + "num_input_tokens_seen": 292249600, + "step": 4460, + "train_runtime": 84807.008, + "train_tokens_per_second": 3446.055 + }, + { + "epoch": 1.1803776574673182, + "grad_norm": 0.298828125, + "learning_rate": 0.0004134395703224062, + "loss": 2.3648, + "num_input_tokens_seen": 292904960, + "step": 4470, + "train_runtime": 84924.5715, + "train_tokens_per_second": 3449.001 + }, + { + "epoch": 1.183018618777235, + "grad_norm": 0.310546875, + "learning_rate": 0.00041303470952108615, + "loss": 2.2891, + "num_input_tokens_seen": 293560320, + "step": 4480, + "train_runtime": 85042.4478, + "train_tokens_per_second": 3451.927 + }, + { + "epoch": 1.1856595800871517, + "grad_norm": 0.337890625, + "learning_rate": 0.0004126291033724417, + "loss": 2.2063, + "num_input_tokens_seen": 294215680, + "step": 4490, + "train_runtime": 85160.8956, + "train_tokens_per_second": 3454.821 + }, + { + "epoch": 1.1883005413970684, + "grad_norm": 0.318359375, + "learning_rate": 0.0004122227537307864, + "loss": 2.3004, + "num_input_tokens_seen": 294871040, + "step": 4500, + "train_runtime": 85278.8135, + "train_tokens_per_second": 3457.729 + }, + { + "epoch": 1.1909415027069854, + "grad_norm": 0.298828125, + "learning_rate": 0.00041181566245383273, + "loss": 2.2272, + "num_input_tokens_seen": 295526400, + "step": 4510, + "train_runtime": 85410.3019, + "train_tokens_per_second": 3460.079 + }, + { + "epoch": 1.1935824640169022, + "grad_norm": 0.314453125, + "learning_rate": 0.00041140783140268365, + "loss": 2.303, + "num_input_tokens_seen": 296181760, + "step": 4520, + "train_runtime": 85528.7014, + "train_tokens_per_second": 3462.952 + }, + { + "epoch": 1.196223425326819, + "grad_norm": 0.31640625, + "learning_rate": 0.0004109992624418244, + "loss": 2.3099, + "num_input_tokens_seen": 296837120, + "step": 4530, + "train_runtime": 85651.1363, + "train_tokens_per_second": 3465.653 + }, + { + "epoch": 1.198864386636736, + "grad_norm": 0.30859375, + "learning_rate": 0.0004105899574391135, + "loss": 2.2171, + "num_input_tokens_seen": 297492480, + "step": 4540, + "train_runtime": 85772.0031, + "train_tokens_per_second": 3468.41 + }, + { + "epoch": 1.2015053479466526, + "grad_norm": 0.314453125, + "learning_rate": 0.00041017991826577444, + "loss": 2.2966, + "num_input_tokens_seen": 298147840, + "step": 4550, + "train_runtime": 85892.1066, + "train_tokens_per_second": 3471.19 + }, + { + "epoch": 1.2041463092565694, + "grad_norm": 0.310546875, + "learning_rate": 0.00040976914679638726, + "loss": 2.1979, + "num_input_tokens_seen": 298803200, + "step": 4560, + "train_runtime": 86010.3013, + "train_tokens_per_second": 3474.04 + }, + { + "epoch": 1.2067872705664862, + "grad_norm": 0.2890625, + "learning_rate": 0.0004093576449088797, + "loss": 2.2558, + "num_input_tokens_seen": 299458560, + "step": 4570, + "train_runtime": 86128.1932, + "train_tokens_per_second": 3476.894 + }, + { + "epoch": 1.209428231876403, + "grad_norm": 0.30078125, + "learning_rate": 0.00040894541448451894, + "loss": 2.2734, + "num_input_tokens_seen": 300113920, + "step": 4580, + "train_runtime": 86246.2901, + "train_tokens_per_second": 3479.731 + }, + { + "epoch": 1.21206919318632, + "grad_norm": 0.318359375, + "learning_rate": 0.0004085324574079027, + "loss": 2.2513, + "num_input_tokens_seen": 300769280, + "step": 4590, + "train_runtime": 86365.5307, + "train_tokens_per_second": 3482.515 + }, + { + "epoch": 1.2147101544962366, + "grad_norm": 0.314453125, + "learning_rate": 0.0004081187755669506, + "loss": 2.2718, + "num_input_tokens_seen": 301424640, + "step": 4600, + "train_runtime": 86483.7904, + "train_tokens_per_second": 3485.331 + }, + { + "epoch": 1.2173511158061534, + "grad_norm": 0.294921875, + "learning_rate": 0.000407704370852896, + "loss": 2.3453, + "num_input_tokens_seen": 302080000, + "step": 4610, + "train_runtime": 86601.8135, + "train_tokens_per_second": 3488.149 + }, + { + "epoch": 1.2199920771160702, + "grad_norm": 0.30859375, + "learning_rate": 0.00040728924516027676, + "loss": 2.318, + "num_input_tokens_seen": 302735360, + "step": 4620, + "train_runtime": 86719.9873, + "train_tokens_per_second": 3490.953 + }, + { + "epoch": 1.2226330384259871, + "grad_norm": 0.30078125, + "learning_rate": 0.000406873400386927, + "loss": 2.2749, + "num_input_tokens_seen": 303390720, + "step": 4630, + "train_runtime": 86841.9478, + "train_tokens_per_second": 3493.596 + }, + { + "epoch": 1.225273999735904, + "grad_norm": 0.296875, + "learning_rate": 0.00040645683843396817, + "loss": 2.2283, + "num_input_tokens_seen": 304046080, + "step": 4640, + "train_runtime": 86962.7992, + "train_tokens_per_second": 3496.278 + }, + { + "epoch": 1.2279149610458207, + "grad_norm": 0.31640625, + "learning_rate": 0.00040603956120580044, + "loss": 2.27, + "num_input_tokens_seen": 304701440, + "step": 4650, + "train_runtime": 87081.6875, + "train_tokens_per_second": 3499.03 + }, + { + "epoch": 1.2305559223557374, + "grad_norm": 0.3203125, + "learning_rate": 0.00040562157061009417, + "loss": 2.3199, + "num_input_tokens_seen": 305356800, + "step": 4660, + "train_runtime": 87200.158, + "train_tokens_per_second": 3501.792 + }, + { + "epoch": 1.2331968836656544, + "grad_norm": 0.3203125, + "learning_rate": 0.0004052028685577809, + "loss": 2.2438, + "num_input_tokens_seen": 306012160, + "step": 4670, + "train_runtime": 87317.976, + "train_tokens_per_second": 3504.572 + }, + { + "epoch": 1.2358378449755711, + "grad_norm": 0.30078125, + "learning_rate": 0.0004047834569630447, + "loss": 2.2876, + "num_input_tokens_seen": 306667520, + "step": 4680, + "train_runtime": 87436.9012, + "train_tokens_per_second": 3507.301 + }, + { + "epoch": 1.238478806285488, + "grad_norm": 0.306640625, + "learning_rate": 0.0004043633377433136, + "loss": 2.3233, + "num_input_tokens_seen": 307322880, + "step": 4690, + "train_runtime": 87555.5652, + "train_tokens_per_second": 3510.033 + }, + { + "epoch": 1.2411197675954047, + "grad_norm": 0.296875, + "learning_rate": 0.00040394251281925046, + "loss": 2.2602, + "num_input_tokens_seen": 307978240, + "step": 4700, + "train_runtime": 87674.0469, + "train_tokens_per_second": 3512.764 + }, + { + "epoch": 1.2437607289053216, + "grad_norm": 0.314453125, + "learning_rate": 0.0004035209841147448, + "loss": 2.3335, + "num_input_tokens_seen": 308633600, + "step": 4710, + "train_runtime": 87792.5632, + "train_tokens_per_second": 3515.487 + }, + { + "epoch": 1.2464016902152384, + "grad_norm": 0.322265625, + "learning_rate": 0.0004030987535569032, + "loss": 2.2491, + "num_input_tokens_seen": 309288960, + "step": 4720, + "train_runtime": 87910.5629, + "train_tokens_per_second": 3518.223 + }, + { + "epoch": 1.2490426515251551, + "grad_norm": 0.326171875, + "learning_rate": 0.00040267582307604115, + "loss": 2.3001, + "num_input_tokens_seen": 309944320, + "step": 4730, + "train_runtime": 88028.8102, + "train_tokens_per_second": 3520.942 + }, + { + "epoch": 1.251683612835072, + "grad_norm": 0.3203125, + "learning_rate": 0.000402252194605674, + "loss": 2.3019, + "num_input_tokens_seen": 310599680, + "step": 4740, + "train_runtime": 88147.4335, + "train_tokens_per_second": 3523.638 + }, + { + "epoch": 1.2543245741449889, + "grad_norm": 0.306640625, + "learning_rate": 0.000401827870082508, + "loss": 2.2112, + "num_input_tokens_seen": 311255040, + "step": 4750, + "train_runtime": 88269.8181, + "train_tokens_per_second": 3526.177 + }, + { + "epoch": 1.2569655354549056, + "grad_norm": 0.29296875, + "learning_rate": 0.0004014028514464315, + "loss": 2.2837, + "num_input_tokens_seen": 311910400, + "step": 4760, + "train_runtime": 88388.6042, + "train_tokens_per_second": 3528.853 + }, + { + "epoch": 1.2596064967648224, + "grad_norm": 0.34375, + "learning_rate": 0.0004009771406405064, + "loss": 2.2713, + "num_input_tokens_seen": 312565760, + "step": 4770, + "train_runtime": 88507.0596, + "train_tokens_per_second": 3531.535 + }, + { + "epoch": 1.2622474580747391, + "grad_norm": 0.314453125, + "learning_rate": 0.0004005507396109586, + "loss": 2.3538, + "num_input_tokens_seen": 313221120, + "step": 4780, + "train_runtime": 88630.6163, + "train_tokens_per_second": 3534.006 + }, + { + "epoch": 1.264888419384656, + "grad_norm": 0.298828125, + "learning_rate": 0.0004001236503071699, + "loss": 2.3031, + "num_input_tokens_seen": 313876480, + "step": 4790, + "train_runtime": 88749.8818, + "train_tokens_per_second": 3536.641 + }, + { + "epoch": 1.2675293806945729, + "grad_norm": 0.328125, + "learning_rate": 0.0003996958746816684, + "loss": 2.3328, + "num_input_tokens_seen": 314531840, + "step": 4800, + "train_runtime": 88868.4023, + "train_tokens_per_second": 3539.299 + }, + { + "epoch": 1.2701703420044896, + "grad_norm": 0.298828125, + "learning_rate": 0.00039926741469012005, + "loss": 2.2023, + "num_input_tokens_seen": 315187200, + "step": 4810, + "train_runtime": 88986.5256, + "train_tokens_per_second": 3541.965 + }, + { + "epoch": 1.2728113033144064, + "grad_norm": 0.30078125, + "learning_rate": 0.0003988382722913194, + "loss": 2.278, + "num_input_tokens_seen": 315842560, + "step": 4820, + "train_runtime": 89104.5506, + "train_tokens_per_second": 3544.629 + }, + { + "epoch": 1.2754522646243234, + "grad_norm": 0.3203125, + "learning_rate": 0.00039840844944718086, + "loss": 2.3375, + "num_input_tokens_seen": 316497920, + "step": 4830, + "train_runtime": 89222.5592, + "train_tokens_per_second": 3547.286 + }, + { + "epoch": 1.2780932259342401, + "grad_norm": 0.30859375, + "learning_rate": 0.00039797794812272957, + "loss": 2.3218, + "num_input_tokens_seen": 317153280, + "step": 4840, + "train_runtime": 89340.9536, + "train_tokens_per_second": 3549.92 + }, + { + "epoch": 1.2807341872441569, + "grad_norm": 0.298828125, + "learning_rate": 0.00039754677028609254, + "loss": 2.2926, + "num_input_tokens_seen": 317808640, + "step": 4850, + "train_runtime": 89458.543, + "train_tokens_per_second": 3552.58 + }, + { + "epoch": 1.2833751485540736, + "grad_norm": 0.3046875, + "learning_rate": 0.00039711491790848946, + "loss": 2.2381, + "num_input_tokens_seen": 318464000, + "step": 4860, + "train_runtime": 89576.8503, + "train_tokens_per_second": 3555.204 + }, + { + "epoch": 1.2860161098639904, + "grad_norm": 0.302734375, + "learning_rate": 0.000396682392964224, + "loss": 2.3415, + "num_input_tokens_seen": 319119360, + "step": 4870, + "train_runtime": 89694.933, + "train_tokens_per_second": 3557.83 + }, + { + "epoch": 1.2886570711739074, + "grad_norm": 0.314453125, + "learning_rate": 0.0003962491974306746, + "loss": 2.2658, + "num_input_tokens_seen": 319774720, + "step": 4880, + "train_runtime": 89813.2074, + "train_tokens_per_second": 3560.442 + }, + { + "epoch": 1.2912980324838241, + "grad_norm": 0.310546875, + "learning_rate": 0.00039581533328828536, + "loss": 2.3088, + "num_input_tokens_seen": 320430080, + "step": 4890, + "train_runtime": 89931.5757, + "train_tokens_per_second": 3563.043 + }, + { + "epoch": 1.2939389937937409, + "grad_norm": 0.30859375, + "learning_rate": 0.000395380802520557, + "loss": 2.3227, + "num_input_tokens_seen": 321085440, + "step": 4900, + "train_runtime": 90049.479, + "train_tokens_per_second": 3565.656 + }, + { + "epoch": 1.2965799551036579, + "grad_norm": 0.306640625, + "learning_rate": 0.000394945607114038, + "loss": 2.3324, + "num_input_tokens_seen": 321740800, + "step": 4910, + "train_runtime": 90167.7361, + "train_tokens_per_second": 3568.248 + }, + { + "epoch": 1.2992209164135746, + "grad_norm": 0.291015625, + "learning_rate": 0.0003945097490583153, + "loss": 2.2403, + "num_input_tokens_seen": 322396160, + "step": 4920, + "train_runtime": 90285.8825, + "train_tokens_per_second": 3570.837 + }, + { + "epoch": 1.3018618777234914, + "grad_norm": 0.306640625, + "learning_rate": 0.0003940732303460053, + "loss": 2.2773, + "num_input_tokens_seen": 323051520, + "step": 4930, + "train_runtime": 90407.5925, + "train_tokens_per_second": 3573.279 + }, + { + "epoch": 1.3045028390334081, + "grad_norm": 0.302734375, + "learning_rate": 0.00039363605297274473, + "loss": 2.307, + "num_input_tokens_seen": 323706880, + "step": 4940, + "train_runtime": 90526.3987, + "train_tokens_per_second": 3575.829 + }, + { + "epoch": 1.307143800343325, + "grad_norm": 0.298828125, + "learning_rate": 0.00039319821893718163, + "loss": 2.2994, + "num_input_tokens_seen": 324362240, + "step": 4950, + "train_runtime": 90644.4098, + "train_tokens_per_second": 3578.403 + }, + { + "epoch": 1.3097847616532419, + "grad_norm": 0.30078125, + "learning_rate": 0.0003927597302409658, + "loss": 2.2691, + "num_input_tokens_seen": 325017600, + "step": 4960, + "train_runtime": 90762.6029, + "train_tokens_per_second": 3580.964 + }, + { + "epoch": 1.3124257229631586, + "grad_norm": 0.30078125, + "learning_rate": 0.00039232058888874033, + "loss": 2.2588, + "num_input_tokens_seen": 325672960, + "step": 4970, + "train_runtime": 90880.8201, + "train_tokens_per_second": 3583.517 + }, + { + "epoch": 1.3150666842730754, + "grad_norm": 0.302734375, + "learning_rate": 0.0003918807968881318, + "loss": 2.216, + "num_input_tokens_seen": 326328320, + "step": 4980, + "train_runtime": 90998.7304, + "train_tokens_per_second": 3586.076 + }, + { + "epoch": 1.3177076455829921, + "grad_norm": 0.30078125, + "learning_rate": 0.0003914403562497415, + "loss": 2.3125, + "num_input_tokens_seen": 326983680, + "step": 4990, + "train_runtime": 91119.3008, + "train_tokens_per_second": 3588.523 + }, + { + "epoch": 1.320348606892909, + "grad_norm": 0.294921875, + "learning_rate": 0.0003909992689871361, + "loss": 2.3088, + "num_input_tokens_seen": 327639040, + "step": 5000, + "train_runtime": 91239.2357, + "train_tokens_per_second": 3590.988 + }, + { + "epoch": 1.3229895682028259, + "grad_norm": 0.294921875, + "learning_rate": 0.0003905575371168383, + "loss": 2.2613, + "num_input_tokens_seen": 328294400, + "step": 5010, + "train_runtime": 91372.8031, + "train_tokens_per_second": 3592.912 + }, + { + "epoch": 1.3256305295127426, + "grad_norm": 0.310546875, + "learning_rate": 0.00039011516265831804, + "loss": 2.3633, + "num_input_tokens_seen": 328949760, + "step": 5020, + "train_runtime": 91492.3125, + "train_tokens_per_second": 3595.381 + }, + { + "epoch": 1.3282714908226594, + "grad_norm": 0.302734375, + "learning_rate": 0.00038967214763398265, + "loss": 2.3284, + "num_input_tokens_seen": 329605120, + "step": 5030, + "train_runtime": 91612.4203, + "train_tokens_per_second": 3597.821 + }, + { + "epoch": 1.3309124521325764, + "grad_norm": 0.3046875, + "learning_rate": 0.0003892284940691682, + "loss": 2.2438, + "num_input_tokens_seen": 330260480, + "step": 5040, + "train_runtime": 91732.8683, + "train_tokens_per_second": 3600.242 + }, + { + "epoch": 1.3335534134424931, + "grad_norm": 0.3046875, + "learning_rate": 0.00038878420399212987, + "loss": 2.3119, + "num_input_tokens_seen": 330915840, + "step": 5050, + "train_runtime": 91852.984, + "train_tokens_per_second": 3602.668 + }, + { + "epoch": 1.3361943747524099, + "grad_norm": 0.298828125, + "learning_rate": 0.0003883392794340328, + "loss": 2.2153, + "num_input_tokens_seen": 331571200, + "step": 5060, + "train_runtime": 91971.4591, + "train_tokens_per_second": 3605.153 + }, + { + "epoch": 1.3388353360623266, + "grad_norm": 0.314453125, + "learning_rate": 0.0003878937224289429, + "loss": 2.2827, + "num_input_tokens_seen": 332226560, + "step": 5070, + "train_runtime": 92089.9404, + "train_tokens_per_second": 3607.631 + }, + { + "epoch": 1.3414762973722434, + "grad_norm": 0.294921875, + "learning_rate": 0.0003874475350138171, + "loss": 2.2723, + "num_input_tokens_seen": 332881920, + "step": 5080, + "train_runtime": 92208.5982, + "train_tokens_per_second": 3610.096 + }, + { + "epoch": 1.3441172586821604, + "grad_norm": 0.291015625, + "learning_rate": 0.0003870007192284949, + "loss": 2.319, + "num_input_tokens_seen": 333537280, + "step": 5090, + "train_runtime": 92329.1823, + "train_tokens_per_second": 3612.48 + }, + { + "epoch": 1.3467582199920771, + "grad_norm": 0.296875, + "learning_rate": 0.0003865532771156882, + "loss": 2.2392, + "num_input_tokens_seen": 334192640, + "step": 5100, + "train_runtime": 92449.0591, + "train_tokens_per_second": 3614.884 + }, + { + "epoch": 1.3493991813019939, + "grad_norm": 0.30078125, + "learning_rate": 0.00038610521072097217, + "loss": 2.2927, + "num_input_tokens_seen": 334848000, + "step": 5110, + "train_runtime": 92567.0339, + "train_tokens_per_second": 3617.357 + }, + { + "epoch": 1.3520401426119109, + "grad_norm": 0.6328125, + "learning_rate": 0.00038565652209277636, + "loss": 2.3171, + "num_input_tokens_seen": 335503360, + "step": 5120, + "train_runtime": 92685.5042, + "train_tokens_per_second": 3619.804 + }, + { + "epoch": 1.3546811039218276, + "grad_norm": 0.328125, + "learning_rate": 0.0003852072132823747, + "loss": 2.2479, + "num_input_tokens_seen": 336158720, + "step": 5130, + "train_runtime": 92806.5462, + "train_tokens_per_second": 3622.144 + }, + { + "epoch": 1.3573220652317444, + "grad_norm": 0.35546875, + "learning_rate": 0.0003847572863438766, + "loss": 2.3483, + "num_input_tokens_seen": 336814080, + "step": 5140, + "train_runtime": 92926.2734, + "train_tokens_per_second": 3624.53 + }, + { + "epoch": 1.3599630265416611, + "grad_norm": 0.3203125, + "learning_rate": 0.0003843067433342173, + "loss": 2.3469, + "num_input_tokens_seen": 337469440, + "step": 5150, + "train_runtime": 93045.1657, + "train_tokens_per_second": 3626.942 + }, + { + "epoch": 1.3626039878515779, + "grad_norm": 0.337890625, + "learning_rate": 0.00038385558631314853, + "loss": 2.3005, + "num_input_tokens_seen": 338124800, + "step": 5160, + "train_runtime": 93162.7513, + "train_tokens_per_second": 3629.399 + }, + { + "epoch": 1.3652449491614949, + "grad_norm": 0.30859375, + "learning_rate": 0.0003834038173432292, + "loss": 2.288, + "num_input_tokens_seen": 338780160, + "step": 5170, + "train_runtime": 93280.87, + "train_tokens_per_second": 3631.829 + }, + { + "epoch": 1.3678859104714116, + "grad_norm": 0.30078125, + "learning_rate": 0.00038295143848981566, + "loss": 2.3067, + "num_input_tokens_seen": 339435520, + "step": 5180, + "train_runtime": 93398.4602, + "train_tokens_per_second": 3634.273 + }, + { + "epoch": 1.3705268717813284, + "grad_norm": 0.287109375, + "learning_rate": 0.00038249845182105257, + "loss": 2.2299, + "num_input_tokens_seen": 340090880, + "step": 5190, + "train_runtime": 93516.5785, + "train_tokens_per_second": 3636.691 + }, + { + "epoch": 1.3731678330912453, + "grad_norm": 0.30078125, + "learning_rate": 0.0003820448594078635, + "loss": 2.2228, + "num_input_tokens_seen": 340746240, + "step": 5200, + "train_runtime": 93635.4301, + "train_tokens_per_second": 3639.074 + }, + { + "epoch": 1.375808794401162, + "grad_norm": 0.333984375, + "learning_rate": 0.0003815906633239411, + "loss": 2.2324, + "num_input_tokens_seen": 341401600, + "step": 5210, + "train_runtime": 93753.8366, + "train_tokens_per_second": 3641.468 + }, + { + "epoch": 1.3784497557110789, + "grad_norm": 0.298828125, + "learning_rate": 0.000381135865645738, + "loss": 2.1961, + "num_input_tokens_seen": 342056960, + "step": 5220, + "train_runtime": 93871.0038, + "train_tokens_per_second": 3643.904 + }, + { + "epoch": 1.3810907170209956, + "grad_norm": 0.298828125, + "learning_rate": 0.0003806804684524568, + "loss": 2.2648, + "num_input_tokens_seen": 342712320, + "step": 5230, + "train_runtime": 93989.121, + "train_tokens_per_second": 3646.298 + }, + { + "epoch": 1.3837316783309124, + "grad_norm": 0.306640625, + "learning_rate": 0.0003802244738260414, + "loss": 2.3501, + "num_input_tokens_seen": 343367680, + "step": 5240, + "train_runtime": 94107.614, + "train_tokens_per_second": 3648.671 + }, + { + "epoch": 1.3863726396408294, + "grad_norm": 0.318359375, + "learning_rate": 0.00037976788385116666, + "loss": 2.3171, + "num_input_tokens_seen": 344023040, + "step": 5250, + "train_runtime": 94227.9178, + "train_tokens_per_second": 3650.967 + }, + { + "epoch": 1.389013600950746, + "grad_norm": 0.3046875, + "learning_rate": 0.0003793107006152293, + "loss": 2.274, + "num_input_tokens_seen": 344678400, + "step": 5260, + "train_runtime": 94347.6, + "train_tokens_per_second": 3653.282 + }, + { + "epoch": 1.3916545622606629, + "grad_norm": 0.302734375, + "learning_rate": 0.00037885292620833827, + "loss": 2.3022, + "num_input_tokens_seen": 345333760, + "step": 5270, + "train_runtime": 94467.1961, + "train_tokens_per_second": 3655.594 + }, + { + "epoch": 1.3942955235705796, + "grad_norm": 0.3046875, + "learning_rate": 0.0003783945627233052, + "loss": 2.303, + "num_input_tokens_seen": 345989120, + "step": 5280, + "train_runtime": 94586.2929, + "train_tokens_per_second": 3657.92 + }, + { + "epoch": 1.3969364848804964, + "grad_norm": 0.322265625, + "learning_rate": 0.0003779356122556347, + "loss": 2.2225, + "num_input_tokens_seen": 346644480, + "step": 5290, + "train_runtime": 94704.221, + "train_tokens_per_second": 3660.285 + }, + { + "epoch": 1.3995774461904134, + "grad_norm": 0.310546875, + "learning_rate": 0.0003774760769035148, + "loss": 2.2395, + "num_input_tokens_seen": 347299840, + "step": 5300, + "train_runtime": 94823.0943, + "train_tokens_per_second": 3662.608 + }, + { + "epoch": 1.40221840750033, + "grad_norm": 0.294921875, + "learning_rate": 0.00037701595876780795, + "loss": 2.2575, + "num_input_tokens_seen": 347955200, + "step": 5310, + "train_runtime": 94941.8464, + "train_tokens_per_second": 3664.93 + }, + { + "epoch": 1.4048593688102469, + "grad_norm": 0.294921875, + "learning_rate": 0.0003765552599520404, + "loss": 2.267, + "num_input_tokens_seen": 348610560, + "step": 5320, + "train_runtime": 95062.3926, + "train_tokens_per_second": 3667.176 + }, + { + "epoch": 1.4075003301201638, + "grad_norm": 0.30859375, + "learning_rate": 0.0003760939825623933, + "loss": 2.2573, + "num_input_tokens_seen": 349265920, + "step": 5330, + "train_runtime": 95181.5486, + "train_tokens_per_second": 3669.471 + }, + { + "epoch": 1.4101412914300806, + "grad_norm": 0.28125, + "learning_rate": 0.00037563212870769287, + "loss": 2.259, + "num_input_tokens_seen": 349921280, + "step": 5340, + "train_runtime": 95301.5728, + "train_tokens_per_second": 3671.726 + }, + { + "epoch": 1.4127822527399974, + "grad_norm": 0.287109375, + "learning_rate": 0.0003751697004994008, + "loss": 2.2742, + "num_input_tokens_seen": 350576640, + "step": 5350, + "train_runtime": 95419.7779, + "train_tokens_per_second": 3674.046 + }, + { + "epoch": 1.4154232140499141, + "grad_norm": 0.31640625, + "learning_rate": 0.0003747067000516044, + "loss": 2.2791, + "num_input_tokens_seen": 351232000, + "step": 5360, + "train_runtime": 95538.2795, + "train_tokens_per_second": 3676.348 + }, + { + "epoch": 1.4180641753598309, + "grad_norm": 0.29296875, + "learning_rate": 0.0003742431294810073, + "loss": 2.22, + "num_input_tokens_seen": 351887360, + "step": 5370, + "train_runtime": 95656.9994, + "train_tokens_per_second": 3678.637 + }, + { + "epoch": 1.4207051366697478, + "grad_norm": 0.30859375, + "learning_rate": 0.00037377899090691936, + "loss": 2.3062, + "num_input_tokens_seen": 352542720, + "step": 5380, + "train_runtime": 95775.7809, + "train_tokens_per_second": 3680.917 + }, + { + "epoch": 1.4233460979796646, + "grad_norm": 0.296875, + "learning_rate": 0.00037331428645124735, + "loss": 2.2998, + "num_input_tokens_seen": 353198080, + "step": 5390, + "train_runtime": 95896.8801, + "train_tokens_per_second": 3683.103 + }, + { + "epoch": 1.4259870592895814, + "grad_norm": 0.298828125, + "learning_rate": 0.0003728490182384851, + "loss": 2.285, + "num_input_tokens_seen": 353853440, + "step": 5400, + "train_runtime": 96017.8895, + "train_tokens_per_second": 3685.287 + }, + { + "epoch": 1.4286280205994983, + "grad_norm": 0.3125, + "learning_rate": 0.00037238318839570355, + "loss": 2.2884, + "num_input_tokens_seen": 354508800, + "step": 5410, + "train_runtime": 96137.8059, + "train_tokens_per_second": 3687.507 + }, + { + "epoch": 1.431268981909415, + "grad_norm": 0.30859375, + "learning_rate": 0.00037191679905254155, + "loss": 2.2288, + "num_input_tokens_seen": 355164160, + "step": 5420, + "train_runtime": 96256.2674, + "train_tokens_per_second": 3689.777 + }, + { + "epoch": 1.4339099432193319, + "grad_norm": 0.29296875, + "learning_rate": 0.00037144985234119555, + "loss": 2.2367, + "num_input_tokens_seen": 355819520, + "step": 5430, + "train_runtime": 96374.0976, + "train_tokens_per_second": 3692.066 + }, + { + "epoch": 1.4365509045292486, + "grad_norm": 0.2890625, + "learning_rate": 0.0003709823503964103, + "loss": 2.2153, + "num_input_tokens_seen": 356474880, + "step": 5440, + "train_runtime": 96493.1509, + "train_tokens_per_second": 3694.302 + }, + { + "epoch": 1.4391918658391654, + "grad_norm": 0.30859375, + "learning_rate": 0.0003705142953554689, + "loss": 2.259, + "num_input_tokens_seen": 357130240, + "step": 5450, + "train_runtime": 96611.7007, + "train_tokens_per_second": 3696.553 + }, + { + "epoch": 1.4418328271490823, + "grad_norm": 0.275390625, + "learning_rate": 0.00037004568935818295, + "loss": 2.2649, + "num_input_tokens_seen": 357785600, + "step": 5460, + "train_runtime": 96730.8038, + "train_tokens_per_second": 3698.776 + }, + { + "epoch": 1.444473788458999, + "grad_norm": 0.3046875, + "learning_rate": 0.000369576534546883, + "loss": 2.2521, + "num_input_tokens_seen": 358440960, + "step": 5470, + "train_runtime": 96851.4334, + "train_tokens_per_second": 3700.936 + }, + { + "epoch": 1.4471147497689159, + "grad_norm": 0.287109375, + "learning_rate": 0.00036910683306640846, + "loss": 2.2253, + "num_input_tokens_seen": 359096320, + "step": 5480, + "train_runtime": 96972.9536, + "train_tokens_per_second": 3703.056 + }, + { + "epoch": 1.4497557110788328, + "grad_norm": 0.296875, + "learning_rate": 0.00036863658706409806, + "loss": 2.2428, + "num_input_tokens_seen": 359751680, + "step": 5490, + "train_runtime": 97092.7885, + "train_tokens_per_second": 3705.236 + }, + { + "epoch": 1.4523966723887496, + "grad_norm": 0.310546875, + "learning_rate": 0.0003681657986897799, + "loss": 2.2753, + "num_input_tokens_seen": 360407040, + "step": 5500, + "train_runtime": 97211.0919, + "train_tokens_per_second": 3707.468 + }, + { + "epoch": 1.4550376336986663, + "grad_norm": 0.30078125, + "learning_rate": 0.00036769447009576164, + "loss": 2.2894, + "num_input_tokens_seen": 361062400, + "step": 5510, + "train_runtime": 97341.7182, + "train_tokens_per_second": 3709.226 + }, + { + "epoch": 1.457678595008583, + "grad_norm": 0.2890625, + "learning_rate": 0.0003672226034368207, + "loss": 2.2631, + "num_input_tokens_seen": 361717760, + "step": 5520, + "train_runtime": 97463.0139, + "train_tokens_per_second": 3711.334 + }, + { + "epoch": 1.4603195563184999, + "grad_norm": 0.29296875, + "learning_rate": 0.0003667502008701943, + "loss": 2.287, + "num_input_tokens_seen": 362373120, + "step": 5530, + "train_runtime": 97583.3565, + "train_tokens_per_second": 3713.473 + }, + { + "epoch": 1.4629605176284168, + "grad_norm": 0.294921875, + "learning_rate": 0.00036627726455556976, + "loss": 2.2132, + "num_input_tokens_seen": 363028480, + "step": 5540, + "train_runtime": 97702.306, + "train_tokens_per_second": 3715.659 + }, + { + "epoch": 1.4656014789383336, + "grad_norm": 0.302734375, + "learning_rate": 0.0003658037966550746, + "loss": 2.2597, + "num_input_tokens_seen": 363683840, + "step": 5550, + "train_runtime": 97820.7458, + "train_tokens_per_second": 3717.86 + }, + { + "epoch": 1.4682424402482503, + "grad_norm": 0.29296875, + "learning_rate": 0.00036532979933326626, + "loss": 2.2171, + "num_input_tokens_seen": 364339200, + "step": 5560, + "train_runtime": 97939.1074, + "train_tokens_per_second": 3720.058 + }, + { + "epoch": 1.470883401558167, + "grad_norm": 0.302734375, + "learning_rate": 0.000364855274757123, + "loss": 2.1656, + "num_input_tokens_seen": 364994560, + "step": 5570, + "train_runtime": 98059.806, + "train_tokens_per_second": 3722.163 + }, + { + "epoch": 1.4735243628680839, + "grad_norm": 0.306640625, + "learning_rate": 0.00036438022509603326, + "loss": 2.2898, + "num_input_tokens_seen": 365649920, + "step": 5580, + "train_runtime": 98180.6303, + "train_tokens_per_second": 3724.257 + }, + { + "epoch": 1.4761653241780008, + "grad_norm": 0.283203125, + "learning_rate": 0.00036390465252178597, + "loss": 2.2011, + "num_input_tokens_seen": 366305280, + "step": 5590, + "train_runtime": 98299.989, + "train_tokens_per_second": 3726.402 + }, + { + "epoch": 1.4788062854879176, + "grad_norm": 0.302734375, + "learning_rate": 0.00036342855920856086, + "loss": 2.2916, + "num_input_tokens_seen": 366960640, + "step": 5600, + "train_runtime": 98418.0582, + "train_tokens_per_second": 3728.591 + }, + { + "epoch": 1.4814472467978343, + "grad_norm": 0.298828125, + "learning_rate": 0.00036295194733291825, + "loss": 2.2948, + "num_input_tokens_seen": 367616000, + "step": 5610, + "train_runtime": 98537.2337, + "train_tokens_per_second": 3730.732 + }, + { + "epoch": 1.4840882081077513, + "grad_norm": 0.29296875, + "learning_rate": 0.00036247481907378915, + "loss": 2.2105, + "num_input_tokens_seen": 368271360, + "step": 5620, + "train_runtime": 98658.9979, + "train_tokens_per_second": 3732.77 + }, + { + "epoch": 1.486729169417668, + "grad_norm": 0.314453125, + "learning_rate": 0.0003619971766124653, + "loss": 2.2325, + "num_input_tokens_seen": 368926720, + "step": 5630, + "train_runtime": 98782.0276, + "train_tokens_per_second": 3734.755 + }, + { + "epoch": 1.4893701307275848, + "grad_norm": 0.291015625, + "learning_rate": 0.0003615190221325893, + "loss": 2.239, + "num_input_tokens_seen": 369582080, + "step": 5640, + "train_runtime": 98899.572, + "train_tokens_per_second": 3736.943 + }, + { + "epoch": 1.4920110920375016, + "grad_norm": 0.294921875, + "learning_rate": 0.0003610403578201445, + "loss": 2.226, + "num_input_tokens_seen": 370237440, + "step": 5650, + "train_runtime": 99018.6627, + "train_tokens_per_second": 3739.067 + }, + { + "epoch": 1.4946520533474184, + "grad_norm": 0.279296875, + "learning_rate": 0.00036056118586344504, + "loss": 2.2243, + "num_input_tokens_seen": 370892800, + "step": 5660, + "train_runtime": 99137.1022, + "train_tokens_per_second": 3741.211 + }, + { + "epoch": 1.4972930146573353, + "grad_norm": 0.296875, + "learning_rate": 0.00036008150845312595, + "loss": 2.202, + "num_input_tokens_seen": 371548160, + "step": 5670, + "train_runtime": 99254.9505, + "train_tokens_per_second": 3743.372 + }, + { + "epoch": 1.499933975967252, + "grad_norm": 0.32421875, + "learning_rate": 0.00035960132778213295, + "loss": 2.2371, + "num_input_tokens_seen": 372203520, + "step": 5680, + "train_runtime": 99373.1878, + "train_tokens_per_second": 3745.513 + }, + { + "epoch": 1.5025749372771688, + "grad_norm": 0.291015625, + "learning_rate": 0.00035912064604571247, + "loss": 2.2658, + "num_input_tokens_seen": 372858880, + "step": 5690, + "train_runtime": 99491.7647, + "train_tokens_per_second": 3747.636 + }, + { + "epoch": 1.5052158985870858, + "grad_norm": 0.3046875, + "learning_rate": 0.00035863946544140184, + "loss": 2.1924, + "num_input_tokens_seen": 373514240, + "step": 5700, + "train_runtime": 99610.9288, + "train_tokens_per_second": 3749.732 + }, + { + "epoch": 1.5078568598970024, + "grad_norm": 0.306640625, + "learning_rate": 0.00035815778816901904, + "loss": 2.256, + "num_input_tokens_seen": 374169600, + "step": 5710, + "train_runtime": 99730.1745, + "train_tokens_per_second": 3751.819 + }, + { + "epoch": 1.5104978212069193, + "grad_norm": 0.306640625, + "learning_rate": 0.00035767561643065257, + "loss": 2.1976, + "num_input_tokens_seen": 374824960, + "step": 5720, + "train_runtime": 99848.9497, + "train_tokens_per_second": 3753.92 + }, + { + "epoch": 1.513138782516836, + "grad_norm": 0.29296875, + "learning_rate": 0.0003571929524306515, + "loss": 2.2108, + "num_input_tokens_seen": 375480320, + "step": 5730, + "train_runtime": 99967.2615, + "train_tokens_per_second": 3756.033 + }, + { + "epoch": 1.5157797438267528, + "grad_norm": 0.294921875, + "learning_rate": 0.0003567097983756153, + "loss": 2.3204, + "num_input_tokens_seen": 376135680, + "step": 5740, + "train_runtime": 100085.7354, + "train_tokens_per_second": 3758.135 + }, + { + "epoch": 1.5184207051366698, + "grad_norm": 0.3046875, + "learning_rate": 0.00035622615647438425, + "loss": 2.2411, + "num_input_tokens_seen": 376791040, + "step": 5750, + "train_runtime": 100204.6838, + "train_tokens_per_second": 3760.214 + }, + { + "epoch": 1.5210616664465866, + "grad_norm": 0.29296875, + "learning_rate": 0.00035574202893802833, + "loss": 2.1838, + "num_input_tokens_seen": 377446400, + "step": 5760, + "train_runtime": 100323.8015, + "train_tokens_per_second": 3762.282 + }, + { + "epoch": 1.5237026277565033, + "grad_norm": 0.279296875, + "learning_rate": 0.000355257417979838, + "loss": 2.28, + "num_input_tokens_seen": 378101760, + "step": 5770, + "train_runtime": 100444.7715, + "train_tokens_per_second": 3764.275 + }, + { + "epoch": 1.5263435890664203, + "grad_norm": 0.287109375, + "learning_rate": 0.0003547723258153138, + "loss": 2.2991, + "num_input_tokens_seen": 378757120, + "step": 5780, + "train_runtime": 100563.6927, + "train_tokens_per_second": 3766.341 + }, + { + "epoch": 1.5289845503763368, + "grad_norm": 0.32421875, + "learning_rate": 0.0003542867546621563, + "loss": 2.3799, + "num_input_tokens_seen": 379412480, + "step": 5790, + "train_runtime": 100682.5497, + "train_tokens_per_second": 3768.404 + }, + { + "epoch": 1.5316255116862538, + "grad_norm": 0.29296875, + "learning_rate": 0.0003538007067402556, + "loss": 2.2693, + "num_input_tokens_seen": 380067840, + "step": 5800, + "train_runtime": 100803.4359, + "train_tokens_per_second": 3770.386 + }, + { + "epoch": 1.5342664729961706, + "grad_norm": 0.29296875, + "learning_rate": 0.0003533141842716816, + "loss": 2.2448, + "num_input_tokens_seen": 380723200, + "step": 5810, + "train_runtime": 100922.0068, + "train_tokens_per_second": 3772.45 + }, + { + "epoch": 1.5369074343060873, + "grad_norm": 0.30078125, + "learning_rate": 0.0003528271894806737, + "loss": 2.2343, + "num_input_tokens_seen": 381378560, + "step": 5820, + "train_runtime": 101041.4132, + "train_tokens_per_second": 3774.478 + }, + { + "epoch": 1.5395483956160043, + "grad_norm": 0.3046875, + "learning_rate": 0.00035233972459363056, + "loss": 2.2402, + "num_input_tokens_seen": 382033920, + "step": 5830, + "train_runtime": 101159.5687, + "train_tokens_per_second": 3776.548 + }, + { + "epoch": 1.542189356925921, + "grad_norm": 0.287109375, + "learning_rate": 0.0003518517918391001, + "loss": 2.2691, + "num_input_tokens_seen": 382689280, + "step": 5840, + "train_runtime": 101278.5167, + "train_tokens_per_second": 3778.583 + }, + { + "epoch": 1.5448303182358378, + "grad_norm": 0.29296875, + "learning_rate": 0.000351363393447769, + "loss": 2.219, + "num_input_tokens_seen": 383344640, + "step": 5850, + "train_runtime": 101397.4895, + "train_tokens_per_second": 3780.613 + }, + { + "epoch": 1.5474712795457548, + "grad_norm": 0.291015625, + "learning_rate": 0.0003508745316524528, + "loss": 2.2402, + "num_input_tokens_seen": 384000000, + "step": 5860, + "train_runtime": 101516.2815, + "train_tokens_per_second": 3782.644 + }, + { + "epoch": 1.5501122408556713, + "grad_norm": 0.29296875, + "learning_rate": 0.00035038520868808573, + "loss": 2.2339, + "num_input_tokens_seen": 384655360, + "step": 5870, + "train_runtime": 101634.5334, + "train_tokens_per_second": 3784.692 + }, + { + "epoch": 1.5527532021655883, + "grad_norm": 0.30078125, + "learning_rate": 0.00034989542679171007, + "loss": 2.2025, + "num_input_tokens_seen": 385310720, + "step": 5880, + "train_runtime": 101752.9132, + "train_tokens_per_second": 3786.729 + }, + { + "epoch": 1.555394163475505, + "grad_norm": 0.306640625, + "learning_rate": 0.0003494051882024665, + "loss": 2.2256, + "num_input_tokens_seen": 385966080, + "step": 5890, + "train_runtime": 101871.2073, + "train_tokens_per_second": 3788.765 + }, + { + "epoch": 1.5580351247854218, + "grad_norm": 0.294921875, + "learning_rate": 0.00034891449516158326, + "loss": 2.1937, + "num_input_tokens_seen": 386621440, + "step": 5900, + "train_runtime": 101989.8338, + "train_tokens_per_second": 3790.784 + }, + { + "epoch": 1.5606760860953388, + "grad_norm": 0.298828125, + "learning_rate": 0.0003484233499123665, + "loss": 2.2708, + "num_input_tokens_seen": 387276800, + "step": 5910, + "train_runtime": 102110.8845, + "train_tokens_per_second": 3792.708 + }, + { + "epoch": 1.5633170474052556, + "grad_norm": 0.2890625, + "learning_rate": 0.0003479317547001895, + "loss": 2.2746, + "num_input_tokens_seen": 387932160, + "step": 5920, + "train_runtime": 102229.8036, + "train_tokens_per_second": 3794.707 + }, + { + "epoch": 1.5659580087151723, + "grad_norm": 0.28515625, + "learning_rate": 0.0003474397117724829, + "loss": 2.1972, + "num_input_tokens_seen": 388587520, + "step": 5930, + "train_runtime": 102348.6911, + "train_tokens_per_second": 3796.702 + }, + { + "epoch": 1.568598970025089, + "grad_norm": 0.291015625, + "learning_rate": 0.0003469472233787238, + "loss": 2.1999, + "num_input_tokens_seen": 389242880, + "step": 5940, + "train_runtime": 102466.9399, + "train_tokens_per_second": 3798.717 + }, + { + "epoch": 1.5712399313350058, + "grad_norm": 0.296875, + "learning_rate": 0.0003464542917704262, + "loss": 2.2364, + "num_input_tokens_seen": 389898240, + "step": 5950, + "train_runtime": 102585.4476, + "train_tokens_per_second": 3800.717 + }, + { + "epoch": 1.5738808926449228, + "grad_norm": 0.298828125, + "learning_rate": 0.0003459609192011301, + "loss": 2.2147, + "num_input_tokens_seen": 390553600, + "step": 5960, + "train_runtime": 102704.0399, + "train_tokens_per_second": 3802.709 + }, + { + "epoch": 1.5765218539548396, + "grad_norm": 0.302734375, + "learning_rate": 0.00034546710792639164, + "loss": 2.2406, + "num_input_tokens_seen": 391208960, + "step": 5970, + "train_runtime": 102822.2437, + "train_tokens_per_second": 3804.711 + }, + { + "epoch": 1.5791628152647563, + "grad_norm": 0.283203125, + "learning_rate": 0.00034497286020377245, + "loss": 2.2814, + "num_input_tokens_seen": 391864320, + "step": 5980, + "train_runtime": 102940.9894, + "train_tokens_per_second": 3806.689 + }, + { + "epoch": 1.5818037765746733, + "grad_norm": 0.29296875, + "learning_rate": 0.00034447817829282945, + "loss": 2.2857, + "num_input_tokens_seen": 392519680, + "step": 5990, + "train_runtime": 103059.3455, + "train_tokens_per_second": 3808.676 + }, + { + "epoch": 1.5844447378845898, + "grad_norm": 0.2890625, + "learning_rate": 0.0003439830644551048, + "loss": 2.2429, + "num_input_tokens_seen": 393175040, + "step": 6000, + "train_runtime": 103177.906, + "train_tokens_per_second": 3810.651 + }, + { + "epoch": 1.5870856991945068, + "grad_norm": 0.294921875, + "learning_rate": 0.00034348752095411493, + "loss": 2.2563, + "num_input_tokens_seen": 393830400, + "step": 6010, + "train_runtime": 103310.1462, + "train_tokens_per_second": 3812.117 + }, + { + "epoch": 1.5897266605044236, + "grad_norm": 0.310546875, + "learning_rate": 0.00034299155005534086, + "loss": 2.2445, + "num_input_tokens_seen": 394485760, + "step": 6020, + "train_runtime": 103429.4102, + "train_tokens_per_second": 3814.058 + }, + { + "epoch": 1.5923676218143403, + "grad_norm": 0.318359375, + "learning_rate": 0.00034249515402621746, + "loss": 2.2211, + "num_input_tokens_seen": 395141120, + "step": 6030, + "train_runtime": 103548.2987, + "train_tokens_per_second": 3816.008 + }, + { + "epoch": 1.5950085831242573, + "grad_norm": 0.34765625, + "learning_rate": 0.000341998335136123, + "loss": 2.3039, + "num_input_tokens_seen": 395796480, + "step": 6040, + "train_runtime": 103666.3992, + "train_tokens_per_second": 3817.982 + }, + { + "epoch": 1.597649544434174, + "grad_norm": 0.30078125, + "learning_rate": 0.00034150109565636924, + "loss": 2.1969, + "num_input_tokens_seen": 396451840, + "step": 6050, + "train_runtime": 103786.543, + "train_tokens_per_second": 3819.877 + }, + { + "epoch": 1.6002905057440908, + "grad_norm": 0.279296875, + "learning_rate": 0.0003410034378601906, + "loss": 2.2174, + "num_input_tokens_seen": 397107200, + "step": 6060, + "train_runtime": 103907.9855, + "train_tokens_per_second": 3821.72 + }, + { + "epoch": 1.6029314670540078, + "grad_norm": 0.296875, + "learning_rate": 0.00034050536402273384, + "loss": 2.2401, + "num_input_tokens_seen": 397762560, + "step": 6070, + "train_runtime": 104028.9442, + "train_tokens_per_second": 3823.576 + }, + { + "epoch": 1.6055724283639243, + "grad_norm": 0.28125, + "learning_rate": 0.000340006876421048, + "loss": 2.2266, + "num_input_tokens_seen": 398417920, + "step": 6080, + "train_runtime": 104149.3397, + "train_tokens_per_second": 3825.448 + }, + { + "epoch": 1.6082133896738413, + "grad_norm": 0.291015625, + "learning_rate": 0.00033950797733407344, + "loss": 2.1852, + "num_input_tokens_seen": 399073280, + "step": 6090, + "train_runtime": 104268.6612, + "train_tokens_per_second": 3827.356 + }, + { + "epoch": 1.610854350983758, + "grad_norm": 0.302734375, + "learning_rate": 0.00033900866904263186, + "loss": 2.2782, + "num_input_tokens_seen": 399728640, + "step": 6100, + "train_runtime": 104386.4519, + "train_tokens_per_second": 3829.315 + }, + { + "epoch": 1.6134953122936748, + "grad_norm": 0.298828125, + "learning_rate": 0.0003385089538294158, + "loss": 2.276, + "num_input_tokens_seen": 400384000, + "step": 6110, + "train_runtime": 104505.1118, + "train_tokens_per_second": 3831.238 + }, + { + "epoch": 1.6161362736035918, + "grad_norm": 0.298828125, + "learning_rate": 0.0003380088339789779, + "loss": 2.2243, + "num_input_tokens_seen": 401039360, + "step": 6120, + "train_runtime": 104626.5766, + "train_tokens_per_second": 3833.054 + }, + { + "epoch": 1.6187772349135086, + "grad_norm": 0.306640625, + "learning_rate": 0.00033750831177772076, + "loss": 2.2221, + "num_input_tokens_seen": 401694720, + "step": 6130, + "train_runtime": 104744.9254, + "train_tokens_per_second": 3834.98 + }, + { + "epoch": 1.6214181962234253, + "grad_norm": 0.29296875, + "learning_rate": 0.0003370073895138866, + "loss": 2.2769, + "num_input_tokens_seen": 402350080, + "step": 6140, + "train_runtime": 104863.6451, + "train_tokens_per_second": 3836.888 + }, + { + "epoch": 1.6240591575333423, + "grad_norm": 0.2890625, + "learning_rate": 0.00033650606947754647, + "loss": 2.2258, + "num_input_tokens_seen": 403005440, + "step": 6150, + "train_runtime": 104981.952, + "train_tokens_per_second": 3838.807 + }, + { + "epoch": 1.6267001188432588, + "grad_norm": 0.291015625, + "learning_rate": 0.00033600435396058994, + "loss": 2.2582, + "num_input_tokens_seen": 403660800, + "step": 6160, + "train_runtime": 105099.9364, + "train_tokens_per_second": 3840.733 + }, + { + "epoch": 1.6293410801531758, + "grad_norm": 0.28515625, + "learning_rate": 0.0003355022452567144, + "loss": 2.2351, + "num_input_tokens_seen": 404316160, + "step": 6170, + "train_runtime": 105221.7088, + "train_tokens_per_second": 3842.517 + }, + { + "epoch": 1.6319820414630926, + "grad_norm": 0.30859375, + "learning_rate": 0.0003349997456614152, + "loss": 2.2935, + "num_input_tokens_seen": 404971520, + "step": 6180, + "train_runtime": 105341.9161, + "train_tokens_per_second": 3844.353 + }, + { + "epoch": 1.6346230027730093, + "grad_norm": 0.283203125, + "learning_rate": 0.0003344968574719744, + "loss": 2.1989, + "num_input_tokens_seen": 405626880, + "step": 6190, + "train_runtime": 105460.0525, + "train_tokens_per_second": 3846.261 + }, + { + "epoch": 1.6372639640829263, + "grad_norm": 0.29296875, + "learning_rate": 0.00033399358298745067, + "loss": 2.2285, + "num_input_tokens_seen": 406282240, + "step": 6200, + "train_runtime": 105578.4506, + "train_tokens_per_second": 3848.155 + }, + { + "epoch": 1.639904925392843, + "grad_norm": 0.294921875, + "learning_rate": 0.0003334899245086687, + "loss": 2.1826, + "num_input_tokens_seen": 406937600, + "step": 6210, + "train_runtime": 105697.191, + "train_tokens_per_second": 3850.032 + }, + { + "epoch": 1.6425458867027598, + "grad_norm": 0.30859375, + "learning_rate": 0.0003329858843382089, + "loss": 2.2519, + "num_input_tokens_seen": 407592960, + "step": 6220, + "train_runtime": 105815.1638, + "train_tokens_per_second": 3851.933 + }, + { + "epoch": 1.6451868480126766, + "grad_norm": 0.298828125, + "learning_rate": 0.0003324814647803962, + "loss": 2.2557, + "num_input_tokens_seen": 408248320, + "step": 6230, + "train_runtime": 105934.0778, + "train_tokens_per_second": 3853.796 + }, + { + "epoch": 1.6478278093225933, + "grad_norm": 0.279296875, + "learning_rate": 0.00033197666814129044, + "loss": 2.1799, + "num_input_tokens_seen": 408903680, + "step": 6240, + "train_runtime": 106052.3659, + "train_tokens_per_second": 3855.677 + }, + { + "epoch": 1.6504687706325103, + "grad_norm": 0.287109375, + "learning_rate": 0.0003314714967286753, + "loss": 2.267, + "num_input_tokens_seen": 409559040, + "step": 6250, + "train_runtime": 106172.7773, + "train_tokens_per_second": 3857.477 + }, + { + "epoch": 1.653109731942427, + "grad_norm": 0.283203125, + "learning_rate": 0.00033096595285204755, + "loss": 2.276, + "num_input_tokens_seen": 410214400, + "step": 6260, + "train_runtime": 106295.8445, + "train_tokens_per_second": 3859.176 + }, + { + "epoch": 1.6557506932523438, + "grad_norm": 0.291015625, + "learning_rate": 0.00033046003882260694, + "loss": 2.2193, + "num_input_tokens_seen": 410869760, + "step": 6270, + "train_runtime": 106414.384, + "train_tokens_per_second": 3861.036 + }, + { + "epoch": 1.6583916545622608, + "grad_norm": 0.283203125, + "learning_rate": 0.00032995375695324544, + "loss": 2.2459, + "num_input_tokens_seen": 411525120, + "step": 6280, + "train_runtime": 106532.7579, + "train_tokens_per_second": 3862.897 + }, + { + "epoch": 1.6610326158721773, + "grad_norm": 0.29296875, + "learning_rate": 0.00032944710955853663, + "loss": 2.2003, + "num_input_tokens_seen": 412180480, + "step": 6290, + "train_runtime": 106651.5193, + "train_tokens_per_second": 3864.741 + }, + { + "epoch": 1.6636735771820943, + "grad_norm": 0.302734375, + "learning_rate": 0.00032894009895472533, + "loss": 2.2881, + "num_input_tokens_seen": 412835840, + "step": 6300, + "train_runtime": 106771.4124, + "train_tokens_per_second": 3866.539 + }, + { + "epoch": 1.666314538492011, + "grad_norm": 0.298828125, + "learning_rate": 0.00032843272745971646, + "loss": 2.1741, + "num_input_tokens_seen": 413491200, + "step": 6310, + "train_runtime": 106890.3921, + "train_tokens_per_second": 3868.366 + }, + { + "epoch": 1.6689554998019278, + "grad_norm": 0.27734375, + "learning_rate": 0.00032792499739306533, + "loss": 2.2267, + "num_input_tokens_seen": 414146560, + "step": 6320, + "train_runtime": 107008.467, + "train_tokens_per_second": 3870.222 + }, + { + "epoch": 1.6715964611118448, + "grad_norm": 0.3046875, + "learning_rate": 0.00032741691107596616, + "loss": 2.2488, + "num_input_tokens_seen": 414801920, + "step": 6330, + "train_runtime": 107129.579, + "train_tokens_per_second": 3871.964 + }, + { + "epoch": 1.6742374224217615, + "grad_norm": 0.291015625, + "learning_rate": 0.0003269084708312421, + "loss": 2.269, + "num_input_tokens_seen": 415457280, + "step": 6340, + "train_runtime": 107250.1062, + "train_tokens_per_second": 3873.724 + }, + { + "epoch": 1.6768783837316783, + "grad_norm": 0.294921875, + "learning_rate": 0.0003263996789833341, + "loss": 2.2739, + "num_input_tokens_seen": 416112640, + "step": 6350, + "train_runtime": 107368.668, + "train_tokens_per_second": 3875.55 + }, + { + "epoch": 1.6795193450415953, + "grad_norm": 0.283203125, + "learning_rate": 0.0003258905378582907, + "loss": 2.2184, + "num_input_tokens_seen": 416768000, + "step": 6360, + "train_runtime": 107486.5584, + "train_tokens_per_second": 3877.396 + }, + { + "epoch": 1.6821603063515118, + "grad_norm": 0.283203125, + "learning_rate": 0.0003253810497837572, + "loss": 2.2112, + "num_input_tokens_seen": 417423360, + "step": 6370, + "train_runtime": 107604.7235, + "train_tokens_per_second": 3879.229 + }, + { + "epoch": 1.6848012676614288, + "grad_norm": 0.296875, + "learning_rate": 0.0003248712170889651, + "loss": 2.1952, + "num_input_tokens_seen": 418078720, + "step": 6380, + "train_runtime": 107722.9922, + "train_tokens_per_second": 3881.054 + }, + { + "epoch": 1.6874422289713455, + "grad_norm": 0.29296875, + "learning_rate": 0.0003243610421047213, + "loss": 2.2812, + "num_input_tokens_seen": 418734080, + "step": 6390, + "train_runtime": 107840.9421, + "train_tokens_per_second": 3882.886 + }, + { + "epoch": 1.6900831902812623, + "grad_norm": 0.310546875, + "learning_rate": 0.0003238505271633975, + "loss": 2.2397, + "num_input_tokens_seen": 419389440, + "step": 6400, + "train_runtime": 107959.8287, + "train_tokens_per_second": 3884.68 + }, + { + "epoch": 1.6927241515911793, + "grad_norm": 0.28125, + "learning_rate": 0.00032333967459892, + "loss": 2.1701, + "num_input_tokens_seen": 420044800, + "step": 6410, + "train_runtime": 108078.5028, + "train_tokens_per_second": 3886.479 + }, + { + "epoch": 1.695365112901096, + "grad_norm": 0.28125, + "learning_rate": 0.00032282848674675796, + "loss": 2.2112, + "num_input_tokens_seen": 420700160, + "step": 6420, + "train_runtime": 108196.7229, + "train_tokens_per_second": 3888.289 + }, + { + "epoch": 1.6980060742110128, + "grad_norm": 0.2890625, + "learning_rate": 0.00032231696594391395, + "loss": 2.2086, + "num_input_tokens_seen": 421355520, + "step": 6430, + "train_runtime": 108315.3892, + "train_tokens_per_second": 3890.08 + }, + { + "epoch": 1.7006470355209298, + "grad_norm": 0.283203125, + "learning_rate": 0.0003218051145289124, + "loss": 2.2006, + "num_input_tokens_seen": 422010880, + "step": 6440, + "train_runtime": 108435.7939, + "train_tokens_per_second": 3891.804 + }, + { + "epoch": 1.7032879968308463, + "grad_norm": 0.3046875, + "learning_rate": 0.00032129293484178925, + "loss": 2.3335, + "num_input_tokens_seen": 422666240, + "step": 6450, + "train_runtime": 108553.8734, + "train_tokens_per_second": 3893.608 + }, + { + "epoch": 1.7059289581407633, + "grad_norm": 0.294921875, + "learning_rate": 0.0003207804292240812, + "loss": 2.2537, + "num_input_tokens_seen": 423321600, + "step": 6460, + "train_runtime": 108672.8996, + "train_tokens_per_second": 3895.374 + }, + { + "epoch": 1.70856991945068, + "grad_norm": 0.287109375, + "learning_rate": 0.00032026760001881507, + "loss": 2.1872, + "num_input_tokens_seen": 423976960, + "step": 6470, + "train_runtime": 108793.2483, + "train_tokens_per_second": 3897.089 + }, + { + "epoch": 1.7112108807605968, + "grad_norm": 0.296875, + "learning_rate": 0.000319754449570497, + "loss": 2.1641, + "num_input_tokens_seen": 424632320, + "step": 6480, + "train_runtime": 108912.3022, + "train_tokens_per_second": 3898.846 + }, + { + "epoch": 1.7138518420705138, + "grad_norm": 0.287109375, + "learning_rate": 0.00031924098022510165, + "loss": 2.198, + "num_input_tokens_seen": 425287680, + "step": 6490, + "train_runtime": 109031.1858, + "train_tokens_per_second": 3900.606 + }, + { + "epoch": 1.7164928033804305, + "grad_norm": 0.2890625, + "learning_rate": 0.0003187271943300618, + "loss": 2.2146, + "num_input_tokens_seen": 425943040, + "step": 6500, + "train_runtime": 109152.7442, + "train_tokens_per_second": 3902.266 + }, + { + "epoch": 1.7191337646903473, + "grad_norm": 0.30859375, + "learning_rate": 0.0003182130942342573, + "loss": 2.2801, + "num_input_tokens_seen": 426598400, + "step": 6510, + "train_runtime": 109285.8918, + "train_tokens_per_second": 3903.508 + }, + { + "epoch": 1.721774726000264, + "grad_norm": 0.29296875, + "learning_rate": 0.00031769868228800435, + "loss": 2.2058, + "num_input_tokens_seen": 427253760, + "step": 6520, + "train_runtime": 109405.4925, + "train_tokens_per_second": 3905.231 + }, + { + "epoch": 1.7244156873101808, + "grad_norm": 0.283203125, + "learning_rate": 0.0003171839608430449, + "loss": 2.1824, + "num_input_tokens_seen": 427909120, + "step": 6530, + "train_runtime": 109526.7432, + "train_tokens_per_second": 3906.892 + }, + { + "epoch": 1.7270566486200978, + "grad_norm": 0.287109375, + "learning_rate": 0.000316668932252536, + "loss": 2.2827, + "num_input_tokens_seen": 428564480, + "step": 6540, + "train_runtime": 109645.4766, + "train_tokens_per_second": 3908.638 + }, + { + "epoch": 1.7296976099300145, + "grad_norm": 0.28515625, + "learning_rate": 0.00031615359887103854, + "loss": 2.2298, + "num_input_tokens_seen": 429219840, + "step": 6550, + "train_runtime": 109763.7338, + "train_tokens_per_second": 3910.398 + }, + { + "epoch": 1.7323385712399313, + "grad_norm": 0.2734375, + "learning_rate": 0.0003156379630545072, + "loss": 2.2086, + "num_input_tokens_seen": 429875200, + "step": 6560, + "train_runtime": 109882.5111, + "train_tokens_per_second": 3912.135 + }, + { + "epoch": 1.7349795325498483, + "grad_norm": 0.310546875, + "learning_rate": 0.0003151220271602789, + "loss": 2.2049, + "num_input_tokens_seen": 430530560, + "step": 6570, + "train_runtime": 110003.8157, + "train_tokens_per_second": 3913.778 + }, + { + "epoch": 1.7376204938597648, + "grad_norm": 0.306640625, + "learning_rate": 0.0003146057935470628, + "loss": 2.1743, + "num_input_tokens_seen": 431185920, + "step": 6580, + "train_runtime": 110124.2541, + "train_tokens_per_second": 3915.449 + }, + { + "epoch": 1.7402614551696818, + "grad_norm": 0.298828125, + "learning_rate": 0.00031408926457492895, + "loss": 2.2296, + "num_input_tokens_seen": 431841280, + "step": 6590, + "train_runtime": 110243.1176, + "train_tokens_per_second": 3917.172 + }, + { + "epoch": 1.7429024164795985, + "grad_norm": 0.287109375, + "learning_rate": 0.0003135724426052978, + "loss": 2.1756, + "num_input_tokens_seen": 432496640, + "step": 6600, + "train_runtime": 110363.0718, + "train_tokens_per_second": 3918.853 + }, + { + "epoch": 1.7455433777895153, + "grad_norm": 0.302734375, + "learning_rate": 0.0003130553300009291, + "loss": 2.1728, + "num_input_tokens_seen": 433152000, + "step": 6610, + "train_runtime": 110481.7805, + "train_tokens_per_second": 3920.574 + }, + { + "epoch": 1.7481843390994323, + "grad_norm": 0.287109375, + "learning_rate": 0.00031253792912591167, + "loss": 2.2178, + "num_input_tokens_seen": 433807360, + "step": 6620, + "train_runtime": 110600.6749, + "train_tokens_per_second": 3922.285 + }, + { + "epoch": 1.750825300409349, + "grad_norm": 0.279296875, + "learning_rate": 0.00031202024234565183, + "loss": 2.1999, + "num_input_tokens_seen": 434462720, + "step": 6630, + "train_runtime": 110719.7179, + "train_tokens_per_second": 3923.987 + }, + { + "epoch": 1.7534662617192658, + "grad_norm": 0.296875, + "learning_rate": 0.000311502272026863, + "loss": 2.1903, + "num_input_tokens_seen": 435118080, + "step": 6640, + "train_runtime": 110838.6311, + "train_tokens_per_second": 3925.69 + }, + { + "epoch": 1.7561072230291828, + "grad_norm": 0.294921875, + "learning_rate": 0.0003109840205375553, + "loss": 2.2394, + "num_input_tokens_seen": 435773440, + "step": 6650, + "train_runtime": 110957.3935, + "train_tokens_per_second": 3927.394 + }, + { + "epoch": 1.7587481843390993, + "grad_norm": 0.27734375, + "learning_rate": 0.0003104654902470238, + "loss": 2.2375, + "num_input_tokens_seen": 436428800, + "step": 6660, + "train_runtime": 111076.0541, + "train_tokens_per_second": 3929.099 + }, + { + "epoch": 1.7613891456490163, + "grad_norm": 0.294921875, + "learning_rate": 0.00030994668352583827, + "loss": 2.1452, + "num_input_tokens_seen": 437084160, + "step": 6670, + "train_runtime": 111195.0803, + "train_tokens_per_second": 3930.787 + }, + { + "epoch": 1.764030106958933, + "grad_norm": 0.28515625, + "learning_rate": 0.0003094276027458324, + "loss": 2.2098, + "num_input_tokens_seen": 437739520, + "step": 6680, + "train_runtime": 111313.5793, + "train_tokens_per_second": 3932.49 + }, + { + "epoch": 1.7666710682688498, + "grad_norm": 0.28125, + "learning_rate": 0.00030890825028009265, + "loss": 2.1948, + "num_input_tokens_seen": 438394880, + "step": 6690, + "train_runtime": 111432.7597, + "train_tokens_per_second": 3934.165 + }, + { + "epoch": 1.7693120295787668, + "grad_norm": 0.28515625, + "learning_rate": 0.00030838862850294775, + "loss": 2.1585, + "num_input_tokens_seen": 439050240, + "step": 6700, + "train_runtime": 111551.5656, + "train_tokens_per_second": 3935.85 + }, + { + "epoch": 1.7719529908886835, + "grad_norm": 0.287109375, + "learning_rate": 0.00030786873978995725, + "loss": 2.2294, + "num_input_tokens_seen": 439705600, + "step": 6710, + "train_runtime": 111670.9152, + "train_tokens_per_second": 3937.512 + }, + { + "epoch": 1.7745939521986003, + "grad_norm": 0.267578125, + "learning_rate": 0.00030734858651790156, + "loss": 2.0915, + "num_input_tokens_seen": 440360960, + "step": 6720, + "train_runtime": 111789.6956, + "train_tokens_per_second": 3939.191 + }, + { + "epoch": 1.7772349135085173, + "grad_norm": 0.314453125, + "learning_rate": 0.00030682817106477013, + "loss": 2.2075, + "num_input_tokens_seen": 441016320, + "step": 6730, + "train_runtime": 111908.1352, + "train_tokens_per_second": 3940.878 + }, + { + "epoch": 1.7798758748184338, + "grad_norm": 0.298828125, + "learning_rate": 0.00030630749580975124, + "loss": 2.2005, + "num_input_tokens_seen": 441671680, + "step": 6740, + "train_runtime": 112027.1622, + "train_tokens_per_second": 3942.541 + }, + { + "epoch": 1.7825168361283508, + "grad_norm": 0.310546875, + "learning_rate": 0.0003057865631332209, + "loss": 2.1084, + "num_input_tokens_seen": 442327040, + "step": 6750, + "train_runtime": 112146.1056, + "train_tokens_per_second": 3944.203 + }, + { + "epoch": 1.7851577974382675, + "grad_norm": 0.283203125, + "learning_rate": 0.0003052653754167319, + "loss": 2.1721, + "num_input_tokens_seen": 442982400, + "step": 6760, + "train_runtime": 112265.1664, + "train_tokens_per_second": 3945.858 + }, + { + "epoch": 1.7877987587481843, + "grad_norm": 0.291015625, + "learning_rate": 0.0003047439350430033, + "loss": 2.1935, + "num_input_tokens_seen": 443637760, + "step": 6770, + "train_runtime": 112384.0032, + "train_tokens_per_second": 3947.517 + }, + { + "epoch": 1.7904397200581013, + "grad_norm": 0.28515625, + "learning_rate": 0.0003042222443959087, + "loss": 2.1449, + "num_input_tokens_seen": 444293120, + "step": 6780, + "train_runtime": 112503.3088, + "train_tokens_per_second": 3949.156 + }, + { + "epoch": 1.7930806813680178, + "grad_norm": 0.27734375, + "learning_rate": 0.0003037003058604663, + "loss": 2.2176, + "num_input_tokens_seen": 444948480, + "step": 6790, + "train_runtime": 112622.2703, + "train_tokens_per_second": 3950.804 + }, + { + "epoch": 1.7957216426779348, + "grad_norm": 0.291015625, + "learning_rate": 0.00030317812182282746, + "loss": 2.1583, + "num_input_tokens_seen": 445603840, + "step": 6800, + "train_runtime": 112742.9301, + "train_tokens_per_second": 3952.388 + }, + { + "epoch": 1.7983626039878515, + "grad_norm": 0.287109375, + "learning_rate": 0.0003026556946702659, + "loss": 2.1611, + "num_input_tokens_seen": 446259200, + "step": 6810, + "train_runtime": 112861.2253, + "train_tokens_per_second": 3954.052 + }, + { + "epoch": 1.8010035652977683, + "grad_norm": 0.30078125, + "learning_rate": 0.00030213302679116656, + "loss": 2.1791, + "num_input_tokens_seen": 446914560, + "step": 6820, + "train_runtime": 112980.1671, + "train_tokens_per_second": 3955.69 + }, + { + "epoch": 1.8036445266076853, + "grad_norm": 0.306640625, + "learning_rate": 0.0003016101205750154, + "loss": 2.2046, + "num_input_tokens_seen": 447569920, + "step": 6830, + "train_runtime": 113098.8089, + "train_tokens_per_second": 3957.335 + }, + { + "epoch": 1.806285487917602, + "grad_norm": 0.287109375, + "learning_rate": 0.0003010869784123876, + "loss": 2.2256, + "num_input_tokens_seen": 448225280, + "step": 6840, + "train_runtime": 113218.0517, + "train_tokens_per_second": 3958.956 + }, + { + "epoch": 1.8089264492275188, + "grad_norm": 0.287109375, + "learning_rate": 0.00030056360269493715, + "loss": 2.1948, + "num_input_tokens_seen": 448880640, + "step": 6850, + "train_runtime": 113337.3183, + "train_tokens_per_second": 3960.572 + }, + { + "epoch": 1.8115674105374358, + "grad_norm": 0.27734375, + "learning_rate": 0.0003000399958153857, + "loss": 2.2073, + "num_input_tokens_seen": 449536000, + "step": 6860, + "train_runtime": 113456.1827, + "train_tokens_per_second": 3962.199 + }, + { + "epoch": 1.8142083718473523, + "grad_norm": 0.28125, + "learning_rate": 0.00029951616016751195, + "loss": 2.2603, + "num_input_tokens_seen": 450191360, + "step": 6870, + "train_runtime": 113574.9211, + "train_tokens_per_second": 3963.827 + }, + { + "epoch": 1.8168493331572693, + "grad_norm": 0.287109375, + "learning_rate": 0.0002989920981461401, + "loss": 2.2381, + "num_input_tokens_seen": 450846720, + "step": 6880, + "train_runtime": 113693.497, + "train_tokens_per_second": 3965.457 + }, + { + "epoch": 1.819490294467186, + "grad_norm": 0.2890625, + "learning_rate": 0.0002984678121471296, + "loss": 2.1604, + "num_input_tokens_seen": 451502080, + "step": 6890, + "train_runtime": 113812.7739, + "train_tokens_per_second": 3967.06 + }, + { + "epoch": 1.8221312557771028, + "grad_norm": 0.31640625, + "learning_rate": 0.00029794330456736363, + "loss": 2.1744, + "num_input_tokens_seen": 452157440, + "step": 6900, + "train_runtime": 113931.679, + "train_tokens_per_second": 3968.672 + }, + { + "epoch": 1.8247722170870198, + "grad_norm": 0.291015625, + "learning_rate": 0.00029741857780473855, + "loss": 2.1531, + "num_input_tokens_seen": 452812800, + "step": 6910, + "train_runtime": 114052.0605, + "train_tokens_per_second": 3970.229 + }, + { + "epoch": 1.8274131783969365, + "grad_norm": 0.28125, + "learning_rate": 0.00029689363425815246, + "loss": 2.2166, + "num_input_tokens_seen": 453468160, + "step": 6920, + "train_runtime": 114170.4795, + "train_tokens_per_second": 3971.851 + }, + { + "epoch": 1.8300541397068533, + "grad_norm": 0.28515625, + "learning_rate": 0.0002963684763274949, + "loss": 2.1864, + "num_input_tokens_seen": 454123520, + "step": 6930, + "train_runtime": 114289.4159, + "train_tokens_per_second": 3973.452 + }, + { + "epoch": 1.8326951010167702, + "grad_norm": 0.28515625, + "learning_rate": 0.00029584310641363534, + "loss": 2.1412, + "num_input_tokens_seen": 454778880, + "step": 6940, + "train_runtime": 114408.26, + "train_tokens_per_second": 3975.053 + }, + { + "epoch": 1.8353360623266868, + "grad_norm": 0.294921875, + "learning_rate": 0.00029531752691841235, + "loss": 2.1573, + "num_input_tokens_seen": 455434240, + "step": 6950, + "train_runtime": 114528.955, + "train_tokens_per_second": 3976.586 + }, + { + "epoch": 1.8379770236366038, + "grad_norm": 0.275390625, + "learning_rate": 0.00029479174024462274, + "loss": 2.1641, + "num_input_tokens_seen": 456089600, + "step": 6960, + "train_runtime": 114647.3935, + "train_tokens_per_second": 3978.194 + }, + { + "epoch": 1.8406179849465205, + "grad_norm": 0.287109375, + "learning_rate": 0.0002942657487960103, + "loss": 2.1352, + "num_input_tokens_seen": 456744960, + "step": 6970, + "train_runtime": 114766.5021, + "train_tokens_per_second": 3979.776 + }, + { + "epoch": 1.8432589462564373, + "grad_norm": 0.28515625, + "learning_rate": 0.0002937395549772553, + "loss": 2.2016, + "num_input_tokens_seen": 457400320, + "step": 6980, + "train_runtime": 114886.3677, + "train_tokens_per_second": 3981.328 + }, + { + "epoch": 1.8458999075663542, + "grad_norm": 0.287109375, + "learning_rate": 0.00029321316119396287, + "loss": 2.1595, + "num_input_tokens_seen": 458055680, + "step": 6990, + "train_runtime": 115007.1447, + "train_tokens_per_second": 3982.845 + }, + { + "epoch": 1.848540868876271, + "grad_norm": 0.306640625, + "learning_rate": 0.0002926865698526524, + "loss": 2.2249, + "num_input_tokens_seen": 458711040, + "step": 7000, + "train_runtime": 115126.0467, + "train_tokens_per_second": 3984.424 + }, + { + "epoch": 1.8511818301861878, + "grad_norm": 0.306640625, + "learning_rate": 0.00029215978336074666, + "loss": 2.1877, + "num_input_tokens_seen": 459366400, + "step": 7010, + "train_runtime": 115258.2273, + "train_tokens_per_second": 3985.541 + }, + { + "epoch": 1.8538227914961047, + "grad_norm": 0.287109375, + "learning_rate": 0.0002916328041265604, + "loss": 2.2082, + "num_input_tokens_seen": 460021760, + "step": 7020, + "train_runtime": 115376.4672, + "train_tokens_per_second": 3987.137 + }, + { + "epoch": 1.8564637528060213, + "grad_norm": 0.291015625, + "learning_rate": 0.00029110563455928944, + "loss": 2.2056, + "num_input_tokens_seen": 460677120, + "step": 7030, + "train_runtime": 115494.4951, + "train_tokens_per_second": 3988.737 + }, + { + "epoch": 1.8591047141159383, + "grad_norm": 0.294921875, + "learning_rate": 0.00029057827706899995, + "loss": 2.1202, + "num_input_tokens_seen": 461332480, + "step": 7040, + "train_runtime": 115613.3943, + "train_tokens_per_second": 3990.303 + }, + { + "epoch": 1.861745675425855, + "grad_norm": 0.298828125, + "learning_rate": 0.0002900507340666173, + "loss": 2.2726, + "num_input_tokens_seen": 461987840, + "step": 7050, + "train_runtime": 115732.6563, + "train_tokens_per_second": 3991.854 + }, + { + "epoch": 1.8643866367357718, + "grad_norm": 0.283203125, + "learning_rate": 0.00028952300796391466, + "loss": 2.1556, + "num_input_tokens_seen": 462643200, + "step": 7060, + "train_runtime": 115850.6045, + "train_tokens_per_second": 3993.447 + }, + { + "epoch": 1.8670275980456887, + "grad_norm": 0.291015625, + "learning_rate": 0.0002889951011735026, + "loss": 2.1617, + "num_input_tokens_seen": 463298560, + "step": 7070, + "train_runtime": 115969.3806, + "train_tokens_per_second": 3995.008 + }, + { + "epoch": 1.8696685593556053, + "grad_norm": 0.306640625, + "learning_rate": 0.00028846701610881734, + "loss": 2.1631, + "num_input_tokens_seen": 463953920, + "step": 7080, + "train_runtime": 116090.0824, + "train_tokens_per_second": 3996.499 + }, + { + "epoch": 1.8723095206655223, + "grad_norm": 0.29296875, + "learning_rate": 0.00028793875518411057, + "loss": 2.1641, + "num_input_tokens_seen": 464609280, + "step": 7090, + "train_runtime": 116209.9065, + "train_tokens_per_second": 3998.018 + }, + { + "epoch": 1.874950481975439, + "grad_norm": 0.28515625, + "learning_rate": 0.0002874103208144377, + "loss": 2.1655, + "num_input_tokens_seen": 465264640, + "step": 7100, + "train_runtime": 116329.9499, + "train_tokens_per_second": 3999.526 + }, + { + "epoch": 1.8775914432853558, + "grad_norm": 0.275390625, + "learning_rate": 0.00028688171541564714, + "loss": 2.1712, + "num_input_tokens_seen": 465920000, + "step": 7110, + "train_runtime": 116448.5946, + "train_tokens_per_second": 4001.079 + }, + { + "epoch": 1.8802324045952727, + "grad_norm": 0.302734375, + "learning_rate": 0.0002863529414043692, + "loss": 2.1668, + "num_input_tokens_seen": 466575360, + "step": 7120, + "train_runtime": 116567.0957, + "train_tokens_per_second": 4002.633 + }, + { + "epoch": 1.8828733659051895, + "grad_norm": 0.287109375, + "learning_rate": 0.000285824001198005, + "loss": 2.2046, + "num_input_tokens_seen": 467230720, + "step": 7130, + "train_runtime": 116685.8683, + "train_tokens_per_second": 4004.176 + }, + { + "epoch": 1.8855143272151063, + "grad_norm": 0.28515625, + "learning_rate": 0.00028529489721471556, + "loss": 2.1657, + "num_input_tokens_seen": 467886080, + "step": 7140, + "train_runtime": 116806.3913, + "train_tokens_per_second": 4005.655 + }, + { + "epoch": 1.8881552885250232, + "grad_norm": 0.27734375, + "learning_rate": 0.0002847656318734105, + "loss": 2.2341, + "num_input_tokens_seen": 468541440, + "step": 7150, + "train_runtime": 116930.1487, + "train_tokens_per_second": 4007.02 + }, + { + "epoch": 1.8907962498349398, + "grad_norm": 0.275390625, + "learning_rate": 0.0002842362075937372, + "loss": 2.1156, + "num_input_tokens_seen": 469196800, + "step": 7160, + "train_runtime": 117053.2263, + "train_tokens_per_second": 4008.406 + }, + { + "epoch": 1.8934372111448567, + "grad_norm": 0.27734375, + "learning_rate": 0.00028370662679606974, + "loss": 2.1697, + "num_input_tokens_seen": 469852160, + "step": 7170, + "train_runtime": 117176.5737, + "train_tokens_per_second": 4009.779 + }, + { + "epoch": 1.8960781724547735, + "grad_norm": 0.2890625, + "learning_rate": 0.0002831768919014975, + "loss": 2.186, + "num_input_tokens_seen": 470507520, + "step": 7180, + "train_runtime": 117299.8905, + "train_tokens_per_second": 4011.151 + }, + { + "epoch": 1.8987191337646903, + "grad_norm": 0.314453125, + "learning_rate": 0.0002826470053318146, + "loss": 2.1841, + "num_input_tokens_seen": 471162880, + "step": 7190, + "train_runtime": 117422.9798, + "train_tokens_per_second": 4012.527 + }, + { + "epoch": 1.9013600950746072, + "grad_norm": 0.306640625, + "learning_rate": 0.0002821169695095085, + "loss": 2.216, + "num_input_tokens_seen": 471818240, + "step": 7200, + "train_runtime": 117546.1568, + "train_tokens_per_second": 4013.898 + }, + { + "epoch": 1.904001056384524, + "grad_norm": 0.29296875, + "learning_rate": 0.00028158678685774894, + "loss": 2.1943, + "num_input_tokens_seen": 472473600, + "step": 7210, + "train_runtime": 117670.8304, + "train_tokens_per_second": 4015.214 + }, + { + "epoch": 1.9066420176944407, + "grad_norm": 0.279296875, + "learning_rate": 0.00028105645980037704, + "loss": 2.1354, + "num_input_tokens_seen": 473128960, + "step": 7220, + "train_runtime": 117794.3725, + "train_tokens_per_second": 4016.567 + }, + { + "epoch": 1.9092829790043577, + "grad_norm": 0.283203125, + "learning_rate": 0.00028052599076189397, + "loss": 2.1935, + "num_input_tokens_seen": 473784320, + "step": 7230, + "train_runtime": 117913.5927, + "train_tokens_per_second": 4018.064 + }, + { + "epoch": 1.9119239403142743, + "grad_norm": 0.30859375, + "learning_rate": 0.00027999538216745003, + "loss": 2.2211, + "num_input_tokens_seen": 474439680, + "step": 7240, + "train_runtime": 118031.8674, + "train_tokens_per_second": 4019.59 + }, + { + "epoch": 1.9145649016241912, + "grad_norm": 0.287109375, + "learning_rate": 0.00027946463644283365, + "loss": 2.1685, + "num_input_tokens_seen": 475095040, + "step": 7250, + "train_runtime": 118150.1803, + "train_tokens_per_second": 4021.111 + }, + { + "epoch": 1.917205862934108, + "grad_norm": 0.279296875, + "learning_rate": 0.0002789337560144599, + "loss": 2.1477, + "num_input_tokens_seen": 475750400, + "step": 7260, + "train_runtime": 118268.9666, + "train_tokens_per_second": 4022.614 + }, + { + "epoch": 1.9198468242440248, + "grad_norm": 0.265625, + "learning_rate": 0.00027840274330936005, + "loss": 2.1712, + "num_input_tokens_seen": 476405760, + "step": 7270, + "train_runtime": 118387.3002, + "train_tokens_per_second": 4024.129 + }, + { + "epoch": 1.9224877855539417, + "grad_norm": 0.283203125, + "learning_rate": 0.00027787160075516985, + "loss": 2.2009, + "num_input_tokens_seen": 477061120, + "step": 7280, + "train_runtime": 118505.5171, + "train_tokens_per_second": 4025.645 + }, + { + "epoch": 1.9251287468638585, + "grad_norm": 0.291015625, + "learning_rate": 0.0002773403307801187, + "loss": 2.1796, + "num_input_tokens_seen": 477716480, + "step": 7290, + "train_runtime": 118623.8596, + "train_tokens_per_second": 4027.153 + }, + { + "epoch": 1.9277697081737752, + "grad_norm": 0.27734375, + "learning_rate": 0.0002768089358130185, + "loss": 2.139, + "num_input_tokens_seen": 478371840, + "step": 7300, + "train_runtime": 118742.6025, + "train_tokens_per_second": 4028.645 + }, + { + "epoch": 1.9304106694836922, + "grad_norm": 0.275390625, + "learning_rate": 0.00027627741828325293, + "loss": 2.2045, + "num_input_tokens_seen": 479027200, + "step": 7310, + "train_runtime": 118860.2199, + "train_tokens_per_second": 4030.173 + }, + { + "epoch": 1.9330516307936088, + "grad_norm": 0.287109375, + "learning_rate": 0.00027574578062076544, + "loss": 2.2294, + "num_input_tokens_seen": 479682560, + "step": 7320, + "train_runtime": 118978.8305, + "train_tokens_per_second": 4031.663 + }, + { + "epoch": 1.9356925921035257, + "grad_norm": 0.28125, + "learning_rate": 0.000275214025256049, + "loss": 2.1407, + "num_input_tokens_seen": 480337920, + "step": 7330, + "train_runtime": 119098.2016, + "train_tokens_per_second": 4033.125 + }, + { + "epoch": 1.9383335534134425, + "grad_norm": 0.29296875, + "learning_rate": 0.0002746821546201347, + "loss": 2.1828, + "num_input_tokens_seen": 480993280, + "step": 7340, + "train_runtime": 119217.9125, + "train_tokens_per_second": 4034.572 + }, + { + "epoch": 1.9409745147233592, + "grad_norm": 0.279296875, + "learning_rate": 0.0002741501711445807, + "loss": 2.2349, + "num_input_tokens_seen": 481648640, + "step": 7350, + "train_runtime": 119336.0203, + "train_tokens_per_second": 4036.071 + }, + { + "epoch": 1.9436154760332762, + "grad_norm": 0.28125, + "learning_rate": 0.00027361807726146057, + "loss": 2.1359, + "num_input_tokens_seen": 482304000, + "step": 7360, + "train_runtime": 119454.4738, + "train_tokens_per_second": 4037.555 + }, + { + "epoch": 1.9462564373431928, + "grad_norm": 0.287109375, + "learning_rate": 0.0002730858754033532, + "loss": 2.1516, + "num_input_tokens_seen": 482959360, + "step": 7370, + "train_runtime": 119573.9009, + "train_tokens_per_second": 4039.003 + }, + { + "epoch": 1.9488973986531097, + "grad_norm": 0.291015625, + "learning_rate": 0.00027255356800333076, + "loss": 2.1146, + "num_input_tokens_seen": 483614720, + "step": 7380, + "train_runtime": 119692.3733, + "train_tokens_per_second": 4040.481 + }, + { + "epoch": 1.9515383599630265, + "grad_norm": 0.28125, + "learning_rate": 0.000272021157494948, + "loss": 2.225, + "num_input_tokens_seen": 484270080, + "step": 7390, + "train_runtime": 119810.8053, + "train_tokens_per_second": 4041.957 + }, + { + "epoch": 1.9541793212729432, + "grad_norm": 0.28515625, + "learning_rate": 0.0002714886463122312, + "loss": 2.1815, + "num_input_tokens_seen": 484925440, + "step": 7400, + "train_runtime": 119928.5985, + "train_tokens_per_second": 4043.451 + }, + { + "epoch": 1.9568202825828602, + "grad_norm": 0.291015625, + "learning_rate": 0.00027095603688966676, + "loss": 2.2085, + "num_input_tokens_seen": 485580800, + "step": 7410, + "train_runtime": 120049.2318, + "train_tokens_per_second": 4044.847 + }, + { + "epoch": 1.959461243892777, + "grad_norm": 0.27734375, + "learning_rate": 0.00027042333166219006, + "loss": 2.1333, + "num_input_tokens_seen": 486236160, + "step": 7420, + "train_runtime": 120173.7942, + "train_tokens_per_second": 4046.108 + }, + { + "epoch": 1.9621022052026937, + "grad_norm": 0.275390625, + "learning_rate": 0.0002698905330651748, + "loss": 2.1709, + "num_input_tokens_seen": 486891520, + "step": 7430, + "train_runtime": 120291.6868, + "train_tokens_per_second": 4047.591 + }, + { + "epoch": 1.9647431665126107, + "grad_norm": 0.310546875, + "learning_rate": 0.0002693576435344212, + "loss": 2.1971, + "num_input_tokens_seen": 487546880, + "step": 7440, + "train_runtime": 120410.078, + "train_tokens_per_second": 4049.054 + }, + { + "epoch": 1.9673841278225273, + "grad_norm": 0.294921875, + "learning_rate": 0.0002688246655061456, + "loss": 2.1605, + "num_input_tokens_seen": 488202240, + "step": 7450, + "train_runtime": 120529.2995, + "train_tokens_per_second": 4050.486 + }, + { + "epoch": 1.9700250891324442, + "grad_norm": 0.28125, + "learning_rate": 0.0002682916014169685, + "loss": 2.172, + "num_input_tokens_seen": 488857600, + "step": 7460, + "train_runtime": 120648.4611, + "train_tokens_per_second": 4051.917 + }, + { + "epoch": 1.972666050442361, + "grad_norm": 0.28515625, + "learning_rate": 0.0002677584537039041, + "loss": 2.0911, + "num_input_tokens_seen": 489512960, + "step": 7470, + "train_runtime": 120766.5398, + "train_tokens_per_second": 4053.382 + }, + { + "epoch": 1.9753070117522777, + "grad_norm": 0.2734375, + "learning_rate": 0.0002672252248043488, + "loss": 2.149, + "num_input_tokens_seen": 490168320, + "step": 7480, + "train_runtime": 120885.5921, + "train_tokens_per_second": 4054.812 + }, + { + "epoch": 1.9779479730621947, + "grad_norm": 0.28125, + "learning_rate": 0.0002666919171560703, + "loss": 2.2283, + "num_input_tokens_seen": 490823680, + "step": 7490, + "train_runtime": 121010.2529, + "train_tokens_per_second": 4056.05 + }, + { + "epoch": 1.9805889343721115, + "grad_norm": 0.2734375, + "learning_rate": 0.00026615853319719626, + "loss": 2.2299, + "num_input_tokens_seen": 491479040, + "step": 7500, + "train_runtime": 121128.4355, + "train_tokens_per_second": 4057.503 + }, + { + "epoch": 1.9832298956820282, + "grad_norm": 0.287109375, + "learning_rate": 0.00026562507536620294, + "loss": 2.2422, + "num_input_tokens_seen": 492134400, + "step": 7510, + "train_runtime": 121260.8652, + "train_tokens_per_second": 4058.477 + }, + { + "epoch": 1.9858708569919452, + "grad_norm": 0.27734375, + "learning_rate": 0.0002650915461019048, + "loss": 2.1926, + "num_input_tokens_seen": 492789760, + "step": 7520, + "train_runtime": 121378.9998, + "train_tokens_per_second": 4059.926 + }, + { + "epoch": 1.9885118183018617, + "grad_norm": 0.27734375, + "learning_rate": 0.0002645579478434426, + "loss": 2.1582, + "num_input_tokens_seen": 493445120, + "step": 7530, + "train_runtime": 121498.0518, + "train_tokens_per_second": 4061.342 + }, + { + "epoch": 1.9911527796117787, + "grad_norm": 0.28515625, + "learning_rate": 0.00026402428303027236, + "loss": 2.1812, + "num_input_tokens_seen": 494100480, + "step": 7540, + "train_runtime": 121616.5807, + "train_tokens_per_second": 4062.772 + }, + { + "epoch": 1.9937937409216955, + "grad_norm": 0.30078125, + "learning_rate": 0.00026349055410215474, + "loss": 2.1734, + "num_input_tokens_seen": 494755840, + "step": 7550, + "train_runtime": 121735.3205, + "train_tokens_per_second": 4064.193 + }, + { + "epoch": 1.9964347022316122, + "grad_norm": 0.287109375, + "learning_rate": 0.00026295676349914315, + "loss": 2.1836, + "num_input_tokens_seen": 495411200, + "step": 7560, + "train_runtime": 121853.5549, + "train_tokens_per_second": 4065.628 + }, + { + "epoch": 1.9990756635415292, + "grad_norm": 0.283203125, + "learning_rate": 0.0002624229136615734, + "loss": 2.1844, + "num_input_tokens_seen": 496066560, + "step": 7570, + "train_runtime": 121972.4187, + "train_tokens_per_second": 4067.039 + }, + { + "epoch": 2.00158457678595, + "grad_norm": 0.337890625, + "learning_rate": 0.00026188900703005163, + "loss": 1.8962, + "num_input_tokens_seen": 496680960, + "step": 7580, + "train_runtime": 122084.9792, + "train_tokens_per_second": 4068.322 + }, + { + "epoch": 2.004225538095867, + "grad_norm": 0.34375, + "learning_rate": 0.00026135504604544394, + "loss": 1.8112, + "num_input_tokens_seen": 497336320, + "step": 7590, + "train_runtime": 122202.9107, + "train_tokens_per_second": 4069.758 + }, + { + "epoch": 2.006866499405784, + "grad_norm": 0.32421875, + "learning_rate": 0.00026082103314886484, + "loss": 1.7385, + "num_input_tokens_seen": 497991680, + "step": 7600, + "train_runtime": 122323.6716, + "train_tokens_per_second": 4071.098 + }, + { + "epoch": 2.0095074607157004, + "grad_norm": 0.33203125, + "learning_rate": 0.0002602869707816661, + "loss": 1.7123, + "num_input_tokens_seen": 498647040, + "step": 7610, + "train_runtime": 122441.9533, + "train_tokens_per_second": 4072.518 + }, + { + "epoch": 2.0121484220256174, + "grad_norm": 0.3203125, + "learning_rate": 0.00025975286138542553, + "loss": 1.7411, + "num_input_tokens_seen": 499302400, + "step": 7620, + "train_runtime": 122560.5511, + "train_tokens_per_second": 4073.924 + }, + { + "epoch": 2.014789383335534, + "grad_norm": 0.326171875, + "learning_rate": 0.0002592187074019364, + "loss": 1.6976, + "num_input_tokens_seen": 499957760, + "step": 7630, + "train_runtime": 122679.9299, + "train_tokens_per_second": 4075.302 + }, + { + "epoch": 2.017430344645451, + "grad_norm": 0.3515625, + "learning_rate": 0.0002586845112731954, + "loss": 1.7751, + "num_input_tokens_seen": 500613120, + "step": 7640, + "train_runtime": 122799.0373, + "train_tokens_per_second": 4076.686 + }, + { + "epoch": 2.020071305955368, + "grad_norm": 0.341796875, + "learning_rate": 0.000258150275441392, + "loss": 1.7346, + "num_input_tokens_seen": 501268480, + "step": 7650, + "train_runtime": 122918.4582, + "train_tokens_per_second": 4078.057 + }, + { + "epoch": 2.0227122672652844, + "grad_norm": 0.3515625, + "learning_rate": 0.0002576160023488972, + "loss": 1.7168, + "num_input_tokens_seen": 501923840, + "step": 7660, + "train_runtime": 123037.1835, + "train_tokens_per_second": 4079.448 + }, + { + "epoch": 2.0253532285752014, + "grad_norm": 0.328125, + "learning_rate": 0.0002570816944382524, + "loss": 1.6782, + "num_input_tokens_seen": 502579200, + "step": 7670, + "train_runtime": 123156.4293, + "train_tokens_per_second": 4080.82 + }, + { + "epoch": 2.0279941898851184, + "grad_norm": 0.34375, + "learning_rate": 0.0002565473541521582, + "loss": 1.7674, + "num_input_tokens_seen": 503234560, + "step": 7680, + "train_runtime": 123275.4067, + "train_tokens_per_second": 4082.198 + }, + { + "epoch": 2.030635151195035, + "grad_norm": 0.341796875, + "learning_rate": 0.000256012983933463, + "loss": 1.7331, + "num_input_tokens_seen": 503889920, + "step": 7690, + "train_runtime": 123394.327, + "train_tokens_per_second": 4083.574 + }, + { + "epoch": 2.033276112504952, + "grad_norm": 0.345703125, + "learning_rate": 0.0002554785862251523, + "loss": 1.7731, + "num_input_tokens_seen": 504545280, + "step": 7700, + "train_runtime": 123517.1474, + "train_tokens_per_second": 4084.82 + }, + { + "epoch": 2.0359170738148684, + "grad_norm": 0.3203125, + "learning_rate": 0.00025494416347033704, + "loss": 1.7762, + "num_input_tokens_seen": 505200640, + "step": 7710, + "train_runtime": 123640.3767, + "train_tokens_per_second": 4086.049 + }, + { + "epoch": 2.0385580351247854, + "grad_norm": 0.34765625, + "learning_rate": 0.00025440971811224294, + "loss": 1.728, + "num_input_tokens_seen": 505856000, + "step": 7720, + "train_runtime": 123759.753, + "train_tokens_per_second": 4087.403 + }, + { + "epoch": 2.0411989964347024, + "grad_norm": 0.35546875, + "learning_rate": 0.00025387525259419874, + "loss": 1.7502, + "num_input_tokens_seen": 506511360, + "step": 7730, + "train_runtime": 123878.1799, + "train_tokens_per_second": 4088.786 + }, + { + "epoch": 2.043839957744619, + "grad_norm": 0.34375, + "learning_rate": 0.00025334076935962555, + "loss": 1.7492, + "num_input_tokens_seen": 507166720, + "step": 7740, + "train_runtime": 123997.0286, + "train_tokens_per_second": 4090.152 + }, + { + "epoch": 2.046480919054536, + "grad_norm": 0.36328125, + "learning_rate": 0.00025280627085202555, + "loss": 1.7439, + "num_input_tokens_seen": 507822080, + "step": 7750, + "train_runtime": 124115.8412, + "train_tokens_per_second": 4091.517 + }, + { + "epoch": 2.049121880364453, + "grad_norm": 0.3515625, + "learning_rate": 0.0002522717595149705, + "loss": 1.7854, + "num_input_tokens_seen": 508477440, + "step": 7760, + "train_runtime": 124234.7525, + "train_tokens_per_second": 4092.876 + }, + { + "epoch": 2.0517628416743694, + "grad_norm": 0.349609375, + "learning_rate": 0.000251737237792091, + "loss": 1.7746, + "num_input_tokens_seen": 509132800, + "step": 7770, + "train_runtime": 124353.8608, + "train_tokens_per_second": 4094.226 + }, + { + "epoch": 2.0544038029842864, + "grad_norm": 0.341796875, + "learning_rate": 0.0002512027081270651, + "loss": 1.776, + "num_input_tokens_seen": 509788160, + "step": 7780, + "train_runtime": 124474.9998, + "train_tokens_per_second": 4095.506 + }, + { + "epoch": 2.057044764294203, + "grad_norm": 0.34765625, + "learning_rate": 0.000250668172963607, + "loss": 1.7205, + "num_input_tokens_seen": 510443520, + "step": 7790, + "train_runtime": 124594.3313, + "train_tokens_per_second": 4096.844 + }, + { + "epoch": 2.05968572560412, + "grad_norm": 0.35546875, + "learning_rate": 0.0002501336347454562, + "loss": 1.7456, + "num_input_tokens_seen": 511098880, + "step": 7800, + "train_runtime": 124713.6451, + "train_tokens_per_second": 4098.179 + }, + { + "epoch": 2.062326686914037, + "grad_norm": 0.361328125, + "learning_rate": 0.00024959909591636625, + "loss": 1.771, + "num_input_tokens_seen": 511754240, + "step": 7810, + "train_runtime": 124832.3326, + "train_tokens_per_second": 4099.533 + }, + { + "epoch": 2.0649676482239534, + "grad_norm": 0.3515625, + "learning_rate": 0.00024906455892009327, + "loss": 1.7439, + "num_input_tokens_seen": 512409600, + "step": 7820, + "train_runtime": 124951.4924, + "train_tokens_per_second": 4100.868 + }, + { + "epoch": 2.0676086095338704, + "grad_norm": 0.337890625, + "learning_rate": 0.00024853002620038513, + "loss": 1.8217, + "num_input_tokens_seen": 513064960, + "step": 7830, + "train_runtime": 125070.5633, + "train_tokens_per_second": 4102.204 + }, + { + "epoch": 2.070249570843787, + "grad_norm": 0.345703125, + "learning_rate": 0.00024799550020097004, + "loss": 1.7468, + "num_input_tokens_seen": 513720320, + "step": 7840, + "train_runtime": 125188.9602, + "train_tokens_per_second": 4103.559 + }, + { + "epoch": 2.072890532153704, + "grad_norm": 0.3515625, + "learning_rate": 0.0002474609833655457, + "loss": 1.7804, + "num_input_tokens_seen": 514375680, + "step": 7850, + "train_runtime": 125308.4415, + "train_tokens_per_second": 4104.877 + }, + { + "epoch": 2.075531493463621, + "grad_norm": 0.345703125, + "learning_rate": 0.00024692647813776784, + "loss": 1.742, + "num_input_tokens_seen": 515031040, + "step": 7860, + "train_runtime": 125426.5875, + "train_tokens_per_second": 4106.235 + }, + { + "epoch": 2.0781724547735374, + "grad_norm": 0.35546875, + "learning_rate": 0.00024639198696123886, + "loss": 1.7549, + "num_input_tokens_seen": 515686400, + "step": 7870, + "train_runtime": 125545.2703, + "train_tokens_per_second": 4107.573 + }, + { + "epoch": 2.0808134160834544, + "grad_norm": 0.3359375, + "learning_rate": 0.0002458575122794973, + "loss": 1.7687, + "num_input_tokens_seen": 516341760, + "step": 7880, + "train_runtime": 125664.2574, + "train_tokens_per_second": 4108.899 + }, + { + "epoch": 2.0834543773933714, + "grad_norm": 0.345703125, + "learning_rate": 0.000245323056536006, + "loss": 1.7648, + "num_input_tokens_seen": 516997120, + "step": 7890, + "train_runtime": 125783.0612, + "train_tokens_per_second": 4110.228 + }, + { + "epoch": 2.086095338703288, + "grad_norm": 0.3359375, + "learning_rate": 0.0002447886221741414, + "loss": 1.7488, + "num_input_tokens_seen": 517652480, + "step": 7900, + "train_runtime": 125902.0967, + "train_tokens_per_second": 4111.548 + }, + { + "epoch": 2.088736300013205, + "grad_norm": 0.33984375, + "learning_rate": 0.00024425421163718207, + "loss": 1.7756, + "num_input_tokens_seen": 518307840, + "step": 7910, + "train_runtime": 126021.0352, + "train_tokens_per_second": 4112.868 + }, + { + "epoch": 2.0913772613231214, + "grad_norm": 0.345703125, + "learning_rate": 0.0002437198273682978, + "loss": 1.6993, + "num_input_tokens_seen": 518963200, + "step": 7920, + "train_runtime": 126140.657, + "train_tokens_per_second": 4114.163 + }, + { + "epoch": 2.0940182226330384, + "grad_norm": 0.349609375, + "learning_rate": 0.00024318547181053819, + "loss": 1.7315, + "num_input_tokens_seen": 519618560, + "step": 7930, + "train_runtime": 126259.02, + "train_tokens_per_second": 4115.497 + }, + { + "epoch": 2.0966591839429554, + "grad_norm": 0.357421875, + "learning_rate": 0.00024265114740682167, + "loss": 1.7431, + "num_input_tokens_seen": 520273920, + "step": 7940, + "train_runtime": 126378.0559, + "train_tokens_per_second": 4116.806 + }, + { + "epoch": 2.099300145252872, + "grad_norm": 0.36328125, + "learning_rate": 0.000242116856599924, + "loss": 1.7303, + "num_input_tokens_seen": 520929280, + "step": 7950, + "train_runtime": 126497.1651, + "train_tokens_per_second": 4118.11 + }, + { + "epoch": 2.101941106562789, + "grad_norm": 0.345703125, + "learning_rate": 0.00024158260183246757, + "loss": 1.7809, + "num_input_tokens_seen": 521584640, + "step": 7960, + "train_runtime": 126616.0381, + "train_tokens_per_second": 4119.42 + }, + { + "epoch": 2.104582067872706, + "grad_norm": 0.34765625, + "learning_rate": 0.00024104838554691015, + "loss": 1.7361, + "num_input_tokens_seen": 522240000, + "step": 7970, + "train_runtime": 126734.5896, + "train_tokens_per_second": 4120.738 + }, + { + "epoch": 2.1072230291826224, + "grad_norm": 0.353515625, + "learning_rate": 0.00024051421018553312, + "loss": 1.7758, + "num_input_tokens_seen": 522895360, + "step": 7980, + "train_runtime": 126853.6257, + "train_tokens_per_second": 4122.037 + }, + { + "epoch": 2.1098639904925394, + "grad_norm": 0.345703125, + "learning_rate": 0.00023998007819043122, + "loss": 1.775, + "num_input_tokens_seen": 523550720, + "step": 7990, + "train_runtime": 126972.9651, + "train_tokens_per_second": 4123.324 + }, + { + "epoch": 2.112504951802456, + "grad_norm": 0.37890625, + "learning_rate": 0.00023944599200350058, + "loss": 1.7898, + "num_input_tokens_seen": 524206080, + "step": 8000, + "train_runtime": 127092.5128, + "train_tokens_per_second": 4124.602 + }, + { + "epoch": 2.115145913112373, + "grad_norm": 0.353515625, + "learning_rate": 0.00023891195406642825, + "loss": 1.7563, + "num_input_tokens_seen": 524861440, + "step": 8010, + "train_runtime": 127224.1289, + "train_tokens_per_second": 4125.487 + }, + { + "epoch": 2.11778687442229, + "grad_norm": 0.375, + "learning_rate": 0.00023837796682068047, + "loss": 1.7687, + "num_input_tokens_seen": 525516800, + "step": 8020, + "train_runtime": 127342.628, + "train_tokens_per_second": 4126.794 + }, + { + "epoch": 2.1204278357322064, + "grad_norm": 0.32421875, + "learning_rate": 0.00023784403270749166, + "loss": 1.7389, + "num_input_tokens_seen": 526172160, + "step": 8030, + "train_runtime": 127461.3966, + "train_tokens_per_second": 4128.09 + }, + { + "epoch": 2.1230687970421234, + "grad_norm": 0.353515625, + "learning_rate": 0.0002373101541678536, + "loss": 1.7768, + "num_input_tokens_seen": 526827520, + "step": 8040, + "train_runtime": 127580.0605, + "train_tokens_per_second": 4129.388 + }, + { + "epoch": 2.1257097583520403, + "grad_norm": 0.34375, + "learning_rate": 0.00023677633364250388, + "loss": 1.8145, + "num_input_tokens_seen": 527482880, + "step": 8050, + "train_runtime": 127698.8762, + "train_tokens_per_second": 4130.678 + }, + { + "epoch": 2.128350719661957, + "grad_norm": 0.328125, + "learning_rate": 0.0002362425735719147, + "loss": 1.7534, + "num_input_tokens_seen": 528138240, + "step": 8060, + "train_runtime": 127819.112, + "train_tokens_per_second": 4131.919 + }, + { + "epoch": 2.130991680971874, + "grad_norm": 0.353515625, + "learning_rate": 0.0002357088763962821, + "loss": 1.791, + "num_input_tokens_seen": 528793600, + "step": 8070, + "train_runtime": 127937.2109, + "train_tokens_per_second": 4133.228 + }, + { + "epoch": 2.1336326422817904, + "grad_norm": 0.427734375, + "learning_rate": 0.00023517524455551463, + "loss": 1.801, + "num_input_tokens_seen": 529448960, + "step": 8080, + "train_runtime": 128056.2376, + "train_tokens_per_second": 4134.503 + }, + { + "epoch": 2.1362736035917074, + "grad_norm": 0.36328125, + "learning_rate": 0.0002346416804892218, + "loss": 1.7931, + "num_input_tokens_seen": 530104320, + "step": 8090, + "train_runtime": 128174.2709, + "train_tokens_per_second": 4135.809 + }, + { + "epoch": 2.1389145649016243, + "grad_norm": 0.33984375, + "learning_rate": 0.0002341081866367037, + "loss": 1.772, + "num_input_tokens_seen": 530759680, + "step": 8100, + "train_runtime": 128292.3229, + "train_tokens_per_second": 4137.112 + }, + { + "epoch": 2.141555526211541, + "grad_norm": 0.34375, + "learning_rate": 0.00023357476543693905, + "loss": 1.7975, + "num_input_tokens_seen": 531415040, + "step": 8110, + "train_runtime": 128414.6821, + "train_tokens_per_second": 4138.273 + }, + { + "epoch": 2.144196487521458, + "grad_norm": 0.3359375, + "learning_rate": 0.0002330414193285747, + "loss": 1.713, + "num_input_tokens_seen": 532070400, + "step": 8120, + "train_runtime": 128535.8516, + "train_tokens_per_second": 4139.471 + }, + { + "epoch": 2.1468374488313744, + "grad_norm": 0.353515625, + "learning_rate": 0.00023250815074991418, + "loss": 1.7528, + "num_input_tokens_seen": 532725760, + "step": 8130, + "train_runtime": 128657.89, + "train_tokens_per_second": 4140.638 + }, + { + "epoch": 2.1494784101412914, + "grad_norm": 0.37890625, + "learning_rate": 0.0002319749621389063, + "loss": 1.7682, + "num_input_tokens_seen": 533381120, + "step": 8140, + "train_runtime": 128776.6087, + "train_tokens_per_second": 4141.91 + }, + { + "epoch": 2.1521193714512084, + "grad_norm": 0.341796875, + "learning_rate": 0.0002314418559331346, + "loss": 1.8058, + "num_input_tokens_seen": 534036480, + "step": 8150, + "train_runtime": 128894.9712, + "train_tokens_per_second": 4143.191 + }, + { + "epoch": 2.154760332761125, + "grad_norm": 0.359375, + "learning_rate": 0.00023090883456980586, + "loss": 1.762, + "num_input_tokens_seen": 534691840, + "step": 8160, + "train_runtime": 129012.3112, + "train_tokens_per_second": 4144.502 + }, + { + "epoch": 2.157401294071042, + "grad_norm": 0.34765625, + "learning_rate": 0.00023037590048573866, + "loss": 1.738, + "num_input_tokens_seen": 535347200, + "step": 8170, + "train_runtime": 129131.7649, + "train_tokens_per_second": 4145.744 + }, + { + "epoch": 2.160042255380959, + "grad_norm": 0.369140625, + "learning_rate": 0.00022984305611735293, + "loss": 1.7308, + "num_input_tokens_seen": 536002560, + "step": 8180, + "train_runtime": 129250.2333, + "train_tokens_per_second": 4147.014 + }, + { + "epoch": 2.1626832166908754, + "grad_norm": 0.341796875, + "learning_rate": 0.0002293103039006583, + "loss": 1.787, + "num_input_tokens_seen": 536657920, + "step": 8190, + "train_runtime": 129370.1093, + "train_tokens_per_second": 4148.237 + }, + { + "epoch": 2.1653241780007924, + "grad_norm": 0.34375, + "learning_rate": 0.00022877764627124314, + "loss": 1.753, + "num_input_tokens_seen": 537313280, + "step": 8200, + "train_runtime": 129490.7161, + "train_tokens_per_second": 4149.435 + }, + { + "epoch": 2.1679651393107093, + "grad_norm": 0.33984375, + "learning_rate": 0.0002282450856642633, + "loss": 1.7668, + "num_input_tokens_seen": 537968640, + "step": 8210, + "train_runtime": 129611.9771, + "train_tokens_per_second": 4150.609 + }, + { + "epoch": 2.170606100620626, + "grad_norm": 0.35546875, + "learning_rate": 0.00022771262451443133, + "loss": 1.7946, + "num_input_tokens_seen": 538624000, + "step": 8220, + "train_runtime": 129730.1997, + "train_tokens_per_second": 4151.878 + }, + { + "epoch": 2.173247061930543, + "grad_norm": 0.353515625, + "learning_rate": 0.00022718026525600466, + "loss": 1.8128, + "num_input_tokens_seen": 539279360, + "step": 8230, + "train_runtime": 129849.1077, + "train_tokens_per_second": 4153.123 + }, + { + "epoch": 2.1758880232404594, + "grad_norm": 0.359375, + "learning_rate": 0.00022664801032277538, + "loss": 1.7647, + "num_input_tokens_seen": 539934720, + "step": 8240, + "train_runtime": 129967.5043, + "train_tokens_per_second": 4154.382 + }, + { + "epoch": 2.1785289845503764, + "grad_norm": 0.33984375, + "learning_rate": 0.00022611586214805817, + "loss": 1.8094, + "num_input_tokens_seen": 540590080, + "step": 8250, + "train_runtime": 130085.9211, + "train_tokens_per_second": 4155.639 + }, + { + "epoch": 2.1811699458602933, + "grad_norm": 0.34765625, + "learning_rate": 0.00022558382316468, + "loss": 1.7866, + "num_input_tokens_seen": 541245440, + "step": 8260, + "train_runtime": 130204.8497, + "train_tokens_per_second": 4156.876 + }, + { + "epoch": 2.18381090717021, + "grad_norm": 0.35546875, + "learning_rate": 0.0002250518958049686, + "loss": 1.7779, + "num_input_tokens_seen": 541900800, + "step": 8270, + "train_runtime": 130323.382, + "train_tokens_per_second": 4158.124 + }, + { + "epoch": 2.186451868480127, + "grad_norm": 0.36328125, + "learning_rate": 0.00022452008250074115, + "loss": 1.7684, + "num_input_tokens_seen": 542556160, + "step": 8280, + "train_runtime": 130441.9355, + "train_tokens_per_second": 4159.369 + }, + { + "epoch": 2.1890928297900434, + "grad_norm": 0.359375, + "learning_rate": 0.00022398838568329365, + "loss": 1.7176, + "num_input_tokens_seen": 543211520, + "step": 8290, + "train_runtime": 130563.1634, + "train_tokens_per_second": 4160.527 + }, + { + "epoch": 2.1917337910999604, + "grad_norm": 0.33984375, + "learning_rate": 0.00022345680778338963, + "loss": 1.7329, + "num_input_tokens_seen": 543866880, + "step": 8300, + "train_runtime": 130682.6085, + "train_tokens_per_second": 4161.739 + }, + { + "epoch": 2.1943747524098773, + "grad_norm": 0.359375, + "learning_rate": 0.0002229253512312485, + "loss": 1.7994, + "num_input_tokens_seen": 544522240, + "step": 8310, + "train_runtime": 130801.1059, + "train_tokens_per_second": 4162.979 + }, + { + "epoch": 2.197015713719794, + "grad_norm": 0.37109375, + "learning_rate": 0.00022239401845653534, + "loss": 1.7616, + "num_input_tokens_seen": 545177600, + "step": 8320, + "train_runtime": 130920.2945, + "train_tokens_per_second": 4164.195 + }, + { + "epoch": 2.199656675029711, + "grad_norm": 0.349609375, + "learning_rate": 0.00022186281188834938, + "loss": 1.8238, + "num_input_tokens_seen": 545832960, + "step": 8330, + "train_runtime": 131039.1858, + "train_tokens_per_second": 4165.418 + }, + { + "epoch": 2.202297636339628, + "grad_norm": 0.36328125, + "learning_rate": 0.00022133173395521248, + "loss": 1.7817, + "num_input_tokens_seen": 546488320, + "step": 8340, + "train_runtime": 131157.9036, + "train_tokens_per_second": 4166.644 + }, + { + "epoch": 2.2049385976495444, + "grad_norm": 0.349609375, + "learning_rate": 0.00022080078708505878, + "loss": 1.7631, + "num_input_tokens_seen": 547143680, + "step": 8350, + "train_runtime": 131276.9596, + "train_tokens_per_second": 4167.858 + }, + { + "epoch": 2.2075795589594613, + "grad_norm": 0.345703125, + "learning_rate": 0.00022026997370522302, + "loss": 1.7164, + "num_input_tokens_seen": 547799040, + "step": 8360, + "train_runtime": 131396.048, + "train_tokens_per_second": 4169.068 + }, + { + "epoch": 2.210220520269378, + "grad_norm": 0.34765625, + "learning_rate": 0.00021973929624242988, + "loss": 1.7892, + "num_input_tokens_seen": 548454400, + "step": 8370, + "train_runtime": 131514.9039, + "train_tokens_per_second": 4170.283 + }, + { + "epoch": 2.212861481579295, + "grad_norm": 0.365234375, + "learning_rate": 0.0002192087571227825, + "loss": 1.7302, + "num_input_tokens_seen": 549109760, + "step": 8380, + "train_runtime": 131634.3445, + "train_tokens_per_second": 4171.478 + }, + { + "epoch": 2.215502442889212, + "grad_norm": 0.365234375, + "learning_rate": 0.00021867835877175147, + "loss": 1.741, + "num_input_tokens_seen": 549765120, + "step": 8390, + "train_runtime": 131753.3361, + "train_tokens_per_second": 4172.685 + }, + { + "epoch": 2.2181434041991284, + "grad_norm": 0.34375, + "learning_rate": 0.00021814810361416403, + "loss": 1.7484, + "num_input_tokens_seen": 550420480, + "step": 8400, + "train_runtime": 131871.9634, + "train_tokens_per_second": 4173.901 + }, + { + "epoch": 2.2207843655090453, + "grad_norm": 0.359375, + "learning_rate": 0.00021761799407419286, + "loss": 1.8028, + "num_input_tokens_seen": 551075840, + "step": 8410, + "train_runtime": 131991.3808, + "train_tokens_per_second": 4175.09 + }, + { + "epoch": 2.223425326818962, + "grad_norm": 0.33984375, + "learning_rate": 0.00021708803257534451, + "loss": 1.7589, + "num_input_tokens_seen": 551731200, + "step": 8420, + "train_runtime": 132110.699, + "train_tokens_per_second": 4176.279 + }, + { + "epoch": 2.226066288128879, + "grad_norm": 0.345703125, + "learning_rate": 0.00021655822154044907, + "loss": 1.7806, + "num_input_tokens_seen": 552386560, + "step": 8430, + "train_runtime": 132231.161, + "train_tokens_per_second": 4177.431 + }, + { + "epoch": 2.228707249438796, + "grad_norm": 0.33984375, + "learning_rate": 0.00021602856339164882, + "loss": 1.7687, + "num_input_tokens_seen": 553041920, + "step": 8440, + "train_runtime": 132350.5384, + "train_tokens_per_second": 4178.615 + }, + { + "epoch": 2.2313482107487124, + "grad_norm": 0.34375, + "learning_rate": 0.00021549906055038666, + "loss": 1.7181, + "num_input_tokens_seen": 553697280, + "step": 8450, + "train_runtime": 132469.8126, + "train_tokens_per_second": 4179.8 + }, + { + "epoch": 2.2339891720586293, + "grad_norm": 0.361328125, + "learning_rate": 0.0002149697154373959, + "loss": 1.7959, + "num_input_tokens_seen": 554352640, + "step": 8460, + "train_runtime": 132588.637, + "train_tokens_per_second": 4180.997 + }, + { + "epoch": 2.2366301333685463, + "grad_norm": 0.3515625, + "learning_rate": 0.00021444053047268852, + "loss": 1.7816, + "num_input_tokens_seen": 555008000, + "step": 8470, + "train_runtime": 132708.1769, + "train_tokens_per_second": 4182.169 + }, + { + "epoch": 2.239271094678463, + "grad_norm": 0.349609375, + "learning_rate": 0.0002139115080755445, + "loss": 1.7698, + "num_input_tokens_seen": 555663360, + "step": 8480, + "train_runtime": 132826.6113, + "train_tokens_per_second": 4183.374 + }, + { + "epoch": 2.24191205598838, + "grad_norm": 0.34765625, + "learning_rate": 0.00021338265066450063, + "loss": 1.7496, + "num_input_tokens_seen": 556318720, + "step": 8490, + "train_runtime": 132945.6949, + "train_tokens_per_second": 4184.556 + }, + { + "epoch": 2.244553017298297, + "grad_norm": 0.353515625, + "learning_rate": 0.00021285396065733915, + "loss": 1.7712, + "num_input_tokens_seen": 556974080, + "step": 8500, + "train_runtime": 133064.2087, + "train_tokens_per_second": 4185.754 + }, + { + "epoch": 2.2471939786082133, + "grad_norm": 0.3359375, + "learning_rate": 0.00021232544047107723, + "loss": 1.7846, + "num_input_tokens_seen": 557629440, + "step": 8510, + "train_runtime": 133196.9536, + "train_tokens_per_second": 4186.503 + }, + { + "epoch": 2.2498349399181303, + "grad_norm": 0.353515625, + "learning_rate": 0.00021179709252195573, + "loss": 1.777, + "num_input_tokens_seen": 558284800, + "step": 8520, + "train_runtime": 133315.2493, + "train_tokens_per_second": 4187.704 + }, + { + "epoch": 2.252475901228047, + "grad_norm": 0.365234375, + "learning_rate": 0.00021126891922542773, + "loss": 1.8324, + "num_input_tokens_seen": 558940160, + "step": 8530, + "train_runtime": 133434.1312, + "train_tokens_per_second": 4188.884 + }, + { + "epoch": 2.255116862537964, + "grad_norm": 0.359375, + "learning_rate": 0.00021074092299614827, + "loss": 1.778, + "num_input_tokens_seen": 559595520, + "step": 8540, + "train_runtime": 133555.8445, + "train_tokens_per_second": 4189.974 + }, + { + "epoch": 2.257757823847881, + "grad_norm": 0.34765625, + "learning_rate": 0.00021021310624796269, + "loss": 1.8027, + "num_input_tokens_seen": 560250880, + "step": 8550, + "train_runtime": 133674.2163, + "train_tokens_per_second": 4191.166 + }, + { + "epoch": 2.2603987851577974, + "grad_norm": 0.361328125, + "learning_rate": 0.00020968547139389577, + "loss": 1.8377, + "num_input_tokens_seen": 560906240, + "step": 8560, + "train_runtime": 133793.3182, + "train_tokens_per_second": 4192.334 + }, + { + "epoch": 2.2630397464677143, + "grad_norm": 0.353515625, + "learning_rate": 0.00020915802084614085, + "loss": 1.7932, + "num_input_tokens_seen": 561561600, + "step": 8570, + "train_runtime": 133912.0166, + "train_tokens_per_second": 4193.512 + }, + { + "epoch": 2.265680707777631, + "grad_norm": 0.34765625, + "learning_rate": 0.00020863075701604844, + "loss": 1.7853, + "num_input_tokens_seen": 562216960, + "step": 8580, + "train_runtime": 134031.1527, + "train_tokens_per_second": 4194.674 + }, + { + "epoch": 2.268321669087548, + "grad_norm": 0.37890625, + "learning_rate": 0.00020810368231411564, + "loss": 1.8313, + "num_input_tokens_seen": 562872320, + "step": 8590, + "train_runtime": 134149.8193, + "train_tokens_per_second": 4195.849 + }, + { + "epoch": 2.270962630397465, + "grad_norm": 0.341796875, + "learning_rate": 0.00020757679914997502, + "loss": 1.7764, + "num_input_tokens_seen": 563527680, + "step": 8600, + "train_runtime": 134270.6757, + "train_tokens_per_second": 4196.953 + }, + { + "epoch": 2.2736035917073814, + "grad_norm": 0.34375, + "learning_rate": 0.00020705010993238304, + "loss": 1.7626, + "num_input_tokens_seen": 564183040, + "step": 8610, + "train_runtime": 134389.173, + "train_tokens_per_second": 4198.129 + }, + { + "epoch": 2.2762445530172983, + "grad_norm": 0.349609375, + "learning_rate": 0.00020652361706920995, + "loss": 1.7221, + "num_input_tokens_seen": 564838400, + "step": 8620, + "train_runtime": 134508.1004, + "train_tokens_per_second": 4199.289 + }, + { + "epoch": 2.2788855143272153, + "grad_norm": 0.3671875, + "learning_rate": 0.0002059973229674282, + "loss": 1.7946, + "num_input_tokens_seen": 565493760, + "step": 8630, + "train_runtime": 134626.5271, + "train_tokens_per_second": 4200.463 + }, + { + "epoch": 2.281526475637132, + "grad_norm": 0.373046875, + "learning_rate": 0.00020547123003310133, + "loss": 1.7064, + "num_input_tokens_seen": 566149120, + "step": 8640, + "train_runtime": 134745.557, + "train_tokens_per_second": 4201.616 + }, + { + "epoch": 2.284167436947049, + "grad_norm": 0.359375, + "learning_rate": 0.00020494534067137351, + "loss": 1.7773, + "num_input_tokens_seen": 566804480, + "step": 8650, + "train_runtime": 134864.1759, + "train_tokens_per_second": 4202.78 + }, + { + "epoch": 2.2868083982569654, + "grad_norm": 0.337890625, + "learning_rate": 0.00020441965728645826, + "loss": 1.7799, + "num_input_tokens_seen": 567459840, + "step": 8660, + "train_runtime": 134983.5623, + "train_tokens_per_second": 4203.918 + }, + { + "epoch": 2.2894493595668823, + "grad_norm": 0.353515625, + "learning_rate": 0.00020389418228162698, + "loss": 1.7674, + "num_input_tokens_seen": 568115200, + "step": 8670, + "train_runtime": 135102.3609, + "train_tokens_per_second": 4205.072 + }, + { + "epoch": 2.2920903208767993, + "grad_norm": 0.345703125, + "learning_rate": 0.0002033689180591989, + "loss": 1.8209, + "num_input_tokens_seen": 568770560, + "step": 8680, + "train_runtime": 135221.6573, + "train_tokens_per_second": 4206.209 + }, + { + "epoch": 2.294731282186716, + "grad_norm": 0.337890625, + "learning_rate": 0.00020284386702052948, + "loss": 1.7821, + "num_input_tokens_seen": 569425920, + "step": 8690, + "train_runtime": 135341.5766, + "train_tokens_per_second": 4207.324 + }, + { + "epoch": 2.297372243496633, + "grad_norm": 0.357421875, + "learning_rate": 0.00020231903156599934, + "loss": 1.7809, + "num_input_tokens_seen": 570081280, + "step": 8700, + "train_runtime": 135460.9632, + "train_tokens_per_second": 4208.454 + }, + { + "epoch": 2.3000132048065494, + "grad_norm": 0.3515625, + "learning_rate": 0.00020179441409500388, + "loss": 1.8015, + "num_input_tokens_seen": 570736640, + "step": 8710, + "train_runtime": 135580.0584, + "train_tokens_per_second": 4209.591 + }, + { + "epoch": 2.3026541661164663, + "grad_norm": 0.35546875, + "learning_rate": 0.00020127001700594163, + "loss": 1.7868, + "num_input_tokens_seen": 571392000, + "step": 8720, + "train_runtime": 135699.7542, + "train_tokens_per_second": 4210.708 + }, + { + "epoch": 2.3052951274263833, + "grad_norm": 0.349609375, + "learning_rate": 0.00020074584269620378, + "loss": 1.7298, + "num_input_tokens_seen": 572047360, + "step": 8730, + "train_runtime": 135819.4122, + "train_tokens_per_second": 4211.823 + }, + { + "epoch": 2.3079360887363, + "grad_norm": 0.373046875, + "learning_rate": 0.00020022189356216303, + "loss": 1.8011, + "num_input_tokens_seen": 572702720, + "step": 8740, + "train_runtime": 135938.2393, + "train_tokens_per_second": 4212.963 + }, + { + "epoch": 2.310577050046217, + "grad_norm": 0.345703125, + "learning_rate": 0.0001996981719991625, + "loss": 1.8052, + "num_input_tokens_seen": 573358080, + "step": 8750, + "train_runtime": 136057.4104, + "train_tokens_per_second": 4214.089 + }, + { + "epoch": 2.313218011356134, + "grad_norm": 0.35546875, + "learning_rate": 0.00019917468040150498, + "loss": 1.7657, + "num_input_tokens_seen": 574013440, + "step": 8760, + "train_runtime": 136179.4226, + "train_tokens_per_second": 4215.126 + }, + { + "epoch": 2.3158589726660503, + "grad_norm": 0.357421875, + "learning_rate": 0.00019865142116244223, + "loss": 1.7685, + "num_input_tokens_seen": 574668800, + "step": 8770, + "train_runtime": 136299.568, + "train_tokens_per_second": 4216.219 + }, + { + "epoch": 2.3184999339759673, + "grad_norm": 0.353515625, + "learning_rate": 0.0001981283966741631, + "loss": 1.7421, + "num_input_tokens_seen": 575324160, + "step": 8780, + "train_runtime": 136422.5383, + "train_tokens_per_second": 4217.222 + }, + { + "epoch": 2.3211408952858843, + "grad_norm": 0.345703125, + "learning_rate": 0.0001976056093277838, + "loss": 1.7527, + "num_input_tokens_seen": 575979520, + "step": 8790, + "train_runtime": 136541.3031, + "train_tokens_per_second": 4218.354 + }, + { + "epoch": 2.323781856595801, + "grad_norm": 0.357421875, + "learning_rate": 0.0001970830615133362, + "loss": 1.7837, + "num_input_tokens_seen": 576634880, + "step": 8800, + "train_runtime": 136661.341, + "train_tokens_per_second": 4219.444 + }, + { + "epoch": 2.326422817905718, + "grad_norm": 0.353515625, + "learning_rate": 0.000196560755619757, + "loss": 1.7746, + "num_input_tokens_seen": 577290240, + "step": 8810, + "train_runtime": 136781.0065, + "train_tokens_per_second": 4220.544 + }, + { + "epoch": 2.3290637792156343, + "grad_norm": 0.34375, + "learning_rate": 0.0001960386940348771, + "loss": 1.7475, + "num_input_tokens_seen": 577945600, + "step": 8820, + "train_runtime": 136900.2296, + "train_tokens_per_second": 4221.655 + }, + { + "epoch": 2.3317047405255513, + "grad_norm": 0.357421875, + "learning_rate": 0.00019551687914541021, + "loss": 1.7873, + "num_input_tokens_seen": 578600960, + "step": 8830, + "train_runtime": 137020.1599, + "train_tokens_per_second": 4222.743 + }, + { + "epoch": 2.3343457018354683, + "grad_norm": 0.359375, + "learning_rate": 0.00019499531333694257, + "loss": 1.7672, + "num_input_tokens_seen": 579256320, + "step": 8840, + "train_runtime": 137140.2882, + "train_tokens_per_second": 4223.823 + }, + { + "epoch": 2.336986663145385, + "grad_norm": 0.349609375, + "learning_rate": 0.00019447399899392154, + "loss": 1.8065, + "num_input_tokens_seen": 579911680, + "step": 8850, + "train_runtime": 137259.1949, + "train_tokens_per_second": 4224.939 + }, + { + "epoch": 2.339627624455302, + "grad_norm": 0.3671875, + "learning_rate": 0.00019395293849964465, + "loss": 1.7768, + "num_input_tokens_seen": 580567040, + "step": 8860, + "train_runtime": 137378.606, + "train_tokens_per_second": 4226.037 + }, + { + "epoch": 2.3422685857652183, + "grad_norm": 0.3515625, + "learning_rate": 0.00019343213423624923, + "loss": 1.7421, + "num_input_tokens_seen": 581222400, + "step": 8870, + "train_runtime": 137497.6372, + "train_tokens_per_second": 4227.145 + }, + { + "epoch": 2.3449095470751353, + "grad_norm": 0.359375, + "learning_rate": 0.00019291158858470112, + "loss": 1.8188, + "num_input_tokens_seen": 581877760, + "step": 8880, + "train_runtime": 137616.1183, + "train_tokens_per_second": 4228.267 + }, + { + "epoch": 2.3475505083850523, + "grad_norm": 0.34765625, + "learning_rate": 0.0001923913039247836, + "loss": 1.7275, + "num_input_tokens_seen": 582533120, + "step": 8890, + "train_runtime": 137735.6367, + "train_tokens_per_second": 4229.357 + }, + { + "epoch": 2.350191469694969, + "grad_norm": 0.35546875, + "learning_rate": 0.00019187128263508713, + "loss": 1.7716, + "num_input_tokens_seen": 583188480, + "step": 8900, + "train_runtime": 137857.7389, + "train_tokens_per_second": 4230.364 + }, + { + "epoch": 2.352832431004886, + "grad_norm": 0.333984375, + "learning_rate": 0.00019135152709299792, + "loss": 1.7503, + "num_input_tokens_seen": 583843840, + "step": 8910, + "train_runtime": 137976.5331, + "train_tokens_per_second": 4231.472 + }, + { + "epoch": 2.355473392314803, + "grad_norm": 0.37890625, + "learning_rate": 0.00019083203967468727, + "loss": 1.7817, + "num_input_tokens_seen": 584499200, + "step": 8920, + "train_runtime": 138096.6407, + "train_tokens_per_second": 4232.537 + }, + { + "epoch": 2.3581143536247193, + "grad_norm": 0.34765625, + "learning_rate": 0.00019031282275510086, + "loss": 1.7261, + "num_input_tokens_seen": 585154560, + "step": 8930, + "train_runtime": 138215.4588, + "train_tokens_per_second": 4233.64 + }, + { + "epoch": 2.3607553149346363, + "grad_norm": 0.34765625, + "learning_rate": 0.0001897938787079474, + "loss": 1.7492, + "num_input_tokens_seen": 585809920, + "step": 8940, + "train_runtime": 138334.5922, + "train_tokens_per_second": 4234.732 + }, + { + "epoch": 2.363396276244553, + "grad_norm": 0.35546875, + "learning_rate": 0.00018927520990568835, + "loss": 1.754, + "num_input_tokens_seen": 586465280, + "step": 8950, + "train_runtime": 138454.3847, + "train_tokens_per_second": 4235.801 + }, + { + "epoch": 2.36603723755447, + "grad_norm": 0.369140625, + "learning_rate": 0.00018875681871952695, + "loss": 1.7688, + "num_input_tokens_seen": 587120640, + "step": 8960, + "train_runtime": 138573.7515, + "train_tokens_per_second": 4236.882 + }, + { + "epoch": 2.368678198864387, + "grad_norm": 0.3515625, + "learning_rate": 0.00018823870751939688, + "loss": 1.7996, + "num_input_tokens_seen": 587776000, + "step": 8970, + "train_runtime": 138693.7682, + "train_tokens_per_second": 4237.941 + }, + { + "epoch": 2.3713191601743033, + "grad_norm": 0.35546875, + "learning_rate": 0.00018772087867395206, + "loss": 1.7875, + "num_input_tokens_seen": 588431360, + "step": 8980, + "train_runtime": 138813.0599, + "train_tokens_per_second": 4239.02 + }, + { + "epoch": 2.3739601214842203, + "grad_norm": 0.34375, + "learning_rate": 0.00018720333455055565, + "loss": 1.7478, + "num_input_tokens_seen": 589086720, + "step": 8990, + "train_runtime": 138932.062, + "train_tokens_per_second": 4240.106 + }, + { + "epoch": 2.376601082794137, + "grad_norm": 0.349609375, + "learning_rate": 0.0001866860775152689, + "loss": 1.7108, + "num_input_tokens_seen": 589742080, + "step": 9000, + "train_runtime": 139052.1789, + "train_tokens_per_second": 4241.157 + }, + { + "epoch": 2.379242044104054, + "grad_norm": 0.349609375, + "learning_rate": 0.00018616910993284066, + "loss": 1.762, + "num_input_tokens_seen": 590397440, + "step": 9010, + "train_runtime": 139183.8467, + "train_tokens_per_second": 4241.853 + }, + { + "epoch": 2.381883005413971, + "grad_norm": 0.34375, + "learning_rate": 0.00018565243416669673, + "loss": 1.7225, + "num_input_tokens_seen": 591052800, + "step": 9020, + "train_runtime": 139304.0618, + "train_tokens_per_second": 4242.897 + }, + { + "epoch": 2.3845239667238873, + "grad_norm": 0.3671875, + "learning_rate": 0.00018513605257892832, + "loss": 1.7293, + "num_input_tokens_seen": 591708160, + "step": 9030, + "train_runtime": 139422.0794, + "train_tokens_per_second": 4244.006 + }, + { + "epoch": 2.3871649280338043, + "grad_norm": 0.3515625, + "learning_rate": 0.00018461996753028225, + "loss": 1.8099, + "num_input_tokens_seen": 592363520, + "step": 9040, + "train_runtime": 139540.6114, + "train_tokens_per_second": 4245.098 + }, + { + "epoch": 2.3898058893437213, + "grad_norm": 0.34765625, + "learning_rate": 0.00018410418138014927, + "loss": 1.7611, + "num_input_tokens_seen": 593018880, + "step": 9050, + "train_runtime": 139659.2386, + "train_tokens_per_second": 4246.184 + }, + { + "epoch": 2.392446850653638, + "grad_norm": 0.341796875, + "learning_rate": 0.00018358869648655383, + "loss": 1.7364, + "num_input_tokens_seen": 593674240, + "step": 9060, + "train_runtime": 139777.7025, + "train_tokens_per_second": 4247.274 + }, + { + "epoch": 2.395087811963555, + "grad_norm": 0.357421875, + "learning_rate": 0.00018307351520614317, + "loss": 1.7905, + "num_input_tokens_seen": 594329600, + "step": 9070, + "train_runtime": 139896.6241, + "train_tokens_per_second": 4248.348 + }, + { + "epoch": 2.397728773273472, + "grad_norm": 0.369140625, + "learning_rate": 0.0001825586398941763, + "loss": 1.7841, + "num_input_tokens_seen": 594984960, + "step": 9080, + "train_runtime": 140015.3229, + "train_tokens_per_second": 4249.427 + }, + { + "epoch": 2.4003697345833883, + "grad_norm": 0.337890625, + "learning_rate": 0.0001820440729045137, + "loss": 1.7908, + "num_input_tokens_seen": 595640320, + "step": 9090, + "train_runtime": 140134.781, + "train_tokens_per_second": 4250.482 + }, + { + "epoch": 2.4030106958933053, + "grad_norm": 0.361328125, + "learning_rate": 0.00018152981658960612, + "loss": 1.7509, + "num_input_tokens_seen": 596295680, + "step": 9100, + "train_runtime": 140253.068, + "train_tokens_per_second": 4251.57 + }, + { + "epoch": 2.405651657203222, + "grad_norm": 0.365234375, + "learning_rate": 0.0001810158733004839, + "loss": 1.7322, + "num_input_tokens_seen": 596951040, + "step": 9110, + "train_runtime": 140372.2313, + "train_tokens_per_second": 4252.629 + }, + { + "epoch": 2.408292618513139, + "grad_norm": 0.337890625, + "learning_rate": 0.00018050224538674654, + "loss": 1.7181, + "num_input_tokens_seen": 597606400, + "step": 9120, + "train_runtime": 140495.1037, + "train_tokens_per_second": 4253.575 + }, + { + "epoch": 2.410933579823056, + "grad_norm": 0.33984375, + "learning_rate": 0.00017998893519655172, + "loss": 1.7501, + "num_input_tokens_seen": 598261760, + "step": 9130, + "train_runtime": 140616.0523, + "train_tokens_per_second": 4254.577 + }, + { + "epoch": 2.4135745411329723, + "grad_norm": 0.357421875, + "learning_rate": 0.00017947594507660425, + "loss": 1.7542, + "num_input_tokens_seen": 598917120, + "step": 9140, + "train_runtime": 140734.2068, + "train_tokens_per_second": 4255.661 + }, + { + "epoch": 2.4162155024428893, + "grad_norm": 0.349609375, + "learning_rate": 0.00017896327737214606, + "loss": 1.7611, + "num_input_tokens_seen": 599572480, + "step": 9150, + "train_runtime": 140851.816, + "train_tokens_per_second": 4256.761 + }, + { + "epoch": 2.418856463752806, + "grad_norm": 0.3515625, + "learning_rate": 0.00017845093442694503, + "loss": 1.7998, + "num_input_tokens_seen": 600227840, + "step": 9160, + "train_runtime": 140971.7527, + "train_tokens_per_second": 4257.788 + }, + { + "epoch": 2.421497425062723, + "grad_norm": 0.341796875, + "learning_rate": 0.00017793891858328405, + "loss": 1.7349, + "num_input_tokens_seen": 600883200, + "step": 9170, + "train_runtime": 141090.3309, + "train_tokens_per_second": 4258.855 + }, + { + "epoch": 2.42413838637264, + "grad_norm": 0.365234375, + "learning_rate": 0.00017742723218195107, + "loss": 1.6993, + "num_input_tokens_seen": 601538560, + "step": 9180, + "train_runtime": 141208.2383, + "train_tokens_per_second": 4259.94 + }, + { + "epoch": 2.4267793476825563, + "grad_norm": 0.3359375, + "learning_rate": 0.00017691587756222735, + "loss": 1.7373, + "num_input_tokens_seen": 602193920, + "step": 9190, + "train_runtime": 141327.2411, + "train_tokens_per_second": 4260.99 + }, + { + "epoch": 2.4294203089924733, + "grad_norm": 0.357421875, + "learning_rate": 0.0001764048570618778, + "loss": 1.7174, + "num_input_tokens_seen": 602849280, + "step": 9200, + "train_runtime": 141446.1423, + "train_tokens_per_second": 4262.041 + }, + { + "epoch": 2.4320612703023903, + "grad_norm": 0.341796875, + "learning_rate": 0.0001758941730171398, + "loss": 1.7377, + "num_input_tokens_seen": 603504640, + "step": 9210, + "train_runtime": 141564.9698, + "train_tokens_per_second": 4263.093 + }, + { + "epoch": 2.434702231612307, + "grad_norm": 0.353515625, + "learning_rate": 0.00017538382776271212, + "loss": 1.7213, + "num_input_tokens_seen": 604160000, + "step": 9220, + "train_runtime": 141684.1453, + "train_tokens_per_second": 4264.133 + }, + { + "epoch": 2.437343192922224, + "grad_norm": 0.34765625, + "learning_rate": 0.0001748738236317452, + "loss": 1.7713, + "num_input_tokens_seen": 604815360, + "step": 9230, + "train_runtime": 141803.3338, + "train_tokens_per_second": 4265.17 + }, + { + "epoch": 2.4399841542321403, + "grad_norm": 0.34765625, + "learning_rate": 0.0001743641629558298, + "loss": 1.7932, + "num_input_tokens_seen": 605470720, + "step": 9240, + "train_runtime": 141922.5822, + "train_tokens_per_second": 4266.204 + }, + { + "epoch": 2.4426251155420573, + "grad_norm": 0.361328125, + "learning_rate": 0.00017385484806498627, + "loss": 1.7473, + "num_input_tokens_seen": 606126080, + "step": 9250, + "train_runtime": 142041.8718, + "train_tokens_per_second": 4267.235 + }, + { + "epoch": 2.4452660768519743, + "grad_norm": 0.36328125, + "learning_rate": 0.00017334588128765444, + "loss": 1.8083, + "num_input_tokens_seen": 606781440, + "step": 9260, + "train_runtime": 142162.8118, + "train_tokens_per_second": 4268.215 + }, + { + "epoch": 2.447907038161891, + "grad_norm": 0.365234375, + "learning_rate": 0.00017283726495068253, + "loss": 1.8204, + "num_input_tokens_seen": 607436800, + "step": 9270, + "train_runtime": 142281.8887, + "train_tokens_per_second": 4269.249 + }, + { + "epoch": 2.450547999471808, + "grad_norm": 0.36328125, + "learning_rate": 0.00017232900137931662, + "loss": 1.7764, + "num_input_tokens_seen": 608092160, + "step": 9280, + "train_runtime": 142401.1413, + "train_tokens_per_second": 4270.276 + }, + { + "epoch": 2.4531889607817243, + "grad_norm": 0.369140625, + "learning_rate": 0.00017182109289719022, + "loss": 1.8187, + "num_input_tokens_seen": 608747520, + "step": 9290, + "train_runtime": 142520.2632, + "train_tokens_per_second": 4271.305 + }, + { + "epoch": 2.4558299220916413, + "grad_norm": 0.359375, + "learning_rate": 0.00017131354182631315, + "loss": 1.7767, + "num_input_tokens_seen": 609402880, + "step": 9300, + "train_runtime": 142639.1497, + "train_tokens_per_second": 4272.34 + }, + { + "epoch": 2.4584708834015583, + "grad_norm": 0.34765625, + "learning_rate": 0.0001708063504870615, + "loss": 1.7341, + "num_input_tokens_seen": 610058240, + "step": 9310, + "train_runtime": 142758.107, + "train_tokens_per_second": 4273.37 + }, + { + "epoch": 2.461111844711475, + "grad_norm": 0.349609375, + "learning_rate": 0.00017029952119816688, + "loss": 1.765, + "num_input_tokens_seen": 610713600, + "step": 9320, + "train_runtime": 142876.8107, + "train_tokens_per_second": 4274.407 + }, + { + "epoch": 2.463752806021392, + "grad_norm": 0.353515625, + "learning_rate": 0.00016979305627670533, + "loss": 1.7883, + "num_input_tokens_seen": 611368960, + "step": 9330, + "train_runtime": 142996.1941, + "train_tokens_per_second": 4275.421 + }, + { + "epoch": 2.4663937673313088, + "grad_norm": 0.35546875, + "learning_rate": 0.00016928695803808738, + "loss": 1.7812, + "num_input_tokens_seen": 612024320, + "step": 9340, + "train_runtime": 143114.962, + "train_tokens_per_second": 4276.452 + }, + { + "epoch": 2.4690347286412253, + "grad_norm": 0.35546875, + "learning_rate": 0.00016878122879604725, + "loss": 1.7889, + "num_input_tokens_seen": 612679680, + "step": 9350, + "train_runtime": 143234.1143, + "train_tokens_per_second": 4277.47 + }, + { + "epoch": 2.4716756899511423, + "grad_norm": 0.3671875, + "learning_rate": 0.00016827587086263194, + "loss": 1.7724, + "num_input_tokens_seen": 613335040, + "step": 9360, + "train_runtime": 143357.6622, + "train_tokens_per_second": 4278.355 + }, + { + "epoch": 2.4743166512610593, + "grad_norm": 0.361328125, + "learning_rate": 0.00016777088654819117, + "loss": 1.7803, + "num_input_tokens_seen": 613990400, + "step": 9370, + "train_runtime": 143476.0368, + "train_tokens_per_second": 4279.393 + }, + { + "epoch": 2.476957612570976, + "grad_norm": 0.353515625, + "learning_rate": 0.00016726627816136664, + "loss": 1.7175, + "num_input_tokens_seen": 614645760, + "step": 9380, + "train_runtime": 143594.8596, + "train_tokens_per_second": 4280.416 + }, + { + "epoch": 2.4795985738808928, + "grad_norm": 0.375, + "learning_rate": 0.00016676204800908107, + "loss": 1.8176, + "num_input_tokens_seen": 615301120, + "step": 9390, + "train_runtime": 143713.5326, + "train_tokens_per_second": 4281.442 + }, + { + "epoch": 2.4822395351908093, + "grad_norm": 0.337890625, + "learning_rate": 0.0001662581983965284, + "loss": 1.7688, + "num_input_tokens_seen": 615956480, + "step": 9400, + "train_runtime": 143831.839, + "train_tokens_per_second": 4282.477 + }, + { + "epoch": 2.4848804965007263, + "grad_norm": 0.337890625, + "learning_rate": 0.00016575473162716247, + "loss": 1.7741, + "num_input_tokens_seen": 616611840, + "step": 9410, + "train_runtime": 143950.6805, + "train_tokens_per_second": 4283.494 + }, + { + "epoch": 2.4875214578106433, + "grad_norm": 0.3515625, + "learning_rate": 0.0001652516500026872, + "loss": 1.8501, + "num_input_tokens_seen": 617267200, + "step": 9420, + "train_runtime": 144069.9304, + "train_tokens_per_second": 4284.497 + }, + { + "epoch": 2.49016241912056, + "grad_norm": 0.345703125, + "learning_rate": 0.00016474895582304562, + "loss": 1.6814, + "num_input_tokens_seen": 617922560, + "step": 9430, + "train_runtime": 144188.5969, + "train_tokens_per_second": 4285.516 + }, + { + "epoch": 2.492803380430477, + "grad_norm": 0.384765625, + "learning_rate": 0.00016424665138640944, + "loss": 1.7343, + "num_input_tokens_seen": 618577920, + "step": 9440, + "train_runtime": 144306.7939, + "train_tokens_per_second": 4286.547 + }, + { + "epoch": 2.4954443417403933, + "grad_norm": 0.361328125, + "learning_rate": 0.0001637447389891686, + "loss": 1.7792, + "num_input_tokens_seen": 619233280, + "step": 9450, + "train_runtime": 144425.3384, + "train_tokens_per_second": 4287.567 + }, + { + "epoch": 2.4980853030503103, + "grad_norm": 0.373046875, + "learning_rate": 0.00016324322092592088, + "loss": 1.7462, + "num_input_tokens_seen": 619888640, + "step": 9460, + "train_runtime": 144545.926, + "train_tokens_per_second": 4288.524 + }, + { + "epoch": 2.5007262643602273, + "grad_norm": 0.359375, + "learning_rate": 0.0001627420994894609, + "loss": 1.7535, + "num_input_tokens_seen": 620544000, + "step": 9470, + "train_runtime": 144665.4447, + "train_tokens_per_second": 4289.511 + }, + { + "epoch": 2.503367225670144, + "grad_norm": 0.369140625, + "learning_rate": 0.00016224137697077047, + "loss": 1.7452, + "num_input_tokens_seen": 621199360, + "step": 9480, + "train_runtime": 144784.5654, + "train_tokens_per_second": 4290.508 + }, + { + "epoch": 2.506008186980061, + "grad_norm": 0.345703125, + "learning_rate": 0.00016174105565900748, + "loss": 1.7659, + "num_input_tokens_seen": 621854720, + "step": 9490, + "train_runtime": 144903.4354, + "train_tokens_per_second": 4291.511 + }, + { + "epoch": 2.5086491482899778, + "grad_norm": 0.34765625, + "learning_rate": 0.00016124113784149547, + "loss": 1.7674, + "num_input_tokens_seen": 622510080, + "step": 9500, + "train_runtime": 145021.8183, + "train_tokens_per_second": 4292.527 + }, + { + "epoch": 2.5112901095998943, + "grad_norm": 0.341796875, + "learning_rate": 0.00016074162580371356, + "loss": 1.8104, + "num_input_tokens_seen": 623165440, + "step": 9510, + "train_runtime": 145154.1453, + "train_tokens_per_second": 4293.129 + }, + { + "epoch": 2.5139310709098113, + "grad_norm": 0.359375, + "learning_rate": 0.00016024252182928562, + "loss": 1.8579, + "num_input_tokens_seen": 623820800, + "step": 9520, + "train_runtime": 145274.2883, + "train_tokens_per_second": 4294.09 + }, + { + "epoch": 2.5165720322197282, + "grad_norm": 0.34375, + "learning_rate": 0.00015974382819996995, + "loss": 1.731, + "num_input_tokens_seen": 624476160, + "step": 9530, + "train_runtime": 145393.8544, + "train_tokens_per_second": 4295.066 + }, + { + "epoch": 2.519212993529645, + "grad_norm": 0.365234375, + "learning_rate": 0.0001592455471956492, + "loss": 1.7521, + "num_input_tokens_seen": 625131520, + "step": 9540, + "train_runtime": 145512.4505, + "train_tokens_per_second": 4296.069 + }, + { + "epoch": 2.5218539548395618, + "grad_norm": 0.35546875, + "learning_rate": 0.00015874768109431898, + "loss": 1.8008, + "num_input_tokens_seen": 625786880, + "step": 9550, + "train_runtime": 145630.963, + "train_tokens_per_second": 4297.073 + }, + { + "epoch": 2.5244949161494783, + "grad_norm": 0.359375, + "learning_rate": 0.00015825023217207868, + "loss": 1.7386, + "num_input_tokens_seen": 626442240, + "step": 9560, + "train_runtime": 145750.7145, + "train_tokens_per_second": 4298.039 + }, + { + "epoch": 2.5271358774593953, + "grad_norm": 0.34765625, + "learning_rate": 0.00015775320270312027, + "loss": 1.7579, + "num_input_tokens_seen": 627097600, + "step": 9570, + "train_runtime": 145869.0005, + "train_tokens_per_second": 4299.046 + }, + { + "epoch": 2.529776838769312, + "grad_norm": 0.353515625, + "learning_rate": 0.00015725659495971795, + "loss": 1.7801, + "num_input_tokens_seen": 627752960, + "step": 9580, + "train_runtime": 145988.3153, + "train_tokens_per_second": 4300.022 + }, + { + "epoch": 2.532417800079229, + "grad_norm": 0.35546875, + "learning_rate": 0.00015676041121221807, + "loss": 1.6687, + "num_input_tokens_seen": 628408320, + "step": 9590, + "train_runtime": 146107.4785, + "train_tokens_per_second": 4301.0 + }, + { + "epoch": 2.5350587613891458, + "grad_norm": 0.359375, + "learning_rate": 0.00015626465372902865, + "loss": 1.7341, + "num_input_tokens_seen": 629063680, + "step": 9600, + "train_runtime": 146228.4682, + "train_tokens_per_second": 4301.923 + }, + { + "epoch": 2.5376997226990623, + "grad_norm": 0.390625, + "learning_rate": 0.0001557693247766088, + "loss": 1.7433, + "num_input_tokens_seen": 629719040, + "step": 9610, + "train_runtime": 146347.5804, + "train_tokens_per_second": 4302.9 + }, + { + "epoch": 2.5403406840089793, + "grad_norm": 0.357421875, + "learning_rate": 0.00015527442661945857, + "loss": 1.729, + "num_input_tokens_seen": 630374400, + "step": 9620, + "train_runtime": 146466.9131, + "train_tokens_per_second": 4303.869 + }, + { + "epoch": 2.5429816453188963, + "grad_norm": 0.353515625, + "learning_rate": 0.00015477996152010859, + "loss": 1.7704, + "num_input_tokens_seen": 631029760, + "step": 9630, + "train_runtime": 146586.2223, + "train_tokens_per_second": 4304.837 + }, + { + "epoch": 2.545622606628813, + "grad_norm": 0.361328125, + "learning_rate": 0.00015428593173910955, + "loss": 1.8101, + "num_input_tokens_seen": 631685120, + "step": 9640, + "train_runtime": 146705.5529, + "train_tokens_per_second": 4305.802 + }, + { + "epoch": 2.5482635679387298, + "grad_norm": 0.345703125, + "learning_rate": 0.00015379233953502226, + "loss": 1.7623, + "num_input_tokens_seen": 632340480, + "step": 9650, + "train_runtime": 146824.4086, + "train_tokens_per_second": 4306.78 + }, + { + "epoch": 2.5509045292486467, + "grad_norm": 0.35546875, + "learning_rate": 0.00015329918716440664, + "loss": 1.7418, + "num_input_tokens_seen": 632995840, + "step": 9660, + "train_runtime": 146944.2577, + "train_tokens_per_second": 4307.728 + }, + { + "epoch": 2.5535454905585633, + "grad_norm": 0.365234375, + "learning_rate": 0.00015280647688181216, + "loss": 1.7479, + "num_input_tokens_seen": 633651200, + "step": 9670, + "train_runtime": 147065.0312, + "train_tokens_per_second": 4308.646 + }, + { + "epoch": 2.5561864518684803, + "grad_norm": 0.3515625, + "learning_rate": 0.00015231421093976716, + "loss": 1.721, + "num_input_tokens_seen": 634306560, + "step": 9680, + "train_runtime": 147183.3892, + "train_tokens_per_second": 4309.634 + }, + { + "epoch": 2.558827413178397, + "grad_norm": 0.345703125, + "learning_rate": 0.00015182239158876833, + "loss": 1.8321, + "num_input_tokens_seen": 634961920, + "step": 9690, + "train_runtime": 147302.1551, + "train_tokens_per_second": 4310.608 + }, + { + "epoch": 2.5614683744883138, + "grad_norm": 0.333984375, + "learning_rate": 0.00015133102107727094, + "loss": 1.7363, + "num_input_tokens_seen": 635617280, + "step": 9700, + "train_runtime": 147420.2693, + "train_tokens_per_second": 4311.6 + }, + { + "epoch": 2.5641093357982303, + "grad_norm": 0.369140625, + "learning_rate": 0.00015084010165167827, + "loss": 1.6908, + "num_input_tokens_seen": 636272640, + "step": 9710, + "train_runtime": 147539.2396, + "train_tokens_per_second": 4312.566 + }, + { + "epoch": 2.5667502971081473, + "grad_norm": 0.34765625, + "learning_rate": 0.00015034963555633118, + "loss": 1.7516, + "num_input_tokens_seen": 636928000, + "step": 9720, + "train_runtime": 147658.2838, + "train_tokens_per_second": 4313.527 + }, + { + "epoch": 2.5693912584180643, + "grad_norm": 0.3515625, + "learning_rate": 0.00014985962503349825, + "loss": 1.7675, + "num_input_tokens_seen": 637583360, + "step": 9730, + "train_runtime": 147777.1215, + "train_tokens_per_second": 4314.493 + }, + { + "epoch": 2.572032219727981, + "grad_norm": 0.365234375, + "learning_rate": 0.0001493700723233653, + "loss": 1.7258, + "num_input_tokens_seen": 638238720, + "step": 9740, + "train_runtime": 147896.5141, + "train_tokens_per_second": 4315.441 + }, + { + "epoch": 2.5746731810378978, + "grad_norm": 0.359375, + "learning_rate": 0.00014888097966402487, + "loss": 1.7838, + "num_input_tokens_seen": 638894080, + "step": 9750, + "train_runtime": 148015.0797, + "train_tokens_per_second": 4316.412 + }, + { + "epoch": 2.5773141423478148, + "grad_norm": 0.37890625, + "learning_rate": 0.00014839234929146672, + "loss": 1.8084, + "num_input_tokens_seen": 639549440, + "step": 9760, + "train_runtime": 148133.9696, + "train_tokens_per_second": 4317.372 + }, + { + "epoch": 2.5799551036577313, + "grad_norm": 0.34765625, + "learning_rate": 0.00014790418343956673, + "loss": 1.7442, + "num_input_tokens_seen": 640204800, + "step": 9770, + "train_runtime": 148252.6304, + "train_tokens_per_second": 4318.337 + }, + { + "epoch": 2.5825960649676483, + "grad_norm": 0.353515625, + "learning_rate": 0.00014741648434007747, + "loss": 1.7685, + "num_input_tokens_seen": 640860160, + "step": 9780, + "train_runtime": 148373.1837, + "train_tokens_per_second": 4319.245 + }, + { + "epoch": 2.5852370262775652, + "grad_norm": 0.349609375, + "learning_rate": 0.0001469292542226176, + "loss": 1.7935, + "num_input_tokens_seen": 641515520, + "step": 9790, + "train_runtime": 148492.2266, + "train_tokens_per_second": 4320.196 + }, + { + "epoch": 2.5878779875874818, + "grad_norm": 0.349609375, + "learning_rate": 0.00014644249531466148, + "loss": 1.7533, + "num_input_tokens_seen": 642170880, + "step": 9800, + "train_runtime": 148611.013, + "train_tokens_per_second": 4321.153 + }, + { + "epoch": 2.5905189488973988, + "grad_norm": 0.375, + "learning_rate": 0.00014595620984152958, + "loss": 1.8039, + "num_input_tokens_seen": 642826240, + "step": 9810, + "train_runtime": 148729.6877, + "train_tokens_per_second": 4322.111 + }, + { + "epoch": 2.5931599102073157, + "grad_norm": 0.34765625, + "learning_rate": 0.00014547040002637775, + "loss": 1.8059, + "num_input_tokens_seen": 643481600, + "step": 9820, + "train_runtime": 148848.1016, + "train_tokens_per_second": 4323.076 + }, + { + "epoch": 2.5958008715172323, + "grad_norm": 0.365234375, + "learning_rate": 0.00014498506809018725, + "loss": 1.7737, + "num_input_tokens_seen": 644136960, + "step": 9830, + "train_runtime": 148967.8001, + "train_tokens_per_second": 4324.001 + }, + { + "epoch": 2.5984418328271492, + "grad_norm": 0.34375, + "learning_rate": 0.00014450021625175466, + "loss": 1.7851, + "num_input_tokens_seen": 644792320, + "step": 9840, + "train_runtime": 149089.5883, + "train_tokens_per_second": 4324.865 + }, + { + "epoch": 2.601082794137066, + "grad_norm": 0.369140625, + "learning_rate": 0.00014401584672768192, + "loss": 1.7162, + "num_input_tokens_seen": 645447680, + "step": 9850, + "train_runtime": 149208.5088, + "train_tokens_per_second": 4325.81 + }, + { + "epoch": 2.6037237554469828, + "grad_norm": 0.3671875, + "learning_rate": 0.0001435319617323656, + "loss": 1.783, + "num_input_tokens_seen": 646103040, + "step": 9860, + "train_runtime": 149327.0397, + "train_tokens_per_second": 4326.765 + }, + { + "epoch": 2.6063647167568993, + "grad_norm": 0.3515625, + "learning_rate": 0.00014304856347798736, + "loss": 1.7032, + "num_input_tokens_seen": 646758400, + "step": 9870, + "train_runtime": 149446.2157, + "train_tokens_per_second": 4327.7 + }, + { + "epoch": 2.6090056780668163, + "grad_norm": 0.3671875, + "learning_rate": 0.00014256565417450356, + "loss": 1.7575, + "num_input_tokens_seen": 647413760, + "step": 9880, + "train_runtime": 149565.2883, + "train_tokens_per_second": 4328.636 + }, + { + "epoch": 2.6116466393767332, + "grad_norm": 0.361328125, + "learning_rate": 0.0001420832360296352, + "loss": 1.7756, + "num_input_tokens_seen": 648069120, + "step": 9890, + "train_runtime": 149684.1139, + "train_tokens_per_second": 4329.578 + }, + { + "epoch": 2.61428760068665, + "grad_norm": 0.333984375, + "learning_rate": 0.00014160131124885806, + "loss": 1.6877, + "num_input_tokens_seen": 648724480, + "step": 9900, + "train_runtime": 149803.3178, + "train_tokens_per_second": 4330.508 + }, + { + "epoch": 2.6169285619965668, + "grad_norm": 0.359375, + "learning_rate": 0.0001411198820353919, + "loss": 1.7656, + "num_input_tokens_seen": 649379840, + "step": 9910, + "train_runtime": 149921.9236, + "train_tokens_per_second": 4331.453 + }, + { + "epoch": 2.6195695233064837, + "grad_norm": 0.353515625, + "learning_rate": 0.00014063895059019145, + "loss": 1.7506, + "num_input_tokens_seen": 650035200, + "step": 9920, + "train_runtime": 150041.0453, + "train_tokens_per_second": 4332.383 + }, + { + "epoch": 2.6222104846164003, + "grad_norm": 0.37109375, + "learning_rate": 0.0001401585191119355, + "loss": 1.7237, + "num_input_tokens_seen": 650690560, + "step": 9930, + "train_runtime": 150160.1682, + "train_tokens_per_second": 4333.31 + }, + { + "epoch": 2.6248514459263173, + "grad_norm": 0.353515625, + "learning_rate": 0.00013967858979701712, + "loss": 1.7054, + "num_input_tokens_seen": 651345920, + "step": 9940, + "train_runtime": 150282.8071, + "train_tokens_per_second": 4334.135 + }, + { + "epoch": 2.6274924072362342, + "grad_norm": 0.34765625, + "learning_rate": 0.00013919916483953382, + "loss": 1.7295, + "num_input_tokens_seen": 652001280, + "step": 9950, + "train_runtime": 150402.2739, + "train_tokens_per_second": 4335.049 + }, + { + "epoch": 2.6301333685461508, + "grad_norm": 0.349609375, + "learning_rate": 0.00013872024643127716, + "loss": 1.7022, + "num_input_tokens_seen": 652656640, + "step": 9960, + "train_runtime": 150521.4317, + "train_tokens_per_second": 4335.972 + }, + { + "epoch": 2.6327743298560677, + "grad_norm": 0.333984375, + "learning_rate": 0.00013824183676172292, + "loss": 1.7629, + "num_input_tokens_seen": 653312000, + "step": 9970, + "train_runtime": 150640.7138, + "train_tokens_per_second": 4336.889 + }, + { + "epoch": 2.6354152911659843, + "grad_norm": 0.359375, + "learning_rate": 0.00013776393801802117, + "loss": 1.7168, + "num_input_tokens_seen": 653967360, + "step": 9980, + "train_runtime": 150759.568, + "train_tokens_per_second": 4337.817 + }, + { + "epoch": 2.6380562524759013, + "grad_norm": 0.353515625, + "learning_rate": 0.0001372865523849861, + "loss": 1.7788, + "num_input_tokens_seen": 654622720, + "step": 9990, + "train_runtime": 150878.3328, + "train_tokens_per_second": 4338.746 + }, + { + "epoch": 2.640697213785818, + "grad_norm": 0.353515625, + "learning_rate": 0.000136809682045086, + "loss": 1.7543, + "num_input_tokens_seen": 655278080, + "step": 10000, + "train_runtime": 150996.925, + "train_tokens_per_second": 4339.678 + }, + { + "epoch": 2.6433381750957348, + "grad_norm": 0.35546875, + "learning_rate": 0.0001363333291784337, + "loss": 1.7494, + "num_input_tokens_seen": 655933440, + "step": 10010, + "train_runtime": 151130.2799, + "train_tokens_per_second": 4340.185 + }, + { + "epoch": 2.6459791364056517, + "grad_norm": 0.349609375, + "learning_rate": 0.00013585749596277608, + "loss": 1.6409, + "num_input_tokens_seen": 656588800, + "step": 10020, + "train_runtime": 151251.6957, + "train_tokens_per_second": 4341.034 + }, + { + "epoch": 2.6486200977155683, + "grad_norm": 0.345703125, + "learning_rate": 0.00013538218457348424, + "loss": 1.6976, + "num_input_tokens_seen": 657244160, + "step": 10030, + "train_runtime": 151370.1323, + "train_tokens_per_second": 4341.967 + }, + { + "epoch": 2.6512610590254853, + "grad_norm": 0.375, + "learning_rate": 0.000134907397183544, + "loss": 1.7745, + "num_input_tokens_seen": 657899520, + "step": 10040, + "train_runtime": 151488.5968, + "train_tokens_per_second": 4342.898 + }, + { + "epoch": 2.6539020203354022, + "grad_norm": 0.361328125, + "learning_rate": 0.000134433135963545, + "loss": 1.7694, + "num_input_tokens_seen": 658554880, + "step": 10050, + "train_runtime": 151608.039, + "train_tokens_per_second": 4343.799 + }, + { + "epoch": 2.6565429816453188, + "grad_norm": 0.37109375, + "learning_rate": 0.00013395940308167203, + "loss": 1.7231, + "num_input_tokens_seen": 659210240, + "step": 10060, + "train_runtime": 151726.6255, + "train_tokens_per_second": 4344.724 + }, + { + "epoch": 2.6591839429552357, + "grad_norm": 0.361328125, + "learning_rate": 0.0001334862007036941, + "loss": 1.7514, + "num_input_tokens_seen": 659865600, + "step": 10070, + "train_runtime": 151845.6218, + "train_tokens_per_second": 4345.635 + }, + { + "epoch": 2.6618249042651527, + "grad_norm": 0.359375, + "learning_rate": 0.00013301353099295506, + "loss": 1.7589, + "num_input_tokens_seen": 660520960, + "step": 10080, + "train_runtime": 151964.4801, + "train_tokens_per_second": 4346.548 + }, + { + "epoch": 2.6644658655750693, + "grad_norm": 0.3515625, + "learning_rate": 0.00013254139611036328, + "loss": 1.7025, + "num_input_tokens_seen": 661176320, + "step": 10090, + "train_runtime": 152084.2543, + "train_tokens_per_second": 4347.434 + }, + { + "epoch": 2.6671068268849862, + "grad_norm": 0.37109375, + "learning_rate": 0.00013206979821438254, + "loss": 1.7462, + "num_input_tokens_seen": 661831680, + "step": 10100, + "train_runtime": 152205.3163, + "train_tokens_per_second": 4348.282 + }, + { + "epoch": 2.669747788194903, + "grad_norm": 0.337890625, + "learning_rate": 0.0001315987394610213, + "loss": 1.7323, + "num_input_tokens_seen": 662487040, + "step": 10110, + "train_runtime": 152325.6475, + "train_tokens_per_second": 4349.15 + }, + { + "epoch": 2.6723887495048197, + "grad_norm": 0.353515625, + "learning_rate": 0.00013112822200382336, + "loss": 1.81, + "num_input_tokens_seen": 663142400, + "step": 10120, + "train_runtime": 152444.303, + "train_tokens_per_second": 4350.064 + }, + { + "epoch": 2.6750297108147367, + "grad_norm": 0.33203125, + "learning_rate": 0.00013065824799385773, + "loss": 1.7513, + "num_input_tokens_seen": 663797760, + "step": 10130, + "train_runtime": 152563.5979, + "train_tokens_per_second": 4350.958 + }, + { + "epoch": 2.6776706721246533, + "grad_norm": 0.349609375, + "learning_rate": 0.00013018881957970903, + "loss": 1.7439, + "num_input_tokens_seen": 664453120, + "step": 10140, + "train_runtime": 152683.481, + "train_tokens_per_second": 4351.834 + }, + { + "epoch": 2.6803116334345702, + "grad_norm": 0.33984375, + "learning_rate": 0.00012971993890746781, + "loss": 1.7359, + "num_input_tokens_seen": 665108480, + "step": 10150, + "train_runtime": 152804.3205, + "train_tokens_per_second": 4352.681 + }, + { + "epoch": 2.6829525947444868, + "grad_norm": 0.341796875, + "learning_rate": 0.00012925160812071994, + "loss": 1.7752, + "num_input_tokens_seen": 665763840, + "step": 10160, + "train_runtime": 152924.5429, + "train_tokens_per_second": 4353.545 + }, + { + "epoch": 2.6855935560544038, + "grad_norm": 0.361328125, + "learning_rate": 0.000128783829360538, + "loss": 1.7361, + "num_input_tokens_seen": 666419200, + "step": 10170, + "train_runtime": 153043.2269, + "train_tokens_per_second": 4354.451 + }, + { + "epoch": 2.6882345173643207, + "grad_norm": 0.353515625, + "learning_rate": 0.00012831660476547046, + "loss": 1.7662, + "num_input_tokens_seen": 667074560, + "step": 10180, + "train_runtime": 153161.5733, + "train_tokens_per_second": 4355.365 + }, + { + "epoch": 2.6908754786742373, + "grad_norm": 0.35546875, + "learning_rate": 0.00012784993647153243, + "loss": 1.6742, + "num_input_tokens_seen": 667729920, + "step": 10190, + "train_runtime": 153280.1511, + "train_tokens_per_second": 4356.271 + }, + { + "epoch": 2.6935164399841542, + "grad_norm": 0.35546875, + "learning_rate": 0.0001273838266121956, + "loss": 1.6736, + "num_input_tokens_seen": 668385280, + "step": 10200, + "train_runtime": 153399.7672, + "train_tokens_per_second": 4357.147 + }, + { + "epoch": 2.696157401294071, + "grad_norm": 0.359375, + "learning_rate": 0.00012691827731837912, + "loss": 1.7415, + "num_input_tokens_seen": 669040640, + "step": 10210, + "train_runtime": 153519.1639, + "train_tokens_per_second": 4358.027 + }, + { + "epoch": 2.6987983626039878, + "grad_norm": 0.3515625, + "learning_rate": 0.000126453290718439, + "loss": 1.7608, + "num_input_tokens_seen": 669696000, + "step": 10220, + "train_runtime": 153638.739, + "train_tokens_per_second": 4358.901 + }, + { + "epoch": 2.7014393239139047, + "grad_norm": 0.365234375, + "learning_rate": 0.0001259888689381588, + "loss": 1.8025, + "num_input_tokens_seen": 670351360, + "step": 10230, + "train_runtime": 153759.6299, + "train_tokens_per_second": 4359.736 + }, + { + "epoch": 2.7040802852238217, + "grad_norm": 0.3515625, + "learning_rate": 0.00012552501410074005, + "loss": 1.7883, + "num_input_tokens_seen": 671006720, + "step": 10240, + "train_runtime": 153879.7124, + "train_tokens_per_second": 4360.593 + }, + { + "epoch": 2.7067212465337382, + "grad_norm": 0.35546875, + "learning_rate": 0.00012506172832679215, + "loss": 1.7846, + "num_input_tokens_seen": 671662080, + "step": 10250, + "train_runtime": 153998.9053, + "train_tokens_per_second": 4361.473 + }, + { + "epoch": 2.7093622078436552, + "grad_norm": 0.345703125, + "learning_rate": 0.00012459901373432333, + "loss": 1.7479, + "num_input_tokens_seen": 672317440, + "step": 10260, + "train_runtime": 154117.2327, + "train_tokens_per_second": 4362.377 + }, + { + "epoch": 2.7120031691535718, + "grad_norm": 0.375, + "learning_rate": 0.00012413687243872996, + "loss": 1.7055, + "num_input_tokens_seen": 672972800, + "step": 10270, + "train_runtime": 154236.4631, + "train_tokens_per_second": 4363.254 + }, + { + "epoch": 2.7146441304634887, + "grad_norm": 0.373046875, + "learning_rate": 0.000123675306552788, + "loss": 1.7548, + "num_input_tokens_seen": 673628160, + "step": 10280, + "train_runtime": 154355.8921, + "train_tokens_per_second": 4364.123 + }, + { + "epoch": 2.7172850917734053, + "grad_norm": 0.3515625, + "learning_rate": 0.00012321431818664252, + "loss": 1.7346, + "num_input_tokens_seen": 674283520, + "step": 10290, + "train_runtime": 154474.3228, + "train_tokens_per_second": 4365.02 + }, + { + "epoch": 2.7199260530833222, + "grad_norm": 0.357421875, + "learning_rate": 0.00012275390944779826, + "loss": 1.7375, + "num_input_tokens_seen": 674938880, + "step": 10300, + "train_runtime": 154593.0733, + "train_tokens_per_second": 4365.906 + }, + { + "epoch": 2.7225670143932392, + "grad_norm": 0.361328125, + "learning_rate": 0.00012229408244111045, + "loss": 1.7223, + "num_input_tokens_seen": 675594240, + "step": 10310, + "train_runtime": 154715.0506, + "train_tokens_per_second": 4366.7 + }, + { + "epoch": 2.7252079757031558, + "grad_norm": 0.3515625, + "learning_rate": 0.00012183483926877442, + "loss": 1.7378, + "num_input_tokens_seen": 676249600, + "step": 10320, + "train_runtime": 154836.8973, + "train_tokens_per_second": 4367.496 + }, + { + "epoch": 2.7278489370130727, + "grad_norm": 0.35546875, + "learning_rate": 0.00012137618203031659, + "loss": 1.6316, + "num_input_tokens_seen": 676904960, + "step": 10330, + "train_runtime": 154957.1985, + "train_tokens_per_second": 4368.335 + }, + { + "epoch": 2.7304898983229897, + "grad_norm": 0.3515625, + "learning_rate": 0.00012091811282258452, + "loss": 1.6752, + "num_input_tokens_seen": 677560320, + "step": 10340, + "train_runtime": 155075.758, + "train_tokens_per_second": 4369.221 + }, + { + "epoch": 2.7331308596329063, + "grad_norm": 0.3515625, + "learning_rate": 0.00012046063373973759, + "loss": 1.7578, + "num_input_tokens_seen": 678215680, + "step": 10350, + "train_runtime": 155195.2428, + "train_tokens_per_second": 4370.08 + }, + { + "epoch": 2.7357718209428232, + "grad_norm": 0.369140625, + "learning_rate": 0.00012000374687323718, + "loss": 1.809, + "num_input_tokens_seen": 678871040, + "step": 10360, + "train_runtime": 155314.0119, + "train_tokens_per_second": 4370.958 + }, + { + "epoch": 2.73841278225274, + "grad_norm": 0.359375, + "learning_rate": 0.00011954745431183742, + "loss": 1.6975, + "num_input_tokens_seen": 679526400, + "step": 10370, + "train_runtime": 155433.363, + "train_tokens_per_second": 4371.818 + }, + { + "epoch": 2.7410537435626567, + "grad_norm": 0.3515625, + "learning_rate": 0.0001190917581415753, + "loss": 1.7345, + "num_input_tokens_seen": 680181760, + "step": 10380, + "train_runtime": 155552.076, + "train_tokens_per_second": 4372.695 + }, + { + "epoch": 2.7436947048725737, + "grad_norm": 0.361328125, + "learning_rate": 0.00011863666044576118, + "loss": 1.6923, + "num_input_tokens_seen": 680837120, + "step": 10390, + "train_runtime": 155671.0341, + "train_tokens_per_second": 4373.563 + }, + { + "epoch": 2.7463356661824907, + "grad_norm": 0.34375, + "learning_rate": 0.00011818216330496981, + "loss": 1.7456, + "num_input_tokens_seen": 681492480, + "step": 10400, + "train_runtime": 155790.2815, + "train_tokens_per_second": 4374.422 + }, + { + "epoch": 2.7489766274924072, + "grad_norm": 0.345703125, + "learning_rate": 0.00011772826879702969, + "loss": 1.6887, + "num_input_tokens_seen": 682147840, + "step": 10410, + "train_runtime": 155908.1093, + "train_tokens_per_second": 4375.32 + }, + { + "epoch": 2.751617588802324, + "grad_norm": 0.3671875, + "learning_rate": 0.00011727497899701489, + "loss": 1.6616, + "num_input_tokens_seen": 682803200, + "step": 10420, + "train_runtime": 156026.2955, + "train_tokens_per_second": 4376.206 + }, + { + "epoch": 2.7542585501122407, + "grad_norm": 0.359375, + "learning_rate": 0.00011682229597723462, + "loss": 1.7517, + "num_input_tokens_seen": 683458560, + "step": 10430, + "train_runtime": 156146.1264, + "train_tokens_per_second": 4377.045 + }, + { + "epoch": 2.7568995114221577, + "grad_norm": 0.35546875, + "learning_rate": 0.00011637022180722412, + "loss": 1.6757, + "num_input_tokens_seen": 684113920, + "step": 10440, + "train_runtime": 156265.4483, + "train_tokens_per_second": 4377.896 + }, + { + "epoch": 2.7595404727320743, + "grad_norm": 0.359375, + "learning_rate": 0.00011591875855373515, + "loss": 1.712, + "num_input_tokens_seen": 684769280, + "step": 10450, + "train_runtime": 156384.4208, + "train_tokens_per_second": 4378.756 + }, + { + "epoch": 2.7621814340419912, + "grad_norm": 0.34375, + "learning_rate": 0.00011546790828072643, + "loss": 1.7351, + "num_input_tokens_seen": 685424640, + "step": 10460, + "train_runtime": 156503.5248, + "train_tokens_per_second": 4379.612 + }, + { + "epoch": 2.764822395351908, + "grad_norm": 0.345703125, + "learning_rate": 0.00011501767304935463, + "loss": 1.7844, + "num_input_tokens_seen": 686080000, + "step": 10470, + "train_runtime": 156623.0679, + "train_tokens_per_second": 4380.453 + }, + { + "epoch": 2.7674633566618247, + "grad_norm": 0.3515625, + "learning_rate": 0.00011456805491796429, + "loss": 1.6747, + "num_input_tokens_seen": 686735360, + "step": 10480, + "train_runtime": 156741.8389, + "train_tokens_per_second": 4381.315 + }, + { + "epoch": 2.7701043179717417, + "grad_norm": 0.380859375, + "learning_rate": 0.00011411905594207889, + "loss": 1.699, + "num_input_tokens_seen": 687390720, + "step": 10490, + "train_runtime": 156862.1683, + "train_tokens_per_second": 4382.132 + }, + { + "epoch": 2.7727452792816587, + "grad_norm": 0.34765625, + "learning_rate": 0.00011367067817439122, + "loss": 1.7249, + "num_input_tokens_seen": 688046080, + "step": 10500, + "train_runtime": 156981.6318, + "train_tokens_per_second": 4382.972 + }, + { + "epoch": 2.7753862405915752, + "grad_norm": 0.37109375, + "learning_rate": 0.00011322292366475442, + "loss": 1.7308, + "num_input_tokens_seen": 688701440, + "step": 10510, + "train_runtime": 157113.7815, + "train_tokens_per_second": 4383.457 + }, + { + "epoch": 2.778027201901492, + "grad_norm": 0.361328125, + "learning_rate": 0.0001127757944601717, + "loss": 1.802, + "num_input_tokens_seen": 689356800, + "step": 10520, + "train_runtime": 157233.1516, + "train_tokens_per_second": 4384.297 + }, + { + "epoch": 2.780668163211409, + "grad_norm": 0.35546875, + "learning_rate": 0.00011232929260478808, + "loss": 1.7407, + "num_input_tokens_seen": 690012160, + "step": 10530, + "train_runtime": 157353.0862, + "train_tokens_per_second": 4385.12 + }, + { + "epoch": 2.7833091245213257, + "grad_norm": 0.37109375, + "learning_rate": 0.00011188342013988026, + "loss": 1.7362, + "num_input_tokens_seen": 690667520, + "step": 10540, + "train_runtime": 157471.115, + "train_tokens_per_second": 4385.995 + }, + { + "epoch": 2.7859500858312427, + "grad_norm": 0.357421875, + "learning_rate": 0.00011143817910384752, + "loss": 1.6644, + "num_input_tokens_seen": 691322880, + "step": 10550, + "train_runtime": 157589.8414, + "train_tokens_per_second": 4386.849 + }, + { + "epoch": 2.7885910471411592, + "grad_norm": 0.34375, + "learning_rate": 0.00011099357153220268, + "loss": 1.7045, + "num_input_tokens_seen": 691978240, + "step": 10560, + "train_runtime": 157709.0865, + "train_tokens_per_second": 4387.688 + }, + { + "epoch": 2.791232008451076, + "grad_norm": 0.359375, + "learning_rate": 0.00011054959945756235, + "loss": 1.7144, + "num_input_tokens_seen": 692633600, + "step": 10570, + "train_runtime": 157827.2305, + "train_tokens_per_second": 4388.556 + }, + { + "epoch": 2.7938729697609928, + "grad_norm": 0.37109375, + "learning_rate": 0.0001101062649096378, + "loss": 1.7182, + "num_input_tokens_seen": 693288960, + "step": 10580, + "train_runtime": 157948.9031, + "train_tokens_per_second": 4389.324 + }, + { + "epoch": 2.7965139310709097, + "grad_norm": 0.380859375, + "learning_rate": 0.00010966356991522578, + "loss": 1.76, + "num_input_tokens_seen": 693944320, + "step": 10590, + "train_runtime": 158067.7946, + "train_tokens_per_second": 4390.169 + }, + { + "epoch": 2.7991548923808267, + "grad_norm": 0.36328125, + "learning_rate": 0.00010922151649819922, + "loss": 1.7266, + "num_input_tokens_seen": 694599680, + "step": 10600, + "train_runtime": 158187.267, + "train_tokens_per_second": 4390.996 + }, + { + "epoch": 2.8017958536907432, + "grad_norm": 0.341796875, + "learning_rate": 0.00010878010667949778, + "loss": 1.7727, + "num_input_tokens_seen": 695255040, + "step": 10610, + "train_runtime": 158306.3972, + "train_tokens_per_second": 4391.832 + }, + { + "epoch": 2.80443681500066, + "grad_norm": 0.369140625, + "learning_rate": 0.00010833934247711915, + "loss": 1.7041, + "num_input_tokens_seen": 695910400, + "step": 10620, + "train_runtime": 158426.0221, + "train_tokens_per_second": 4392.652 + }, + { + "epoch": 2.807077776310577, + "grad_norm": 0.3671875, + "learning_rate": 0.00010789922590610906, + "loss": 1.7687, + "num_input_tokens_seen": 696565760, + "step": 10630, + "train_runtime": 158545.9365, + "train_tokens_per_second": 4393.463 + }, + { + "epoch": 2.8097187376204937, + "grad_norm": 0.365234375, + "learning_rate": 0.00010745975897855262, + "loss": 1.7769, + "num_input_tokens_seen": 697221120, + "step": 10640, + "train_runtime": 158664.6968, + "train_tokens_per_second": 4394.305 + }, + { + "epoch": 2.8123596989304107, + "grad_norm": 0.35546875, + "learning_rate": 0.00010702094370356491, + "loss": 1.7321, + "num_input_tokens_seen": 697876480, + "step": 10650, + "train_runtime": 158783.4357, + "train_tokens_per_second": 4395.147 + }, + { + "epoch": 2.8150006602403277, + "grad_norm": 0.380859375, + "learning_rate": 0.00010658278208728184, + "loss": 1.7564, + "num_input_tokens_seen": 698531840, + "step": 10660, + "train_runtime": 158904.9205, + "train_tokens_per_second": 4395.911 + }, + { + "epoch": 2.8176416215502442, + "grad_norm": 0.36328125, + "learning_rate": 0.00010614527613285118, + "loss": 1.7091, + "num_input_tokens_seen": 699187200, + "step": 10670, + "train_runtime": 159023.0431, + "train_tokens_per_second": 4396.767 + }, + { + "epoch": 2.820282582860161, + "grad_norm": 0.361328125, + "learning_rate": 0.00010570842784042295, + "loss": 1.7171, + "num_input_tokens_seen": 699842560, + "step": 10680, + "train_runtime": 159143.3027, + "train_tokens_per_second": 4397.562 + }, + { + "epoch": 2.822923544170078, + "grad_norm": 0.3671875, + "learning_rate": 0.00010527223920714058, + "loss": 1.6966, + "num_input_tokens_seen": 700497920, + "step": 10690, + "train_runtime": 159260.7033, + "train_tokens_per_second": 4398.435 + }, + { + "epoch": 2.8255645054799947, + "grad_norm": 0.35546875, + "learning_rate": 0.00010483671222713184, + "loss": 1.7572, + "num_input_tokens_seen": 701153280, + "step": 10700, + "train_runtime": 159380.4878, + "train_tokens_per_second": 4399.242 + }, + { + "epoch": 2.8282054667899117, + "grad_norm": 0.345703125, + "learning_rate": 0.00010440184889149951, + "loss": 1.7431, + "num_input_tokens_seen": 701808640, + "step": 10710, + "train_runtime": 159500.2378, + "train_tokens_per_second": 4400.048 + }, + { + "epoch": 2.8308464280998282, + "grad_norm": 0.36328125, + "learning_rate": 0.0001039676511883123, + "loss": 1.7611, + "num_input_tokens_seen": 702464000, + "step": 10720, + "train_runtime": 159618.7691, + "train_tokens_per_second": 4400.886 + }, + { + "epoch": 2.833487389409745, + "grad_norm": 0.365234375, + "learning_rate": 0.00010353412110259621, + "loss": 1.7719, + "num_input_tokens_seen": 703119360, + "step": 10730, + "train_runtime": 159737.3516, + "train_tokens_per_second": 4401.722 + }, + { + "epoch": 2.8361283507196617, + "grad_norm": 0.365234375, + "learning_rate": 0.00010310126061632469, + "loss": 1.7507, + "num_input_tokens_seen": 703774720, + "step": 10740, + "train_runtime": 159858.7302, + "train_tokens_per_second": 4402.479 + }, + { + "epoch": 2.8387693120295787, + "grad_norm": 0.35546875, + "learning_rate": 0.00010266907170841006, + "loss": 1.7133, + "num_input_tokens_seen": 704430080, + "step": 10750, + "train_runtime": 159981.1148, + "train_tokens_per_second": 4403.208 + }, + { + "epoch": 2.8414102733394957, + "grad_norm": 0.359375, + "learning_rate": 0.00010223755635469467, + "loss": 1.6978, + "num_input_tokens_seen": 705085440, + "step": 10760, + "train_runtime": 160100.2728, + "train_tokens_per_second": 4404.024 + }, + { + "epoch": 2.8440512346494122, + "grad_norm": 0.353515625, + "learning_rate": 0.00010180671652794105, + "loss": 1.7199, + "num_input_tokens_seen": 705740800, + "step": 10770, + "train_runtime": 160219.6657, + "train_tokens_per_second": 4404.833 + }, + { + "epoch": 2.846692195959329, + "grad_norm": 0.36328125, + "learning_rate": 0.0001013765541978239, + "loss": 1.7775, + "num_input_tokens_seen": 706396160, + "step": 10780, + "train_runtime": 160338.3002, + "train_tokens_per_second": 4405.661 + }, + { + "epoch": 2.849333157269246, + "grad_norm": 0.376953125, + "learning_rate": 0.0001009470713309204, + "loss": 1.6821, + "num_input_tokens_seen": 707051520, + "step": 10790, + "train_runtime": 160456.5683, + "train_tokens_per_second": 4406.498 + }, + { + "epoch": 2.8519741185791627, + "grad_norm": 0.36328125, + "learning_rate": 0.00010051826989070142, + "loss": 1.7831, + "num_input_tokens_seen": 707706880, + "step": 10800, + "train_runtime": 160578.6246, + "train_tokens_per_second": 4407.23 + }, + { + "epoch": 2.8546150798890797, + "grad_norm": 0.36328125, + "learning_rate": 0.00010009015183752251, + "loss": 1.7763, + "num_input_tokens_seen": 708362240, + "step": 10810, + "train_runtime": 160698.0841, + "train_tokens_per_second": 4408.032 + }, + { + "epoch": 2.8572560411989967, + "grad_norm": 0.361328125, + "learning_rate": 9.966271912861502e-05, + "loss": 1.7499, + "num_input_tokens_seen": 709017600, + "step": 10820, + "train_runtime": 160816.1067, + "train_tokens_per_second": 4408.872 + }, + { + "epoch": 2.859897002508913, + "grad_norm": 0.37109375, + "learning_rate": 9.923597371807722e-05, + "loss": 1.7093, + "num_input_tokens_seen": 709672960, + "step": 10830, + "train_runtime": 160934.861, + "train_tokens_per_second": 4409.691 + }, + { + "epoch": 2.86253796381883, + "grad_norm": 0.359375, + "learning_rate": 9.880991755686508e-05, + "loss": 1.7036, + "num_input_tokens_seen": 710328320, + "step": 10840, + "train_runtime": 161060.4538, + "train_tokens_per_second": 4410.321 + }, + { + "epoch": 2.8651789251287467, + "grad_norm": 0.345703125, + "learning_rate": 9.838455259278358e-05, + "loss": 1.6477, + "num_input_tokens_seen": 710983680, + "step": 10850, + "train_runtime": 161192.2855, + "train_tokens_per_second": 4410.78 + }, + { + "epoch": 2.8678198864386637, + "grad_norm": 0.37109375, + "learning_rate": 9.795988077047768e-05, + "loss": 1.7531, + "num_input_tokens_seen": 711639040, + "step": 10860, + "train_runtime": 161325.0553, + "train_tokens_per_second": 4411.212 + }, + { + "epoch": 2.8704608477485802, + "grad_norm": 0.359375, + "learning_rate": 9.753590403142381e-05, + "loss": 1.7527, + "num_input_tokens_seen": 712294400, + "step": 10870, + "train_runtime": 161458.6546, + "train_tokens_per_second": 4411.621 + }, + { + "epoch": 2.873101809058497, + "grad_norm": 0.380859375, + "learning_rate": 9.711262431392009e-05, + "loss": 1.6999, + "num_input_tokens_seen": 712949760, + "step": 10880, + "train_runtime": 161591.4557, + "train_tokens_per_second": 4412.051 + }, + { + "epoch": 2.875742770368414, + "grad_norm": 0.34765625, + "learning_rate": 9.669004355307868e-05, + "loss": 1.722, + "num_input_tokens_seen": 713605120, + "step": 10890, + "train_runtime": 161729.3806, + "train_tokens_per_second": 4412.341 + }, + { + "epoch": 2.8783837316783307, + "grad_norm": 0.361328125, + "learning_rate": 9.626816368081595e-05, + "loss": 1.7297, + "num_input_tokens_seen": 714260480, + "step": 10900, + "train_runtime": 161906.5229, + "train_tokens_per_second": 4411.561 + }, + { + "epoch": 2.8810246929882477, + "grad_norm": 0.353515625, + "learning_rate": 9.584698662584404e-05, + "loss": 1.7523, + "num_input_tokens_seen": 714915840, + "step": 10910, + "train_runtime": 162092.9704, + "train_tokens_per_second": 4410.53 + }, + { + "epoch": 2.8836656542981647, + "grad_norm": 0.35546875, + "learning_rate": 9.542651431366231e-05, + "loss": 1.7262, + "num_input_tokens_seen": 715571200, + "step": 10920, + "train_runtime": 162275.9244, + "train_tokens_per_second": 4409.596 + }, + { + "epoch": 2.886306615608081, + "grad_norm": 0.357421875, + "learning_rate": 9.500674866654768e-05, + "loss": 1.7127, + "num_input_tokens_seen": 716226560, + "step": 10930, + "train_runtime": 162462.9486, + "train_tokens_per_second": 4408.553 + }, + { + "epoch": 2.888947576917998, + "grad_norm": 0.3671875, + "learning_rate": 9.4587691603547e-05, + "loss": 1.7462, + "num_input_tokens_seen": 716881920, + "step": 10940, + "train_runtime": 162647.4856, + "train_tokens_per_second": 4407.581 + }, + { + "epoch": 2.891588538227915, + "grad_norm": 0.341796875, + "learning_rate": 9.416934504046725e-05, + "loss": 1.7307, + "num_input_tokens_seen": 717537280, + "step": 10950, + "train_runtime": 162831.6973, + "train_tokens_per_second": 4406.619 + }, + { + "epoch": 2.8942294995378317, + "grad_norm": 0.37109375, + "learning_rate": 9.375171088986747e-05, + "loss": 1.6707, + "num_input_tokens_seen": 718192640, + "step": 10960, + "train_runtime": 163015.2947, + "train_tokens_per_second": 4405.676 + }, + { + "epoch": 2.8968704608477487, + "grad_norm": 0.36328125, + "learning_rate": 9.333479106104954e-05, + "loss": 1.7345, + "num_input_tokens_seen": 718848000, + "step": 10970, + "train_runtime": 163200.4999, + "train_tokens_per_second": 4404.692 + }, + { + "epoch": 2.8995114221576657, + "grad_norm": 0.359375, + "learning_rate": 9.291858746004995e-05, + "loss": 1.7042, + "num_input_tokens_seen": 719503360, + "step": 10980, + "train_runtime": 163398.6307, + "train_tokens_per_second": 4403.362 + }, + { + "epoch": 2.902152383467582, + "grad_norm": 0.359375, + "learning_rate": 9.250310198963052e-05, + "loss": 1.7219, + "num_input_tokens_seen": 720158720, + "step": 10990, + "train_runtime": 163604.6978, + "train_tokens_per_second": 4401.822 + }, + { + "epoch": 2.904793344777499, + "grad_norm": 0.369140625, + "learning_rate": 9.208833654927019e-05, + "loss": 1.6894, + "num_input_tokens_seen": 720814080, + "step": 11000, + "train_runtime": 163811.6561, + "train_tokens_per_second": 4400.261 + }, + { + "epoch": 2.9074343060874157, + "grad_norm": 0.341796875, + "learning_rate": 9.167429303515596e-05, + "loss": 1.6541, + "num_input_tokens_seen": 721469440, + "step": 11010, + "train_runtime": 164028.1648, + "train_tokens_per_second": 4398.449 + }, + { + "epoch": 2.9100752673973327, + "grad_norm": 0.373046875, + "learning_rate": 9.126097334017447e-05, + "loss": 1.6768, + "num_input_tokens_seen": 722124800, + "step": 11020, + "train_runtime": 164235.0251, + "train_tokens_per_second": 4396.899 + }, + { + "epoch": 2.912716228707249, + "grad_norm": 0.34375, + "learning_rate": 9.084837935390347e-05, + "loss": 1.6783, + "num_input_tokens_seen": 722780160, + "step": 11030, + "train_runtime": 164441.5491, + "train_tokens_per_second": 4395.362 + }, + { + "epoch": 2.915357190017166, + "grad_norm": 0.412109375, + "learning_rate": 9.043651296260253e-05, + "loss": 1.7177, + "num_input_tokens_seen": 723435520, + "step": 11040, + "train_runtime": 164647.0842, + "train_tokens_per_second": 4393.856 + }, + { + "epoch": 2.917998151327083, + "grad_norm": 0.353515625, + "learning_rate": 9.00253760492053e-05, + "loss": 1.7235, + "num_input_tokens_seen": 724090880, + "step": 11050, + "train_runtime": 164853.7354, + "train_tokens_per_second": 4392.323 + }, + { + "epoch": 2.9206391126369997, + "grad_norm": 0.373046875, + "learning_rate": 8.961497049331027e-05, + "loss": 1.7528, + "num_input_tokens_seen": 724746240, + "step": 11060, + "train_runtime": 165061.0799, + "train_tokens_per_second": 4390.776 + }, + { + "epoch": 2.9232800739469167, + "grad_norm": 0.3671875, + "learning_rate": 8.920529817117237e-05, + "loss": 1.7676, + "num_input_tokens_seen": 725401600, + "step": 11070, + "train_runtime": 165266.6018, + "train_tokens_per_second": 4389.281 + }, + { + "epoch": 2.9259210352568337, + "grad_norm": 0.359375, + "learning_rate": 8.879636095569438e-05, + "loss": 1.6574, + "num_input_tokens_seen": 726056960, + "step": 11080, + "train_runtime": 165472.1923, + "train_tokens_per_second": 4387.788 + }, + { + "epoch": 2.92856199656675, + "grad_norm": 0.34765625, + "learning_rate": 8.838816071641856e-05, + "loss": 1.6925, + "num_input_tokens_seen": 726712320, + "step": 11090, + "train_runtime": 165676.9164, + "train_tokens_per_second": 4386.322 + }, + { + "epoch": 2.931202957876667, + "grad_norm": 0.3515625, + "learning_rate": 8.79806993195178e-05, + "loss": 1.7012, + "num_input_tokens_seen": 727367680, + "step": 11100, + "train_runtime": 165880.6202, + "train_tokens_per_second": 4384.886 + }, + { + "epoch": 2.933843919186584, + "grad_norm": 0.361328125, + "learning_rate": 8.757397862778704e-05, + "loss": 1.7296, + "num_input_tokens_seen": 728023040, + "step": 11110, + "train_runtime": 166086.5299, + "train_tokens_per_second": 4383.396 + }, + { + "epoch": 2.9364848804965007, + "grad_norm": 0.353515625, + "learning_rate": 8.716800050063545e-05, + "loss": 1.6135, + "num_input_tokens_seen": 728678400, + "step": 11120, + "train_runtime": 166291.6164, + "train_tokens_per_second": 4381.931 + }, + { + "epoch": 2.9391258418064177, + "grad_norm": 0.384765625, + "learning_rate": 8.676276679407671e-05, + "loss": 1.7623, + "num_input_tokens_seen": 729333760, + "step": 11130, + "train_runtime": 166498.2747, + "train_tokens_per_second": 4380.428 + }, + { + "epoch": 2.941766803116334, + "grad_norm": 0.357421875, + "learning_rate": 8.635827936072183e-05, + "loss": 1.7493, + "num_input_tokens_seen": 729989120, + "step": 11140, + "train_runtime": 166703.3736, + "train_tokens_per_second": 4378.97 + }, + { + "epoch": 2.944407764426251, + "grad_norm": 0.349609375, + "learning_rate": 8.595454004976977e-05, + "loss": 1.6727, + "num_input_tokens_seen": 730644480, + "step": 11150, + "train_runtime": 166908.2835, + "train_tokens_per_second": 4377.521 + }, + { + "epoch": 2.9470487257361677, + "grad_norm": 0.34375, + "learning_rate": 8.555155070699935e-05, + "loss": 1.6715, + "num_input_tokens_seen": 731299840, + "step": 11160, + "train_runtime": 167114.1981, + "train_tokens_per_second": 4376.049 + }, + { + "epoch": 2.9496896870460847, + "grad_norm": 0.35546875, + "learning_rate": 8.514931317476076e-05, + "loss": 1.6644, + "num_input_tokens_seen": 731955200, + "step": 11170, + "train_runtime": 167319.6064, + "train_tokens_per_second": 4374.593 + }, + { + "epoch": 2.9523306483560017, + "grad_norm": 0.359375, + "learning_rate": 8.474782929196705e-05, + "loss": 1.6907, + "num_input_tokens_seen": 732610560, + "step": 11180, + "train_runtime": 167524.7882, + "train_tokens_per_second": 4373.147 + }, + { + "epoch": 2.954971609665918, + "grad_norm": 0.37109375, + "learning_rate": 8.434710089408609e-05, + "loss": 1.6984, + "num_input_tokens_seen": 733265920, + "step": 11190, + "train_runtime": 167728.7061, + "train_tokens_per_second": 4371.738 + }, + { + "epoch": 2.957612570975835, + "grad_norm": 0.341796875, + "learning_rate": 8.394712981313155e-05, + "loss": 1.7113, + "num_input_tokens_seen": 733921280, + "step": 11200, + "train_runtime": 167932.659, + "train_tokens_per_second": 4370.331 + }, + { + "epoch": 2.960253532285752, + "grad_norm": 0.36328125, + "learning_rate": 8.354791787765503e-05, + "loss": 1.7431, + "num_input_tokens_seen": 734576640, + "step": 11210, + "train_runtime": 168135.7357, + "train_tokens_per_second": 4368.95 + }, + { + "epoch": 2.9628944935956687, + "grad_norm": 0.34765625, + "learning_rate": 8.314946691273742e-05, + "loss": 1.6599, + "num_input_tokens_seen": 735232000, + "step": 11220, + "train_runtime": 168338.9372, + "train_tokens_per_second": 4367.569 + }, + { + "epoch": 2.9655354549055857, + "grad_norm": 0.36328125, + "learning_rate": 8.275177873998105e-05, + "loss": 1.7044, + "num_input_tokens_seen": 735887360, + "step": 11230, + "train_runtime": 168541.4777, + "train_tokens_per_second": 4366.209 + }, + { + "epoch": 2.9681764162155027, + "grad_norm": 0.3515625, + "learning_rate": 8.235485517750032e-05, + "loss": 1.733, + "num_input_tokens_seen": 736542720, + "step": 11240, + "train_runtime": 168744.6864, + "train_tokens_per_second": 4364.835 + }, + { + "epoch": 2.970817377525419, + "grad_norm": 0.369140625, + "learning_rate": 8.19586980399147e-05, + "loss": 1.7647, + "num_input_tokens_seen": 737198080, + "step": 11250, + "train_runtime": 168948.3139, + "train_tokens_per_second": 4363.453 + }, + { + "epoch": 2.973458338835336, + "grad_norm": 0.359375, + "learning_rate": 8.156330913833948e-05, + "loss": 1.7157, + "num_input_tokens_seen": 737853440, + "step": 11260, + "train_runtime": 169151.1825, + "train_tokens_per_second": 4362.094 + }, + { + "epoch": 2.976099300145253, + "grad_norm": 0.369140625, + "learning_rate": 8.116869028037774e-05, + "loss": 1.7479, + "num_input_tokens_seen": 738508800, + "step": 11270, + "train_runtime": 169354.1482, + "train_tokens_per_second": 4360.736 + }, + { + "epoch": 2.9787402614551697, + "grad_norm": 0.35546875, + "learning_rate": 8.077484327011248e-05, + "loss": 1.7005, + "num_input_tokens_seen": 739164160, + "step": 11280, + "train_runtime": 169557.8097, + "train_tokens_per_second": 4359.364 + }, + { + "epoch": 2.981381222765086, + "grad_norm": 0.359375, + "learning_rate": 8.038176990809748e-05, + "loss": 1.7275, + "num_input_tokens_seen": 739819520, + "step": 11290, + "train_runtime": 169762.6635, + "train_tokens_per_second": 4357.964 + }, + { + "epoch": 2.984022184075003, + "grad_norm": 0.349609375, + "learning_rate": 7.998947199135017e-05, + "loss": 1.7322, + "num_input_tokens_seen": 740474880, + "step": 11300, + "train_runtime": 169964.0397, + "train_tokens_per_second": 4356.656 + }, + { + "epoch": 2.98666314538492, + "grad_norm": 0.34765625, + "learning_rate": 7.959795131334263e-05, + "loss": 1.708, + "num_input_tokens_seen": 741130240, + "step": 11310, + "train_runtime": 170169.0421, + "train_tokens_per_second": 4355.259 + }, + { + "epoch": 2.9893041066948367, + "grad_norm": 0.345703125, + "learning_rate": 7.920720966399361e-05, + "loss": 1.6864, + "num_input_tokens_seen": 741785600, + "step": 11320, + "train_runtime": 170374.6853, + "train_tokens_per_second": 4353.849 + }, + { + "epoch": 2.9919450680047537, + "grad_norm": 0.359375, + "learning_rate": 7.881724882966031e-05, + "loss": 1.7189, + "num_input_tokens_seen": 742440960, + "step": 11330, + "train_runtime": 170579.1932, + "train_tokens_per_second": 4352.471 + }, + { + "epoch": 2.9945860293146707, + "grad_norm": 0.3984375, + "learning_rate": 7.842807059313056e-05, + "loss": 1.6928, + "num_input_tokens_seen": 743096320, + "step": 11340, + "train_runtime": 170783.4647, + "train_tokens_per_second": 4351.102 + }, + { + "epoch": 2.997226990624587, + "grad_norm": 0.353515625, + "learning_rate": 7.80396767336141e-05, + "loss": 1.6561, + "num_input_tokens_seen": 743751680, + "step": 11350, + "train_runtime": 170988.5409, + "train_tokens_per_second": 4349.717 + }, + { + "epoch": 2.999867951934504, + "grad_norm": 0.353515625, + "learning_rate": 7.765206902673478e-05, + "loss": 1.6885, + "num_input_tokens_seen": 744407040, + "step": 11360, + "train_runtime": 171193.267, + "train_tokens_per_second": 4348.343 + }, + { + "epoch": 3.0023768651789253, + "grad_norm": 0.48828125, + "learning_rate": 7.726524924452247e-05, + "loss": 1.366, + "num_input_tokens_seen": 745021440, + "step": 11370, + "train_runtime": 171385.9885, + "train_tokens_per_second": 4347.038 + }, + { + "epoch": 3.005017826488842, + "grad_norm": 0.50390625, + "learning_rate": 7.687921915540469e-05, + "loss": 1.2321, + "num_input_tokens_seen": 745676800, + "step": 11380, + "train_runtime": 171592.3094, + "train_tokens_per_second": 4345.631 + }, + { + "epoch": 3.007658787798759, + "grad_norm": 0.42578125, + "learning_rate": 7.649398052419918e-05, + "loss": 1.2529, + "num_input_tokens_seen": 746332160, + "step": 11390, + "train_runtime": 171795.8175, + "train_tokens_per_second": 4344.298 + }, + { + "epoch": 3.0102997491086754, + "grad_norm": 0.421875, + "learning_rate": 7.610953511210461e-05, + "loss": 1.2261, + "num_input_tokens_seen": 746987520, + "step": 11400, + "train_runtime": 172001.2238, + "train_tokens_per_second": 4342.92 + }, + { + "epoch": 3.0129407104185923, + "grad_norm": 0.447265625, + "learning_rate": 7.572588467669403e-05, + "loss": 1.237, + "num_input_tokens_seen": 747642880, + "step": 11410, + "train_runtime": 172206.9653, + "train_tokens_per_second": 4341.537 + }, + { + "epoch": 3.0155816717285093, + "grad_norm": 0.4453125, + "learning_rate": 7.534303097190565e-05, + "loss": 1.1882, + "num_input_tokens_seen": 748298240, + "step": 11420, + "train_runtime": 172412.2409, + "train_tokens_per_second": 4340.169 + }, + { + "epoch": 3.018222633038426, + "grad_norm": 0.447265625, + "learning_rate": 7.49609757480354e-05, + "loss": 1.1744, + "num_input_tokens_seen": 748953600, + "step": 11430, + "train_runtime": 172617.4078, + "train_tokens_per_second": 4338.807 + }, + { + "epoch": 3.020863594348343, + "grad_norm": 0.4296875, + "learning_rate": 7.457972075172864e-05, + "loss": 1.2523, + "num_input_tokens_seen": 749608960, + "step": 11440, + "train_runtime": 172822.1584, + "train_tokens_per_second": 4337.459 + }, + { + "epoch": 3.02350455565826, + "grad_norm": 0.435546875, + "learning_rate": 7.419926772597266e-05, + "loss": 1.2435, + "num_input_tokens_seen": 750264320, + "step": 11450, + "train_runtime": 173026.6398, + "train_tokens_per_second": 4336.12 + }, + { + "epoch": 3.0261455169681764, + "grad_norm": 0.458984375, + "learning_rate": 7.381961841008802e-05, + "loss": 1.2082, + "num_input_tokens_seen": 750919680, + "step": 11460, + "train_runtime": 173229.1934, + "train_tokens_per_second": 4334.833 + }, + { + "epoch": 3.0287864782780933, + "grad_norm": 0.498046875, + "learning_rate": 7.344077453972106e-05, + "loss": 1.1764, + "num_input_tokens_seen": 751575040, + "step": 11470, + "train_runtime": 173434.0962, + "train_tokens_per_second": 4333.491 + }, + { + "epoch": 3.03142743958801, + "grad_norm": 0.451171875, + "learning_rate": 7.306273784683609e-05, + "loss": 1.1778, + "num_input_tokens_seen": 752230400, + "step": 11480, + "train_runtime": 173639.6384, + "train_tokens_per_second": 4332.135 + }, + { + "epoch": 3.034068400897927, + "grad_norm": 0.435546875, + "learning_rate": 7.268551005970672e-05, + "loss": 1.2463, + "num_input_tokens_seen": 752885760, + "step": 11490, + "train_runtime": 173844.5667, + "train_tokens_per_second": 4330.798 + }, + { + "epoch": 3.036709362207844, + "grad_norm": 0.466796875, + "learning_rate": 7.230909290290916e-05, + "loss": 1.2593, + "num_input_tokens_seen": 753541120, + "step": 11500, + "train_runtime": 174048.9046, + "train_tokens_per_second": 4329.479 + }, + { + "epoch": 3.0393503235177604, + "grad_norm": 0.478515625, + "learning_rate": 7.19334880973129e-05, + "loss": 1.2332, + "num_input_tokens_seen": 754196480, + "step": 11510, + "train_runtime": 174262.4978, + "train_tokens_per_second": 4327.933 + }, + { + "epoch": 3.0419912848276773, + "grad_norm": 0.455078125, + "learning_rate": 7.155869736007428e-05, + "loss": 1.1999, + "num_input_tokens_seen": 754851840, + "step": 11520, + "train_runtime": 174466.1751, + "train_tokens_per_second": 4326.637 + }, + { + "epoch": 3.044632246137594, + "grad_norm": 0.46875, + "learning_rate": 7.118472240462753e-05, + "loss": 1.1875, + "num_input_tokens_seen": 755507200, + "step": 11530, + "train_runtime": 174667.5255, + "train_tokens_per_second": 4325.402 + }, + { + "epoch": 3.047273207447511, + "grad_norm": 0.451171875, + "learning_rate": 7.081156494067747e-05, + "loss": 1.2414, + "num_input_tokens_seen": 756162560, + "step": 11540, + "train_runtime": 174869.0498, + "train_tokens_per_second": 4324.165 + }, + { + "epoch": 3.049914168757428, + "grad_norm": 0.47265625, + "learning_rate": 7.043922667419173e-05, + "loss": 1.2012, + "num_input_tokens_seen": 756817920, + "step": 11550, + "train_runtime": 175072.7246, + "train_tokens_per_second": 4322.877 + }, + { + "epoch": 3.0525551300673444, + "grad_norm": 0.46875, + "learning_rate": 7.006770930739263e-05, + "loss": 1.2022, + "num_input_tokens_seen": 757473280, + "step": 11560, + "train_runtime": 175275.7965, + "train_tokens_per_second": 4321.608 + }, + { + "epoch": 3.0551960913772613, + "grad_norm": 0.466796875, + "learning_rate": 6.96970145387496e-05, + "loss": 1.1891, + "num_input_tokens_seen": 758128640, + "step": 11570, + "train_runtime": 175477.3259, + "train_tokens_per_second": 4320.379 + }, + { + "epoch": 3.0578370526871783, + "grad_norm": 0.484375, + "learning_rate": 6.932714406297136e-05, + "loss": 1.2574, + "num_input_tokens_seen": 758784000, + "step": 11580, + "train_runtime": 175679.6596, + "train_tokens_per_second": 4319.134 + }, + { + "epoch": 3.060478013997095, + "grad_norm": 0.44921875, + "learning_rate": 6.89580995709985e-05, + "loss": 1.2055, + "num_input_tokens_seen": 759439360, + "step": 11590, + "train_runtime": 175881.5726, + "train_tokens_per_second": 4317.902 + }, + { + "epoch": 3.063118975307012, + "grad_norm": 0.443359375, + "learning_rate": 6.858988274999492e-05, + "loss": 1.2026, + "num_input_tokens_seen": 760094720, + "step": 11600, + "train_runtime": 176083.1278, + "train_tokens_per_second": 4316.681 + }, + { + "epoch": 3.0657599366169284, + "grad_norm": 0.462890625, + "learning_rate": 6.822249528334115e-05, + "loss": 1.2139, + "num_input_tokens_seen": 760750080, + "step": 11610, + "train_runtime": 176285.2639, + "train_tokens_per_second": 4315.449 + }, + { + "epoch": 3.0684008979268453, + "grad_norm": 0.474609375, + "learning_rate": 6.785593885062588e-05, + "loss": 1.1767, + "num_input_tokens_seen": 761405440, + "step": 11620, + "train_runtime": 176488.0127, + "train_tokens_per_second": 4314.205 + }, + { + "epoch": 3.0710418592367623, + "grad_norm": 0.8125, + "learning_rate": 6.749021512763856e-05, + "loss": 1.2371, + "num_input_tokens_seen": 762060800, + "step": 11630, + "train_runtime": 176689.3264, + "train_tokens_per_second": 4312.999 + }, + { + "epoch": 3.073682820546679, + "grad_norm": 0.4609375, + "learning_rate": 6.712532578636199e-05, + "loss": 1.2089, + "num_input_tokens_seen": 762716160, + "step": 11640, + "train_runtime": 176892.066, + "train_tokens_per_second": 4311.76 + }, + { + "epoch": 3.076323781856596, + "grad_norm": 0.47265625, + "learning_rate": 6.676127249496396e-05, + "loss": 1.19, + "num_input_tokens_seen": 763371520, + "step": 11650, + "train_runtime": 177095.7753, + "train_tokens_per_second": 4310.501 + }, + { + "epoch": 3.078964743166513, + "grad_norm": 0.44921875, + "learning_rate": 6.639805691779057e-05, + "loss": 1.2058, + "num_input_tokens_seen": 764026880, + "step": 11660, + "train_runtime": 177299.3681, + "train_tokens_per_second": 4309.248 + }, + { + "epoch": 3.0816057044764293, + "grad_norm": 0.52734375, + "learning_rate": 6.603568071535782e-05, + "loss": 1.2047, + "num_input_tokens_seen": 764682240, + "step": 11670, + "train_runtime": 177500.3616, + "train_tokens_per_second": 4308.06 + }, + { + "epoch": 3.0842466657863463, + "grad_norm": 0.455078125, + "learning_rate": 6.567414554434442e-05, + "loss": 1.1677, + "num_input_tokens_seen": 765337600, + "step": 11680, + "train_runtime": 177703.4506, + "train_tokens_per_second": 4306.825 + }, + { + "epoch": 3.086887627096263, + "grad_norm": 0.478515625, + "learning_rate": 6.531345305758405e-05, + "loss": 1.2848, + "num_input_tokens_seen": 765992960, + "step": 11690, + "train_runtime": 177906.3446, + "train_tokens_per_second": 4305.597 + }, + { + "epoch": 3.08952858840618, + "grad_norm": 0.4609375, + "learning_rate": 6.495360490405816e-05, + "loss": 1.2277, + "num_input_tokens_seen": 766648320, + "step": 11700, + "train_runtime": 178109.5755, + "train_tokens_per_second": 4304.363 + }, + { + "epoch": 3.092169549716097, + "grad_norm": 0.4375, + "learning_rate": 6.459460272888781e-05, + "loss": 1.1662, + "num_input_tokens_seen": 767303680, + "step": 11710, + "train_runtime": 178311.4916, + "train_tokens_per_second": 4303.164 + }, + { + "epoch": 3.0948105110260133, + "grad_norm": 0.498046875, + "learning_rate": 6.423644817332666e-05, + "loss": 1.1641, + "num_input_tokens_seen": 767959040, + "step": 11720, + "train_runtime": 178513.9369, + "train_tokens_per_second": 4301.956 + }, + { + "epoch": 3.0974514723359303, + "grad_norm": 0.4765625, + "learning_rate": 6.387914287475344e-05, + "loss": 1.1757, + "num_input_tokens_seen": 768614400, + "step": 11730, + "train_runtime": 178718.9456, + "train_tokens_per_second": 4300.688 + }, + { + "epoch": 3.1000924336458473, + "grad_norm": 0.478515625, + "learning_rate": 6.352268846666387e-05, + "loss": 1.2153, + "num_input_tokens_seen": 769269760, + "step": 11740, + "train_runtime": 178919.3015, + "train_tokens_per_second": 4299.535 + }, + { + "epoch": 3.102733394955764, + "grad_norm": 0.458984375, + "learning_rate": 6.316708657866427e-05, + "loss": 1.1833, + "num_input_tokens_seen": 769925120, + "step": 11750, + "train_runtime": 179121.932, + "train_tokens_per_second": 4298.33 + }, + { + "epoch": 3.105374356265681, + "grad_norm": 0.466796875, + "learning_rate": 6.281233883646282e-05, + "loss": 1.1648, + "num_input_tokens_seen": 770580480, + "step": 11760, + "train_runtime": 179322.9721, + "train_tokens_per_second": 4297.165 + }, + { + "epoch": 3.1080153175755973, + "grad_norm": 0.462890625, + "learning_rate": 6.24584468618634e-05, + "loss": 1.2042, + "num_input_tokens_seen": 771235840, + "step": 11770, + "train_runtime": 179526.3532, + "train_tokens_per_second": 4295.948 + }, + { + "epoch": 3.1106562788855143, + "grad_norm": 0.46875, + "learning_rate": 6.210541227275715e-05, + "loss": 1.1574, + "num_input_tokens_seen": 771891200, + "step": 11780, + "train_runtime": 179730.1261, + "train_tokens_per_second": 4294.724 + }, + { + "epoch": 3.1132972401954313, + "grad_norm": 0.486328125, + "learning_rate": 6.175323668311564e-05, + "loss": 1.1767, + "num_input_tokens_seen": 772546560, + "step": 11790, + "train_runtime": 179934.1076, + "train_tokens_per_second": 4293.497 + }, + { + "epoch": 3.115938201505348, + "grad_norm": 0.46875, + "learning_rate": 6.140192170298347e-05, + "loss": 1.1836, + "num_input_tokens_seen": 773201920, + "step": 11800, + "train_runtime": 180138.7366, + "train_tokens_per_second": 4292.258 + }, + { + "epoch": 3.118579162815265, + "grad_norm": 0.462890625, + "learning_rate": 6.105146893847061e-05, + "loss": 1.242, + "num_input_tokens_seen": 773857280, + "step": 11810, + "train_runtime": 180342.6868, + "train_tokens_per_second": 4291.038 + }, + { + "epoch": 3.1212201241251813, + "grad_norm": 0.515625, + "learning_rate": 6.070187999174523e-05, + "loss": 1.2258, + "num_input_tokens_seen": 774512640, + "step": 11820, + "train_runtime": 180546.2469, + "train_tokens_per_second": 4289.83 + }, + { + "epoch": 3.1238610854350983, + "grad_norm": 0.458984375, + "learning_rate": 6.0353156461026375e-05, + "loss": 1.2256, + "num_input_tokens_seen": 775168000, + "step": 11830, + "train_runtime": 180747.8866, + "train_tokens_per_second": 4288.67 + }, + { + "epoch": 3.1265020467450153, + "grad_norm": 0.46484375, + "learning_rate": 6.000529994057693e-05, + "loss": 1.1685, + "num_input_tokens_seen": 775823360, + "step": 11840, + "train_runtime": 180951.0813, + "train_tokens_per_second": 4287.476 + }, + { + "epoch": 3.129143008054932, + "grad_norm": 0.48828125, + "learning_rate": 5.9658312020695546e-05, + "loss": 1.2126, + "num_input_tokens_seen": 776478720, + "step": 11850, + "train_runtime": 181155.4595, + "train_tokens_per_second": 4286.256 + }, + { + "epoch": 3.131783969364849, + "grad_norm": 0.490234375, + "learning_rate": 5.931219428771051e-05, + "loss": 1.1811, + "num_input_tokens_seen": 777134080, + "step": 11860, + "train_runtime": 181359.9392, + "train_tokens_per_second": 4285.037 + }, + { + "epoch": 3.134424930674766, + "grad_norm": 0.451171875, + "learning_rate": 5.8966948323971174e-05, + "loss": 1.1746, + "num_input_tokens_seen": 777789440, + "step": 11870, + "train_runtime": 181564.7329, + "train_tokens_per_second": 4283.813 + }, + { + "epoch": 3.1370658919846823, + "grad_norm": 0.5078125, + "learning_rate": 5.8622575707842044e-05, + "loss": 1.1834, + "num_input_tokens_seen": 778444800, + "step": 11880, + "train_runtime": 181768.6004, + "train_tokens_per_second": 4282.614 + }, + { + "epoch": 3.1397068532945993, + "grad_norm": 0.478515625, + "learning_rate": 5.8279078013694614e-05, + "loss": 1.1937, + "num_input_tokens_seen": 779100160, + "step": 11890, + "train_runtime": 181971.0491, + "train_tokens_per_second": 4281.451 + }, + { + "epoch": 3.142347814604516, + "grad_norm": 0.46875, + "learning_rate": 5.793645681190041e-05, + "loss": 1.152, + "num_input_tokens_seen": 779755520, + "step": 11900, + "train_runtime": 182172.5015, + "train_tokens_per_second": 4280.314 + }, + { + "epoch": 3.144988775914433, + "grad_norm": 0.466796875, + "learning_rate": 5.759471366882421e-05, + "loss": 1.1854, + "num_input_tokens_seen": 780410880, + "step": 11910, + "train_runtime": 182375.3746, + "train_tokens_per_second": 4279.146 + }, + { + "epoch": 3.14762973722435, + "grad_norm": 0.4453125, + "learning_rate": 5.72538501468163e-05, + "loss": 1.2418, + "num_input_tokens_seen": 781066240, + "step": 11920, + "train_runtime": 182577.484, + "train_tokens_per_second": 4277.999 + }, + { + "epoch": 3.1502706985342663, + "grad_norm": 0.52734375, + "learning_rate": 5.6913867804205654e-05, + "loss": 1.2272, + "num_input_tokens_seen": 781721600, + "step": 11930, + "train_runtime": 182777.1398, + "train_tokens_per_second": 4276.911 + }, + { + "epoch": 3.1529116598441833, + "grad_norm": 0.44921875, + "learning_rate": 5.65747681952927e-05, + "loss": 1.2006, + "num_input_tokens_seen": 782376960, + "step": 11940, + "train_runtime": 182981.0444, + "train_tokens_per_second": 4275.727 + }, + { + "epoch": 3.1555526211541003, + "grad_norm": 0.478515625, + "learning_rate": 5.623655287034255e-05, + "loss": 1.2469, + "num_input_tokens_seen": 783032320, + "step": 11950, + "train_runtime": 183183.1595, + "train_tokens_per_second": 4274.587 + }, + { + "epoch": 3.158193582464017, + "grad_norm": 0.51171875, + "learning_rate": 5.5899223375577124e-05, + "loss": 1.1927, + "num_input_tokens_seen": 783687680, + "step": 11960, + "train_runtime": 183384.8108, + "train_tokens_per_second": 4273.46 + }, + { + "epoch": 3.160834543773934, + "grad_norm": 0.46875, + "learning_rate": 5.5562781253169045e-05, + "loss": 1.2217, + "num_input_tokens_seen": 784343040, + "step": 11970, + "train_runtime": 183588.1507, + "train_tokens_per_second": 4272.297 + }, + { + "epoch": 3.1634755050838503, + "grad_norm": 0.439453125, + "learning_rate": 5.5227228041233925e-05, + "loss": 1.2154, + "num_input_tokens_seen": 784998400, + "step": 11980, + "train_runtime": 183791.6363, + "train_tokens_per_second": 4271.132 + }, + { + "epoch": 3.1661164663937673, + "grad_norm": 0.458984375, + "learning_rate": 5.489256527382344e-05, + "loss": 1.2052, + "num_input_tokens_seen": 785653760, + "step": 11990, + "train_runtime": 183993.7186, + "train_tokens_per_second": 4270.003 + }, + { + "epoch": 3.1687574277036843, + "grad_norm": 0.494140625, + "learning_rate": 5.45587944809188e-05, + "loss": 1.2221, + "num_input_tokens_seen": 786309120, + "step": 12000, + "train_runtime": 184196.8061, + "train_tokens_per_second": 4268.853 + }, + { + "epoch": 3.171398389013601, + "grad_norm": 0.447265625, + "learning_rate": 5.422591718842276e-05, + "loss": 1.1988, + "num_input_tokens_seen": 786964480, + "step": 12010, + "train_runtime": 184403.0738, + "train_tokens_per_second": 4267.632 + }, + { + "epoch": 3.174039350323518, + "grad_norm": 0.48046875, + "learning_rate": 5.3893934918153807e-05, + "loss": 1.208, + "num_input_tokens_seen": 787619840, + "step": 12020, + "train_runtime": 184604.1963, + "train_tokens_per_second": 4266.533 + }, + { + "epoch": 3.176680311633435, + "grad_norm": 0.470703125, + "learning_rate": 5.356284918783841e-05, + "loss": 1.175, + "num_input_tokens_seen": 788275200, + "step": 12030, + "train_runtime": 184803.1735, + "train_tokens_per_second": 4265.485 + }, + { + "epoch": 3.1793212729433513, + "grad_norm": 0.46484375, + "learning_rate": 5.3232661511104284e-05, + "loss": 1.2011, + "num_input_tokens_seen": 788930560, + "step": 12040, + "train_runtime": 185004.3045, + "train_tokens_per_second": 4264.39 + }, + { + "epoch": 3.1819622342532683, + "grad_norm": 0.48046875, + "learning_rate": 5.2903373397473475e-05, + "loss": 1.2469, + "num_input_tokens_seen": 789585920, + "step": 12050, + "train_runtime": 185207.5872, + "train_tokens_per_second": 4263.248 + }, + { + "epoch": 3.184603195563185, + "grad_norm": 0.470703125, + "learning_rate": 5.2574986352355744e-05, + "loss": 1.2128, + "num_input_tokens_seen": 790241280, + "step": 12060, + "train_runtime": 185410.2119, + "train_tokens_per_second": 4262.124 + }, + { + "epoch": 3.187244156873102, + "grad_norm": 0.50390625, + "learning_rate": 5.224750187704119e-05, + "loss": 1.2556, + "num_input_tokens_seen": 790896640, + "step": 12070, + "train_runtime": 185612.4609, + "train_tokens_per_second": 4261.01 + }, + { + "epoch": 3.189885118183019, + "grad_norm": 0.474609375, + "learning_rate": 5.1920921468693596e-05, + "loss": 1.1623, + "num_input_tokens_seen": 791552000, + "step": 12080, + "train_runtime": 185816.1209, + "train_tokens_per_second": 4259.867 + }, + { + "epoch": 3.1925260794929353, + "grad_norm": 0.474609375, + "learning_rate": 5.15952466203439e-05, + "loss": 1.1934, + "num_input_tokens_seen": 792207360, + "step": 12090, + "train_runtime": 186019.6856, + "train_tokens_per_second": 4258.729 + }, + { + "epoch": 3.1951670408028523, + "grad_norm": 0.462890625, + "learning_rate": 5.1270478820882624e-05, + "loss": 1.2061, + "num_input_tokens_seen": 792862720, + "step": 12100, + "train_runtime": 186222.1638, + "train_tokens_per_second": 4257.617 + }, + { + "epoch": 3.197808002112769, + "grad_norm": 0.486328125, + "learning_rate": 5.0946619555054087e-05, + "loss": 1.2467, + "num_input_tokens_seen": 793518080, + "step": 12110, + "train_runtime": 186424.8323, + "train_tokens_per_second": 4256.504 + }, + { + "epoch": 3.200448963422686, + "grad_norm": 0.490234375, + "learning_rate": 5.062367030344847e-05, + "loss": 1.2435, + "num_input_tokens_seen": 794173440, + "step": 12120, + "train_runtime": 186626.2296, + "train_tokens_per_second": 4255.422 + }, + { + "epoch": 3.203089924732603, + "grad_norm": 0.49609375, + "learning_rate": 5.0301632542496116e-05, + "loss": 1.157, + "num_input_tokens_seen": 794828800, + "step": 12130, + "train_runtime": 186828.4998, + "train_tokens_per_second": 4254.323 + }, + { + "epoch": 3.2057308860425193, + "grad_norm": 0.478515625, + "learning_rate": 4.998050774446003e-05, + "loss": 1.2036, + "num_input_tokens_seen": 795484160, + "step": 12140, + "train_runtime": 187032.7024, + "train_tokens_per_second": 4253.182 + }, + { + "epoch": 3.2083718473524363, + "grad_norm": 0.490234375, + "learning_rate": 4.9660297377429467e-05, + "loss": 1.1836, + "num_input_tokens_seen": 796139520, + "step": 12150, + "train_runtime": 187236.0655, + "train_tokens_per_second": 4252.063 + }, + { + "epoch": 3.2110128086623533, + "grad_norm": 0.54296875, + "learning_rate": 4.9341002905313266e-05, + "loss": 1.237, + "num_input_tokens_seen": 796794880, + "step": 12160, + "train_runtime": 187440.5407, + "train_tokens_per_second": 4250.921 + }, + { + "epoch": 3.21365376997227, + "grad_norm": 0.490234375, + "learning_rate": 4.902262578783298e-05, + "loss": 1.1889, + "num_input_tokens_seen": 797450240, + "step": 12170, + "train_runtime": 187637.9299, + "train_tokens_per_second": 4249.942 + }, + { + "epoch": 3.216294731282187, + "grad_norm": 0.48828125, + "learning_rate": 4.870516748051623e-05, + "loss": 1.2064, + "num_input_tokens_seen": 798105600, + "step": 12180, + "train_runtime": 187836.5977, + "train_tokens_per_second": 4248.936 + }, + { + "epoch": 3.2189356925921033, + "grad_norm": 0.451171875, + "learning_rate": 4.8388629434690165e-05, + "loss": 1.1945, + "num_input_tokens_seen": 798760960, + "step": 12190, + "train_runtime": 188035.2611, + "train_tokens_per_second": 4247.932 + }, + { + "epoch": 3.2215766539020203, + "grad_norm": 0.51953125, + "learning_rate": 4.807301309747491e-05, + "loss": 1.1948, + "num_input_tokens_seen": 799416320, + "step": 12200, + "train_runtime": 188234.517, + "train_tokens_per_second": 4246.917 + }, + { + "epoch": 3.2242176152119373, + "grad_norm": 0.4921875, + "learning_rate": 4.77583199117764e-05, + "loss": 1.2201, + "num_input_tokens_seen": 800071680, + "step": 12210, + "train_runtime": 188434.3452, + "train_tokens_per_second": 4245.891 + }, + { + "epoch": 3.226858576521854, + "grad_norm": 0.46484375, + "learning_rate": 4.7444551316280695e-05, + "loss": 1.2017, + "num_input_tokens_seen": 800727040, + "step": 12220, + "train_runtime": 188636.5671, + "train_tokens_per_second": 4244.813 + }, + { + "epoch": 3.229499537831771, + "grad_norm": 0.47265625, + "learning_rate": 4.7131708745446534e-05, + "loss": 1.1923, + "num_input_tokens_seen": 801382400, + "step": 12230, + "train_runtime": 188837.9683, + "train_tokens_per_second": 4243.757 + }, + { + "epoch": 3.2321404991416878, + "grad_norm": 0.447265625, + "learning_rate": 4.6819793629499256e-05, + "loss": 1.1951, + "num_input_tokens_seen": 802037760, + "step": 12240, + "train_runtime": 189039.7027, + "train_tokens_per_second": 4242.695 + }, + { + "epoch": 3.2347814604516043, + "grad_norm": 0.48046875, + "learning_rate": 4.65088073944242e-05, + "loss": 1.1761, + "num_input_tokens_seen": 802693120, + "step": 12250, + "train_runtime": 189240.384, + "train_tokens_per_second": 4241.659 + }, + { + "epoch": 3.2374224217615213, + "grad_norm": 0.490234375, + "learning_rate": 4.619875146195995e-05, + "loss": 1.1761, + "num_input_tokens_seen": 803348480, + "step": 12260, + "train_runtime": 189439.481, + "train_tokens_per_second": 4240.66 + }, + { + "epoch": 3.240063383071438, + "grad_norm": 0.48046875, + "learning_rate": 4.5889627249592335e-05, + "loss": 1.2408, + "num_input_tokens_seen": 804003840, + "step": 12270, + "train_runtime": 189638.4453, + "train_tokens_per_second": 4239.667 + }, + { + "epoch": 3.242704344381355, + "grad_norm": 0.462890625, + "learning_rate": 4.5581436170547355e-05, + "loss": 1.2056, + "num_input_tokens_seen": 804659200, + "step": 12280, + "train_runtime": 189837.7164, + "train_tokens_per_second": 4238.669 + }, + { + "epoch": 3.2453453056912718, + "grad_norm": 0.4765625, + "learning_rate": 4.527417963378508e-05, + "loss": 1.2204, + "num_input_tokens_seen": 805314560, + "step": 12290, + "train_runtime": 190038.789, + "train_tokens_per_second": 4237.633 + }, + { + "epoch": 3.2479862670011883, + "grad_norm": 0.458984375, + "learning_rate": 4.4967859043993056e-05, + "loss": 1.1911, + "num_input_tokens_seen": 805969920, + "step": 12300, + "train_runtime": 190240.2099, + "train_tokens_per_second": 4236.591 + }, + { + "epoch": 3.2506272283111053, + "grad_norm": 0.51953125, + "learning_rate": 4.4662475801580224e-05, + "loss": 1.2564, + "num_input_tokens_seen": 806625280, + "step": 12310, + "train_runtime": 190442.6886, + "train_tokens_per_second": 4235.528 + }, + { + "epoch": 3.2532681896210223, + "grad_norm": 0.462890625, + "learning_rate": 4.435803130266977e-05, + "loss": 1.216, + "num_input_tokens_seen": 807280640, + "step": 12320, + "train_runtime": 190642.2658, + "train_tokens_per_second": 4234.531 + }, + { + "epoch": 3.255909150930939, + "grad_norm": 0.4765625, + "learning_rate": 4.40545269390937e-05, + "loss": 1.199, + "num_input_tokens_seen": 807936000, + "step": 12330, + "train_runtime": 190843.6231, + "train_tokens_per_second": 4233.497 + }, + { + "epoch": 3.258550112240856, + "grad_norm": 0.4765625, + "learning_rate": 4.375196409838575e-05, + "loss": 1.192, + "num_input_tokens_seen": 808591360, + "step": 12340, + "train_runtime": 191046.6976, + "train_tokens_per_second": 4232.428 + }, + { + "epoch": 3.2611910735507723, + "grad_norm": 0.47265625, + "learning_rate": 4.345034416377519e-05, + "loss": 1.2505, + "num_input_tokens_seen": 809246720, + "step": 12350, + "train_runtime": 191249.443, + "train_tokens_per_second": 4231.368 + }, + { + "epoch": 3.2638320348606893, + "grad_norm": 0.5078125, + "learning_rate": 4.314966851418098e-05, + "loss": 1.1946, + "num_input_tokens_seen": 809902080, + "step": 12360, + "train_runtime": 191453.933, + "train_tokens_per_second": 4230.271 + }, + { + "epoch": 3.2664729961706063, + "grad_norm": 0.498046875, + "learning_rate": 4.284993852420455e-05, + "loss": 1.2053, + "num_input_tokens_seen": 810557440, + "step": 12370, + "train_runtime": 191654.5173, + "train_tokens_per_second": 4229.263 + }, + { + "epoch": 3.269113957480523, + "grad_norm": 0.478515625, + "learning_rate": 4.25511555641245e-05, + "loss": 1.2578, + "num_input_tokens_seen": 811212800, + "step": 12380, + "train_runtime": 191851.853, + "train_tokens_per_second": 4228.329 + }, + { + "epoch": 3.27175491879044, + "grad_norm": 0.51953125, + "learning_rate": 4.225332099988971e-05, + "loss": 1.2183, + "num_input_tokens_seen": 811868160, + "step": 12390, + "train_runtime": 192052.9899, + "train_tokens_per_second": 4227.313 + }, + { + "epoch": 3.2743958801003563, + "grad_norm": 0.5, + "learning_rate": 4.1956436193113235e-05, + "loss": 1.16, + "num_input_tokens_seen": 812523520, + "step": 12400, + "train_runtime": 192252.7378, + "train_tokens_per_second": 4226.33 + }, + { + "epoch": 3.2770368414102733, + "grad_norm": 0.46484375, + "learning_rate": 4.166050250106609e-05, + "loss": 1.2374, + "num_input_tokens_seen": 813178880, + "step": 12410, + "train_runtime": 192451.3782, + "train_tokens_per_second": 4225.373 + }, + { + "epoch": 3.2796778027201903, + "grad_norm": 0.48046875, + "learning_rate": 4.1365521276671284e-05, + "loss": 1.2071, + "num_input_tokens_seen": 813834240, + "step": 12420, + "train_runtime": 192649.8482, + "train_tokens_per_second": 4224.422 + }, + { + "epoch": 3.282318764030107, + "grad_norm": 0.51953125, + "learning_rate": 4.1071493868497205e-05, + "loss": 1.1917, + "num_input_tokens_seen": 814489600, + "step": 12430, + "train_runtime": 192850.3521, + "train_tokens_per_second": 4223.428 + }, + { + "epoch": 3.284959725340024, + "grad_norm": 0.453125, + "learning_rate": 4.077842162075174e-05, + "loss": 1.2022, + "num_input_tokens_seen": 815144960, + "step": 12440, + "train_runtime": 193052.5646, + "train_tokens_per_second": 4222.399 + }, + { + "epoch": 3.2876006866499408, + "grad_norm": 0.515625, + "learning_rate": 4.048630587327603e-05, + "loss": 1.2061, + "num_input_tokens_seen": 815800320, + "step": 12450, + "train_runtime": 193253.9883, + "train_tokens_per_second": 4221.389 + }, + { + "epoch": 3.2902416479598573, + "grad_norm": 0.458984375, + "learning_rate": 4.0195147961538364e-05, + "loss": 1.2372, + "num_input_tokens_seen": 816455680, + "step": 12460, + "train_runtime": 193452.2257, + "train_tokens_per_second": 4220.451 + }, + { + "epoch": 3.2928826092697743, + "grad_norm": 0.466796875, + "learning_rate": 3.990494921662832e-05, + "loss": 1.1781, + "num_input_tokens_seen": 817111040, + "step": 12470, + "train_runtime": 193650.1728, + "train_tokens_per_second": 4219.521 + }, + { + "epoch": 3.2955235705796913, + "grad_norm": 0.462890625, + "learning_rate": 3.961571096524996e-05, + "loss": 1.2021, + "num_input_tokens_seen": 817766400, + "step": 12480, + "train_runtime": 193850.9784, + "train_tokens_per_second": 4218.531 + }, + { + "epoch": 3.298164531889608, + "grad_norm": 0.5, + "learning_rate": 3.932743452971674e-05, + "loss": 1.162, + "num_input_tokens_seen": 818421760, + "step": 12490, + "train_runtime": 194051.3656, + "train_tokens_per_second": 4217.552 + }, + { + "epoch": 3.3008054931995248, + "grad_norm": 0.482421875, + "learning_rate": 3.904012122794473e-05, + "loss": 1.2186, + "num_input_tokens_seen": 819077120, + "step": 12500, + "train_runtime": 194250.6449, + "train_tokens_per_second": 4216.599 + }, + { + "epoch": 3.3034464545094413, + "grad_norm": 0.4609375, + "learning_rate": 3.875377237344677e-05, + "loss": 1.2087, + "num_input_tokens_seen": 819732480, + "step": 12510, + "train_runtime": 194460.5412, + "train_tokens_per_second": 4215.418 + }, + { + "epoch": 3.3060874158193583, + "grad_norm": 0.48828125, + "learning_rate": 3.846838927532678e-05, + "loss": 1.1453, + "num_input_tokens_seen": 820387840, + "step": 12520, + "train_runtime": 194659.8251, + "train_tokens_per_second": 4214.469 + }, + { + "epoch": 3.308728377129275, + "grad_norm": 0.494140625, + "learning_rate": 3.81839732382733e-05, + "loss": 1.2011, + "num_input_tokens_seen": 821043200, + "step": 12530, + "train_runtime": 194859.1382, + "train_tokens_per_second": 4213.522 + }, + { + "epoch": 3.311369338439192, + "grad_norm": 0.455078125, + "learning_rate": 3.79005255625538e-05, + "loss": 1.1627, + "num_input_tokens_seen": 821698560, + "step": 12540, + "train_runtime": 195058.5844, + "train_tokens_per_second": 4212.573 + }, + { + "epoch": 3.3140102997491088, + "grad_norm": 0.46484375, + "learning_rate": 3.7618047544008686e-05, + "loss": 1.1898, + "num_input_tokens_seen": 822353920, + "step": 12550, + "train_runtime": 195256.4063, + "train_tokens_per_second": 4211.662 + }, + { + "epoch": 3.3166512610590253, + "grad_norm": 0.49609375, + "learning_rate": 3.73365404740455e-05, + "loss": 1.2063, + "num_input_tokens_seen": 823009280, + "step": 12560, + "train_runtime": 195456.346, + "train_tokens_per_second": 4210.706 + }, + { + "epoch": 3.3192922223689423, + "grad_norm": 0.451171875, + "learning_rate": 3.70560056396326e-05, + "loss": 1.245, + "num_input_tokens_seen": 823664640, + "step": 12570, + "train_runtime": 195655.7369, + "train_tokens_per_second": 4209.765 + }, + { + "epoch": 3.3219331836788593, + "grad_norm": 0.466796875, + "learning_rate": 3.677644432329391e-05, + "loss": 1.226, + "num_input_tokens_seen": 824320000, + "step": 12580, + "train_runtime": 195853.919, + "train_tokens_per_second": 4208.851 + }, + { + "epoch": 3.324574144988776, + "grad_norm": 0.5, + "learning_rate": 3.649785780310247e-05, + "loss": 1.2331, + "num_input_tokens_seen": 824975360, + "step": 12590, + "train_runtime": 196052.8684, + "train_tokens_per_second": 4207.923 + }, + { + "epoch": 3.3272151062986928, + "grad_norm": 0.484375, + "learning_rate": 3.62202473526749e-05, + "loss": 1.1965, + "num_input_tokens_seen": 825630720, + "step": 12600, + "train_runtime": 196252.9747, + "train_tokens_per_second": 4206.972 + }, + { + "epoch": 3.3298560676086097, + "grad_norm": 0.474609375, + "learning_rate": 3.5943614241165544e-05, + "loss": 1.2003, + "num_input_tokens_seen": 826286080, + "step": 12610, + "train_runtime": 196452.4579, + "train_tokens_per_second": 4206.036 + }, + { + "epoch": 3.3324970289185263, + "grad_norm": 0.48828125, + "learning_rate": 3.5667959733260565e-05, + "loss": 1.2092, + "num_input_tokens_seen": 826941440, + "step": 12620, + "train_runtime": 196651.6064, + "train_tokens_per_second": 4205.109 + }, + { + "epoch": 3.3351379902284433, + "grad_norm": 0.4765625, + "learning_rate": 3.539328508917239e-05, + "loss": 1.208, + "num_input_tokens_seen": 827596800, + "step": 12630, + "train_runtime": 196851.5946, + "train_tokens_per_second": 4204.166 + }, + { + "epoch": 3.33777895153836, + "grad_norm": 0.478515625, + "learning_rate": 3.511959156463362e-05, + "loss": 1.2347, + "num_input_tokens_seen": 828252160, + "step": 12640, + "train_runtime": 197049.8646, + "train_tokens_per_second": 4203.262 + }, + { + "epoch": 3.3404199128482768, + "grad_norm": 0.494140625, + "learning_rate": 3.484688041089157e-05, + "loss": 1.2062, + "num_input_tokens_seen": 828907520, + "step": 12650, + "train_runtime": 197250.6507, + "train_tokens_per_second": 4202.306 + }, + { + "epoch": 3.3430608741581938, + "grad_norm": 0.484375, + "learning_rate": 3.4575152874702284e-05, + "loss": 1.2187, + "num_input_tokens_seen": 829562880, + "step": 12660, + "train_runtime": 197450.127, + "train_tokens_per_second": 4201.379 + }, + { + "epoch": 3.3457018354681103, + "grad_norm": 0.49609375, + "learning_rate": 3.4304410198325335e-05, + "loss": 1.2662, + "num_input_tokens_seen": 830218240, + "step": 12670, + "train_runtime": 197649.6302, + "train_tokens_per_second": 4200.454 + }, + { + "epoch": 3.3483427967780273, + "grad_norm": 0.53125, + "learning_rate": 3.403465361951732e-05, + "loss": 1.2491, + "num_input_tokens_seen": 830873600, + "step": 12680, + "train_runtime": 197848.3821, + "train_tokens_per_second": 4199.547 + }, + { + "epoch": 3.350983758087944, + "grad_norm": 0.462890625, + "learning_rate": 3.3765884371527114e-05, + "loss": 1.2065, + "num_input_tokens_seen": 831528960, + "step": 12690, + "train_runtime": 198048.1863, + "train_tokens_per_second": 4198.619 + }, + { + "epoch": 3.3536247193978608, + "grad_norm": 0.50390625, + "learning_rate": 3.349810368308962e-05, + "loss": 1.1948, + "num_input_tokens_seen": 832184320, + "step": 12700, + "train_runtime": 198248.7732, + "train_tokens_per_second": 4197.677 + }, + { + "epoch": 3.3562656807077778, + "grad_norm": 0.484375, + "learning_rate": 3.323131277842023e-05, + "loss": 1.2187, + "num_input_tokens_seen": 832839680, + "step": 12710, + "train_runtime": 198450.305, + "train_tokens_per_second": 4196.717 + }, + { + "epoch": 3.3589066420176943, + "grad_norm": 0.48828125, + "learning_rate": 3.296551287720964e-05, + "loss": 1.2602, + "num_input_tokens_seen": 833495040, + "step": 12720, + "train_runtime": 198649.2565, + "train_tokens_per_second": 4195.813 + }, + { + "epoch": 3.3615476033276113, + "grad_norm": 0.46875, + "learning_rate": 3.270070519461754e-05, + "loss": 1.1583, + "num_input_tokens_seen": 834150400, + "step": 12730, + "train_runtime": 198848.4163, + "train_tokens_per_second": 4194.906 + }, + { + "epoch": 3.3641885646375282, + "grad_norm": 0.47265625, + "learning_rate": 3.2436890941267924e-05, + "loss": 1.1927, + "num_input_tokens_seen": 834805760, + "step": 12740, + "train_runtime": 199048.4254, + "train_tokens_per_second": 4193.983 + }, + { + "epoch": 3.366829525947445, + "grad_norm": 0.46484375, + "learning_rate": 3.217407132324279e-05, + "loss": 1.1686, + "num_input_tokens_seen": 835461120, + "step": 12750, + "train_runtime": 199249.215, + "train_tokens_per_second": 4193.046 + }, + { + "epoch": 3.3694704872573618, + "grad_norm": 0.470703125, + "learning_rate": 3.191224754207714e-05, + "loss": 1.2457, + "num_input_tokens_seen": 836116480, + "step": 12760, + "train_runtime": 199450.6316, + "train_tokens_per_second": 4192.097 + }, + { + "epoch": 3.3721114485672787, + "grad_norm": 0.484375, + "learning_rate": 3.165142079475314e-05, + "loss": 1.2059, + "num_input_tokens_seen": 836771840, + "step": 12770, + "train_runtime": 199649.8184, + "train_tokens_per_second": 4191.198 + }, + { + "epoch": 3.3747524098771953, + "grad_norm": 0.46875, + "learning_rate": 3.1391592273695005e-05, + "loss": 1.2168, + "num_input_tokens_seen": 837427200, + "step": 12780, + "train_runtime": 199848.8563, + "train_tokens_per_second": 4190.303 + }, + { + "epoch": 3.3773933711871122, + "grad_norm": 0.474609375, + "learning_rate": 3.113276316676322e-05, + "loss": 1.271, + "num_input_tokens_seen": 838082560, + "step": 12790, + "train_runtime": 200048.4541, + "train_tokens_per_second": 4189.398 + }, + { + "epoch": 3.380034332497029, + "grad_norm": 0.51953125, + "learning_rate": 3.087493465724922e-05, + "loss": 1.1732, + "num_input_tokens_seen": 838737920, + "step": 12800, + "train_runtime": 200248.2489, + "train_tokens_per_second": 4188.491 + }, + { + "epoch": 3.3826752938069458, + "grad_norm": 0.474609375, + "learning_rate": 3.061810792387007e-05, + "loss": 1.1944, + "num_input_tokens_seen": 839393280, + "step": 12810, + "train_runtime": 200450.0953, + "train_tokens_per_second": 4187.542 + }, + { + "epoch": 3.3853162551168623, + "grad_norm": 0.48046875, + "learning_rate": 3.036228414076292e-05, + "loss": 1.1843, + "num_input_tokens_seen": 840048640, + "step": 12820, + "train_runtime": 200649.5406, + "train_tokens_per_second": 4186.646 + }, + { + "epoch": 3.3879572164267793, + "grad_norm": 0.486328125, + "learning_rate": 3.0107464477479944e-05, + "loss": 1.2258, + "num_input_tokens_seen": 840704000, + "step": 12830, + "train_runtime": 200849.6394, + "train_tokens_per_second": 4185.738 + }, + { + "epoch": 3.3905981777366963, + "grad_norm": 0.486328125, + "learning_rate": 2.985365009898236e-05, + "loss": 1.2559, + "num_input_tokens_seen": 841359360, + "step": 12840, + "train_runtime": 201048.8138, + "train_tokens_per_second": 4184.851 + }, + { + "epoch": 3.393239139046613, + "grad_norm": 0.47265625, + "learning_rate": 2.9600842165635993e-05, + "loss": 1.1931, + "num_input_tokens_seen": 842014720, + "step": 12850, + "train_runtime": 201247.0508, + "train_tokens_per_second": 4183.985 + }, + { + "epoch": 3.3958801003565298, + "grad_norm": 0.48828125, + "learning_rate": 2.9349041833205136e-05, + "loss": 1.1732, + "num_input_tokens_seen": 842670080, + "step": 12860, + "train_runtime": 201444.9723, + "train_tokens_per_second": 4183.128 + }, + { + "epoch": 3.3985210616664467, + "grad_norm": 0.470703125, + "learning_rate": 2.9098250252847736e-05, + "loss": 1.1667, + "num_input_tokens_seen": 843325440, + "step": 12870, + "train_runtime": 201644.4698, + "train_tokens_per_second": 4182.239 + }, + { + "epoch": 3.4011620229763633, + "grad_norm": 0.49609375, + "learning_rate": 2.8848468571110148e-05, + "loss": 1.2126, + "num_input_tokens_seen": 843980800, + "step": 12880, + "train_runtime": 201843.5639, + "train_tokens_per_second": 4181.361 + }, + { + "epoch": 3.4038029842862803, + "grad_norm": 0.490234375, + "learning_rate": 2.859969792992159e-05, + "loss": 1.2616, + "num_input_tokens_seen": 844636160, + "step": 12890, + "train_runtime": 202042.1976, + "train_tokens_per_second": 4180.494 + }, + { + "epoch": 3.4064439455961972, + "grad_norm": 0.46875, + "learning_rate": 2.8351939466589148e-05, + "loss": 1.1774, + "num_input_tokens_seen": 845291520, + "step": 12900, + "train_runtime": 202241.7893, + "train_tokens_per_second": 4179.609 + }, + { + "epoch": 3.4090849069061138, + "grad_norm": 0.51953125, + "learning_rate": 2.8105194313792547e-05, + "loss": 1.1754, + "num_input_tokens_seen": 845946880, + "step": 12910, + "train_runtime": 202442.689, + "train_tokens_per_second": 4178.698 + }, + { + "epoch": 3.4117258682160307, + "grad_norm": 0.48828125, + "learning_rate": 2.7859463599578914e-05, + "loss": 1.2352, + "num_input_tokens_seen": 846602240, + "step": 12920, + "train_runtime": 202643.8979, + "train_tokens_per_second": 4177.783 + }, + { + "epoch": 3.4143668295259473, + "grad_norm": 0.5, + "learning_rate": 2.7614748447357645e-05, + "loss": 1.2543, + "num_input_tokens_seen": 847257600, + "step": 12930, + "train_runtime": 202843.0942, + "train_tokens_per_second": 4176.911 + }, + { + "epoch": 3.4170077908358643, + "grad_norm": 0.5, + "learning_rate": 2.737104997589543e-05, + "loss": 1.1776, + "num_input_tokens_seen": 847912960, + "step": 12940, + "train_runtime": 203043.176, + "train_tokens_per_second": 4176.023 + }, + { + "epoch": 3.4196487521457812, + "grad_norm": 0.546875, + "learning_rate": 2.7128369299310813e-05, + "loss": 1.2102, + "num_input_tokens_seen": 848568320, + "step": 12950, + "train_runtime": 203242.0299, + "train_tokens_per_second": 4175.162 + }, + { + "epoch": 3.4222897134556978, + "grad_norm": 0.48046875, + "learning_rate": 2.6886707527069315e-05, + "loss": 1.2282, + "num_input_tokens_seen": 849223680, + "step": 12960, + "train_runtime": 203440.4961, + "train_tokens_per_second": 4174.31 + }, + { + "epoch": 3.4249306747656147, + "grad_norm": 0.490234375, + "learning_rate": 2.6646065763978405e-05, + "loss": 1.2195, + "num_input_tokens_seen": 849879040, + "step": 12970, + "train_runtime": 203639.9581, + "train_tokens_per_second": 4173.439 + }, + { + "epoch": 3.4275716360755313, + "grad_norm": 0.4765625, + "learning_rate": 2.6406445110182196e-05, + "loss": 1.2001, + "num_input_tokens_seen": 850534400, + "step": 12980, + "train_runtime": 203840.5648, + "train_tokens_per_second": 4172.547 + }, + { + "epoch": 3.4302125973854483, + "grad_norm": 0.4921875, + "learning_rate": 2.6167846661156845e-05, + "loss": 1.2176, + "num_input_tokens_seen": 851189760, + "step": 12990, + "train_runtime": 204039.4385, + "train_tokens_per_second": 4171.692 + }, + { + "epoch": 3.4328535586953652, + "grad_norm": 0.5078125, + "learning_rate": 2.593027150770508e-05, + "loss": 1.2267, + "num_input_tokens_seen": 851845120, + "step": 13000, + "train_runtime": 204238.7053, + "train_tokens_per_second": 4170.831 + }, + { + "epoch": 3.4354945200052818, + "grad_norm": 0.46484375, + "learning_rate": 2.569372073595147e-05, + "loss": 1.224, + "num_input_tokens_seen": 852500480, + "step": 13010, + "train_runtime": 204447.0868, + "train_tokens_per_second": 4169.785 + }, + { + "epoch": 3.4381354813151987, + "grad_norm": 0.5078125, + "learning_rate": 2.545819542733735e-05, + "loss": 1.2173, + "num_input_tokens_seen": 853155840, + "step": 13020, + "train_runtime": 204644.5237, + "train_tokens_per_second": 4168.965 + }, + { + "epoch": 3.4407764426251157, + "grad_norm": 0.484375, + "learning_rate": 2.522369665861618e-05, + "loss": 1.1678, + "num_input_tokens_seen": 853811200, + "step": 13030, + "train_runtime": 204845.3204, + "train_tokens_per_second": 4168.078 + }, + { + "epoch": 3.4434174039350323, + "grad_norm": 0.5, + "learning_rate": 2.4990225501847985e-05, + "loss": 1.2247, + "num_input_tokens_seen": 854466560, + "step": 13040, + "train_runtime": 205045.968, + "train_tokens_per_second": 4167.195 + }, + { + "epoch": 3.4460583652449492, + "grad_norm": 0.48046875, + "learning_rate": 2.4757783024395242e-05, + "loss": 1.1834, + "num_input_tokens_seen": 855121920, + "step": 13050, + "train_runtime": 205244.1594, + "train_tokens_per_second": 4166.364 + }, + { + "epoch": 3.448699326554866, + "grad_norm": 0.4765625, + "learning_rate": 2.452637028891733e-05, + "loss": 1.1835, + "num_input_tokens_seen": 855777280, + "step": 13060, + "train_runtime": 205443.9479, + "train_tokens_per_second": 4165.503 + }, + { + "epoch": 3.4513402878647828, + "grad_norm": 0.4765625, + "learning_rate": 2.4295988353365994e-05, + "loss": 1.1729, + "num_input_tokens_seen": 856432640, + "step": 13070, + "train_runtime": 205644.5652, + "train_tokens_per_second": 4164.626 + }, + { + "epoch": 3.4539812491746997, + "grad_norm": 0.5546875, + "learning_rate": 2.4066638270980712e-05, + "loss": 1.2091, + "num_input_tokens_seen": 857088000, + "step": 13080, + "train_runtime": 205842.0822, + "train_tokens_per_second": 4163.813 + }, + { + "epoch": 3.4566222104846163, + "grad_norm": 0.48046875, + "learning_rate": 2.3838321090283168e-05, + "loss": 1.2263, + "num_input_tokens_seen": 857743360, + "step": 13090, + "train_runtime": 206041.7868, + "train_tokens_per_second": 4162.958 + }, + { + "epoch": 3.4592631717945332, + "grad_norm": 0.494140625, + "learning_rate": 2.3611037855073346e-05, + "loss": 1.2314, + "num_input_tokens_seen": 858398720, + "step": 13100, + "train_runtime": 206242.0558, + "train_tokens_per_second": 4162.094 + }, + { + "epoch": 3.4619041331044498, + "grad_norm": 0.46484375, + "learning_rate": 2.338478960442414e-05, + "loss": 1.1976, + "num_input_tokens_seen": 859054080, + "step": 13110, + "train_runtime": 206440.9257, + "train_tokens_per_second": 4161.259 + }, + { + "epoch": 3.4645450944143668, + "grad_norm": 0.498046875, + "learning_rate": 2.3159577372676765e-05, + "loss": 1.211, + "num_input_tokens_seen": 859709440, + "step": 13120, + "train_runtime": 206641.8579, + "train_tokens_per_second": 4160.384 + }, + { + "epoch": 3.4671860557242837, + "grad_norm": 0.4765625, + "learning_rate": 2.2935402189436126e-05, + "loss": 1.2251, + "num_input_tokens_seen": 860364800, + "step": 13130, + "train_runtime": 206843.7926, + "train_tokens_per_second": 4159.491 + }, + { + "epoch": 3.4698270170342003, + "grad_norm": 0.490234375, + "learning_rate": 2.2712265079566084e-05, + "loss": 1.1866, + "num_input_tokens_seen": 861020160, + "step": 13140, + "train_runtime": 207043.9691, + "train_tokens_per_second": 4158.634 + }, + { + "epoch": 3.4724679783441172, + "grad_norm": 0.50390625, + "learning_rate": 2.249016706318463e-05, + "loss": 1.1895, + "num_input_tokens_seen": 861675520, + "step": 13150, + "train_runtime": 207242.3309, + "train_tokens_per_second": 4157.816 + }, + { + "epoch": 3.4751089396540342, + "grad_norm": 0.48046875, + "learning_rate": 2.2269109155659385e-05, + "loss": 1.2169, + "num_input_tokens_seen": 862330880, + "step": 13160, + "train_runtime": 207442.3254, + "train_tokens_per_second": 4156.967 + }, + { + "epoch": 3.4777499009639508, + "grad_norm": 0.470703125, + "learning_rate": 2.2049092367602857e-05, + "loss": 1.1999, + "num_input_tokens_seen": 862986240, + "step": 13170, + "train_runtime": 207643.5395, + "train_tokens_per_second": 4156.095 + }, + { + "epoch": 3.4803908622738677, + "grad_norm": 0.48828125, + "learning_rate": 2.183011770486784e-05, + "loss": 1.1669, + "num_input_tokens_seen": 863641600, + "step": 13180, + "train_runtime": 207840.7608, + "train_tokens_per_second": 4155.304 + }, + { + "epoch": 3.4830318235837847, + "grad_norm": 0.48828125, + "learning_rate": 2.161218616854302e-05, + "loss": 1.1572, + "num_input_tokens_seen": 864296960, + "step": 13190, + "train_runtime": 208041.9979, + "train_tokens_per_second": 4154.435 + }, + { + "epoch": 3.4856727848937012, + "grad_norm": 0.466796875, + "learning_rate": 2.139529875494789e-05, + "loss": 1.2137, + "num_input_tokens_seen": 864952320, + "step": 13200, + "train_runtime": 208241.7729, + "train_tokens_per_second": 4153.597 + }, + { + "epoch": 3.4883137462036182, + "grad_norm": 0.4921875, + "learning_rate": 2.117945645562877e-05, + "loss": 1.1582, + "num_input_tokens_seen": 865607680, + "step": 13210, + "train_runtime": 208443.0606, + "train_tokens_per_second": 4152.73 + }, + { + "epoch": 3.4909547075135348, + "grad_norm": 0.5625, + "learning_rate": 2.096466025735397e-05, + "loss": 1.2028, + "num_input_tokens_seen": 866263040, + "step": 13220, + "train_runtime": 208642.2744, + "train_tokens_per_second": 4151.906 + }, + { + "epoch": 3.4935956688234517, + "grad_norm": 0.4765625, + "learning_rate": 2.0750911142109223e-05, + "loss": 1.196, + "num_input_tokens_seen": 866918400, + "step": 13230, + "train_runtime": 208842.4503, + "train_tokens_per_second": 4151.064 + }, + { + "epoch": 3.4962366301333687, + "grad_norm": 0.48828125, + "learning_rate": 2.0538210087093473e-05, + "loss": 1.1948, + "num_input_tokens_seen": 867573760, + "step": 13240, + "train_runtime": 209041.9236, + "train_tokens_per_second": 4150.238 + }, + { + "epoch": 3.4988775914432853, + "grad_norm": 0.484375, + "learning_rate": 2.032655806471409e-05, + "loss": 1.2176, + "num_input_tokens_seen": 868229120, + "step": 13250, + "train_runtime": 209240.9022, + "train_tokens_per_second": 4149.424 + }, + { + "epoch": 3.5015185527532022, + "grad_norm": 0.478515625, + "learning_rate": 2.0115956042582652e-05, + "loss": 1.2142, + "num_input_tokens_seen": 868884480, + "step": 13260, + "train_runtime": 209441.5027, + "train_tokens_per_second": 4148.578 + }, + { + "epoch": 3.5041595140631188, + "grad_norm": 0.62109375, + "learning_rate": 1.9906404983510373e-05, + "loss": 1.1974, + "num_input_tokens_seen": 869539840, + "step": 13270, + "train_runtime": 209641.0139, + "train_tokens_per_second": 4147.756 + }, + { + "epoch": 3.5068004753730357, + "grad_norm": 0.453125, + "learning_rate": 1.9697905845503877e-05, + "loss": 1.1643, + "num_input_tokens_seen": 870195200, + "step": 13280, + "train_runtime": 209842.1067, + "train_tokens_per_second": 4146.905 + }, + { + "epoch": 3.5094414366829527, + "grad_norm": 0.50390625, + "learning_rate": 1.9490459581760572e-05, + "loss": 1.2387, + "num_input_tokens_seen": 870850560, + "step": 13290, + "train_runtime": 210042.5034, + "train_tokens_per_second": 4146.068 + }, + { + "epoch": 3.5120823979928693, + "grad_norm": 0.453125, + "learning_rate": 1.928406714066458e-05, + "loss": 1.1701, + "num_input_tokens_seen": 871505920, + "step": 13300, + "train_runtime": 210243.3045, + "train_tokens_per_second": 4145.226 + }, + { + "epoch": 3.5147233593027862, + "grad_norm": 0.478515625, + "learning_rate": 1.9078729465782124e-05, + "loss": 1.1755, + "num_input_tokens_seen": 872161280, + "step": 13310, + "train_runtime": 210444.1165, + "train_tokens_per_second": 4144.384 + }, + { + "epoch": 3.517364320612703, + "grad_norm": 0.490234375, + "learning_rate": 1.88744474958574e-05, + "loss": 1.1613, + "num_input_tokens_seen": 872816640, + "step": 13320, + "train_runtime": 210644.2092, + "train_tokens_per_second": 4143.559 + }, + { + "epoch": 3.5200052819226197, + "grad_norm": 0.48046875, + "learning_rate": 1.8671222164808293e-05, + "loss": 1.2078, + "num_input_tokens_seen": 873472000, + "step": 13330, + "train_runtime": 210844.3767, + "train_tokens_per_second": 4142.733 + }, + { + "epoch": 3.5226462432325367, + "grad_norm": 0.486328125, + "learning_rate": 1.8469054401721862e-05, + "loss": 1.204, + "num_input_tokens_seen": 874127360, + "step": 13340, + "train_runtime": 211043.7854, + "train_tokens_per_second": 4141.924 + }, + { + "epoch": 3.5252872045424537, + "grad_norm": 0.466796875, + "learning_rate": 1.826794513085045e-05, + "loss": 1.186, + "num_input_tokens_seen": 874782720, + "step": 13350, + "train_runtime": 211244.06, + "train_tokens_per_second": 4141.1 + }, + { + "epoch": 3.5279281658523702, + "grad_norm": 0.51953125, + "learning_rate": 1.8067895271607237e-05, + "loss": 1.2133, + "num_input_tokens_seen": 875438080, + "step": 13360, + "train_runtime": 211443.4451, + "train_tokens_per_second": 4140.294 + }, + { + "epoch": 3.530569127162287, + "grad_norm": 0.486328125, + "learning_rate": 1.7868905738562008e-05, + "loss": 1.1679, + "num_input_tokens_seen": 876093440, + "step": 13370, + "train_runtime": 211643.4306, + "train_tokens_per_second": 4139.479 + }, + { + "epoch": 3.5332100884722037, + "grad_norm": 0.486328125, + "learning_rate": 1.7670977441437086e-05, + "loss": 1.1936, + "num_input_tokens_seen": 876748800, + "step": 13380, + "train_runtime": 211842.8057, + "train_tokens_per_second": 4138.676 + }, + { + "epoch": 3.5358510497821207, + "grad_norm": 0.51171875, + "learning_rate": 1.747411128510315e-05, + "loss": 1.2133, + "num_input_tokens_seen": 877404160, + "step": 13390, + "train_runtime": 212042.2914, + "train_tokens_per_second": 4137.873 + }, + { + "epoch": 3.5384920110920373, + "grad_norm": 0.47265625, + "learning_rate": 1.7278308169575097e-05, + "loss": 1.2262, + "num_input_tokens_seen": 878059520, + "step": 13400, + "train_runtime": 212242.0738, + "train_tokens_per_second": 4137.066 + }, + { + "epoch": 3.5411329724019542, + "grad_norm": 0.478515625, + "learning_rate": 1.7083568990007903e-05, + "loss": 1.2296, + "num_input_tokens_seen": 878714880, + "step": 13410, + "train_runtime": 212442.3742, + "train_tokens_per_second": 4136.251 + }, + { + "epoch": 3.543773933711871, + "grad_norm": 0.474609375, + "learning_rate": 1.6889894636692436e-05, + "loss": 1.2331, + "num_input_tokens_seen": 879370240, + "step": 13420, + "train_runtime": 212641.3248, + "train_tokens_per_second": 4135.463 + }, + { + "epoch": 3.5464148950217877, + "grad_norm": 0.48828125, + "learning_rate": 1.66972859950516e-05, + "loss": 1.2616, + "num_input_tokens_seen": 880025600, + "step": 13430, + "train_runtime": 212839.0824, + "train_tokens_per_second": 4134.699 + }, + { + "epoch": 3.5490558563317047, + "grad_norm": 0.462890625, + "learning_rate": 1.6505743945636254e-05, + "loss": 1.1874, + "num_input_tokens_seen": 880680960, + "step": 13440, + "train_runtime": 213038.6519, + "train_tokens_per_second": 4133.902 + }, + { + "epoch": 3.5516968176416217, + "grad_norm": 0.46484375, + "learning_rate": 1.631526936412081e-05, + "loss": 1.2255, + "num_input_tokens_seen": 881336320, + "step": 13450, + "train_runtime": 213238.8451, + "train_tokens_per_second": 4133.095 + }, + { + "epoch": 3.5543377789515382, + "grad_norm": 0.474609375, + "learning_rate": 1.6125863121299878e-05, + "loss": 1.2345, + "num_input_tokens_seen": 881991680, + "step": 13460, + "train_runtime": 213436.9801, + "train_tokens_per_second": 4132.328 + }, + { + "epoch": 3.556978740261455, + "grad_norm": 0.46484375, + "learning_rate": 1.5937526083083685e-05, + "loss": 1.1988, + "num_input_tokens_seen": 882647040, + "step": 13470, + "train_runtime": 213636.4625, + "train_tokens_per_second": 4131.537 + }, + { + "epoch": 3.559619701571372, + "grad_norm": 0.470703125, + "learning_rate": 1.5750259110494464e-05, + "loss": 1.1976, + "num_input_tokens_seen": 883302400, + "step": 13480, + "train_runtime": 213835.9394, + "train_tokens_per_second": 4130.748 + }, + { + "epoch": 3.5622606628812887, + "grad_norm": 0.48828125, + "learning_rate": 1.5564063059662376e-05, + "loss": 1.1442, + "num_input_tokens_seen": 883957760, + "step": 13490, + "train_runtime": 214033.1806, + "train_tokens_per_second": 4130.003 + }, + { + "epoch": 3.5649016241912057, + "grad_norm": 0.490234375, + "learning_rate": 1.5378938781821727e-05, + "loss": 1.2078, + "num_input_tokens_seen": 884613120, + "step": 13500, + "train_runtime": 214232.5465, + "train_tokens_per_second": 4129.219 + }, + { + "epoch": 3.5675425855011222, + "grad_norm": 0.474609375, + "learning_rate": 1.5194887123306911e-05, + "loss": 1.2222, + "num_input_tokens_seen": 885268480, + "step": 13510, + "train_runtime": 214439.8773, + "train_tokens_per_second": 4128.283 + }, + { + "epoch": 3.570183546811039, + "grad_norm": 0.462890625, + "learning_rate": 1.5011908925548656e-05, + "loss": 1.2242, + "num_input_tokens_seen": 885923840, + "step": 13520, + "train_runtime": 214640.0038, + "train_tokens_per_second": 4127.487 + }, + { + "epoch": 3.5728245081209558, + "grad_norm": 0.462890625, + "learning_rate": 1.4830005025070065e-05, + "loss": 1.2099, + "num_input_tokens_seen": 886579200, + "step": 13530, + "train_runtime": 214841.1359, + "train_tokens_per_second": 4126.673 + }, + { + "epoch": 3.5754654694308727, + "grad_norm": 0.462890625, + "learning_rate": 1.4649176253482944e-05, + "loss": 1.2126, + "num_input_tokens_seen": 887234560, + "step": 13540, + "train_runtime": 215040.6109, + "train_tokens_per_second": 4125.893 + }, + { + "epoch": 3.5781064307407897, + "grad_norm": 0.494140625, + "learning_rate": 1.4469423437483974e-05, + "loss": 1.226, + "num_input_tokens_seen": 887889920, + "step": 13550, + "train_runtime": 215240.4623, + "train_tokens_per_second": 4125.107 + }, + { + "epoch": 3.5807473920507062, + "grad_norm": 0.48046875, + "learning_rate": 1.429074739885064e-05, + "loss": 1.2343, + "num_input_tokens_seen": 888545280, + "step": 13560, + "train_runtime": 215437.507, + "train_tokens_per_second": 4124.376 + }, + { + "epoch": 3.5833883533606232, + "grad_norm": 0.466796875, + "learning_rate": 1.4113148954438048e-05, + "loss": 1.2473, + "num_input_tokens_seen": 889200640, + "step": 13570, + "train_runtime": 215636.2558, + "train_tokens_per_second": 4123.614 + }, + { + "epoch": 3.58602931467054, + "grad_norm": 0.44921875, + "learning_rate": 1.3936628916174588e-05, + "loss": 1.2331, + "num_input_tokens_seen": 889856000, + "step": 13580, + "train_runtime": 215837.184, + "train_tokens_per_second": 4122.811 + }, + { + "epoch": 3.5886702759804567, + "grad_norm": 0.462890625, + "learning_rate": 1.3761188091058614e-05, + "loss": 1.1826, + "num_input_tokens_seen": 890511360, + "step": 13590, + "train_runtime": 216037.0606, + "train_tokens_per_second": 4122.031 + }, + { + "epoch": 3.5913112372903737, + "grad_norm": 0.47265625, + "learning_rate": 1.3586827281154624e-05, + "loss": 1.2182, + "num_input_tokens_seen": 891166720, + "step": 13600, + "train_runtime": 216236.5386, + "train_tokens_per_second": 4121.259 + }, + { + "epoch": 3.5939521986002907, + "grad_norm": 0.5, + "learning_rate": 1.3413547283589566e-05, + "loss": 1.21, + "num_input_tokens_seen": 891822080, + "step": 13610, + "train_runtime": 216435.3099, + "train_tokens_per_second": 4120.502 + }, + { + "epoch": 3.5965931599102072, + "grad_norm": 0.48046875, + "learning_rate": 1.324134889054926e-05, + "loss": 1.2063, + "num_input_tokens_seen": 892477440, + "step": 13620, + "train_runtime": 216633.5407, + "train_tokens_per_second": 4119.757 + }, + { + "epoch": 3.599234121220124, + "grad_norm": 0.447265625, + "learning_rate": 1.3070232889274697e-05, + "loss": 1.167, + "num_input_tokens_seen": 893132800, + "step": 13630, + "train_runtime": 216831.2699, + "train_tokens_per_second": 4119.022 + }, + { + "epoch": 3.601875082530041, + "grad_norm": 0.494140625, + "learning_rate": 1.2900200062058554e-05, + "loss": 1.2579, + "num_input_tokens_seen": 893788160, + "step": 13640, + "train_runtime": 217031.5927, + "train_tokens_per_second": 4118.24 + }, + { + "epoch": 3.6045160438399577, + "grad_norm": 0.48046875, + "learning_rate": 1.2731251186241466e-05, + "loss": 1.1622, + "num_input_tokens_seen": 894443520, + "step": 13650, + "train_runtime": 217231.4601, + "train_tokens_per_second": 4117.468 + }, + { + "epoch": 3.6071570051498747, + "grad_norm": 0.470703125, + "learning_rate": 1.2563387034208673e-05, + "loss": 1.1766, + "num_input_tokens_seen": 895098880, + "step": 13660, + "train_runtime": 217431.878, + "train_tokens_per_second": 4116.687 + }, + { + "epoch": 3.6097979664597912, + "grad_norm": 0.466796875, + "learning_rate": 1.239660837338627e-05, + "loss": 1.2258, + "num_input_tokens_seen": 895754240, + "step": 13670, + "train_runtime": 217632.1859, + "train_tokens_per_second": 4115.909 + }, + { + "epoch": 3.612438927769708, + "grad_norm": 0.4765625, + "learning_rate": 1.2230915966237821e-05, + "loss": 1.2101, + "num_input_tokens_seen": 896409600, + "step": 13680, + "train_runtime": 217832.5619, + "train_tokens_per_second": 4115.131 + }, + { + "epoch": 3.6150798890796247, + "grad_norm": 0.453125, + "learning_rate": 1.2066310570260975e-05, + "loss": 1.1702, + "num_input_tokens_seen": 897064960, + "step": 13690, + "train_runtime": 218032.3563, + "train_tokens_per_second": 4114.366 + }, + { + "epoch": 3.6177208503895417, + "grad_norm": 0.458984375, + "learning_rate": 1.1902792937983603e-05, + "loss": 1.1979, + "num_input_tokens_seen": 897720320, + "step": 13700, + "train_runtime": 218233.3223, + "train_tokens_per_second": 4113.58 + }, + { + "epoch": 3.6203618116994587, + "grad_norm": 0.453125, + "learning_rate": 1.1740363816960974e-05, + "loss": 1.1481, + "num_input_tokens_seen": 898375680, + "step": 13710, + "train_runtime": 218431.4103, + "train_tokens_per_second": 4112.85 + }, + { + "epoch": 3.6230027730093752, + "grad_norm": 0.5, + "learning_rate": 1.1579023949771755e-05, + "loss": 1.2161, + "num_input_tokens_seen": 899031040, + "step": 13720, + "train_runtime": 218629.5375, + "train_tokens_per_second": 4112.121 + }, + { + "epoch": 3.625643734319292, + "grad_norm": 0.4765625, + "learning_rate": 1.1418774074014954e-05, + "loss": 1.2183, + "num_input_tokens_seen": 899686400, + "step": 13730, + "train_runtime": 218828.3947, + "train_tokens_per_second": 4111.379 + }, + { + "epoch": 3.628284695629209, + "grad_norm": 0.484375, + "learning_rate": 1.1259614922306483e-05, + "loss": 1.219, + "num_input_tokens_seen": 900341760, + "step": 13740, + "train_runtime": 219025.7375, + "train_tokens_per_second": 4110.666 + }, + { + "epoch": 3.6309256569391257, + "grad_norm": 0.44921875, + "learning_rate": 1.110154722227566e-05, + "loss": 1.1889, + "num_input_tokens_seen": 900997120, + "step": 13750, + "train_runtime": 219224.4013, + "train_tokens_per_second": 4109.931 + }, + { + "epoch": 3.6335666182490427, + "grad_norm": 0.48828125, + "learning_rate": 1.0944571696562156e-05, + "loss": 1.2194, + "num_input_tokens_seen": 901652480, + "step": 13760, + "train_runtime": 219422.6586, + "train_tokens_per_second": 4109.204 + }, + { + "epoch": 3.6362075795589597, + "grad_norm": 0.46484375, + "learning_rate": 1.078868906281244e-05, + "loss": 1.1816, + "num_input_tokens_seen": 902307840, + "step": 13770, + "train_runtime": 219620.9956, + "train_tokens_per_second": 4108.477 + }, + { + "epoch": 3.638848540868876, + "grad_norm": 0.482421875, + "learning_rate": 1.0633900033676646e-05, + "loss": 1.2197, + "num_input_tokens_seen": 902963200, + "step": 13780, + "train_runtime": 219820.9998, + "train_tokens_per_second": 4107.72 + }, + { + "epoch": 3.641489502178793, + "grad_norm": 0.48046875, + "learning_rate": 1.0480205316805214e-05, + "loss": 1.2363, + "num_input_tokens_seen": 903618560, + "step": 13790, + "train_runtime": 220021.0937, + "train_tokens_per_second": 4106.963 + }, + { + "epoch": 3.6441304634887097, + "grad_norm": 0.48828125, + "learning_rate": 1.0327605614845803e-05, + "loss": 1.2226, + "num_input_tokens_seen": 904273920, + "step": 13800, + "train_runtime": 220221.0633, + "train_tokens_per_second": 4106.21 + }, + { + "epoch": 3.6467714247986267, + "grad_norm": 0.51953125, + "learning_rate": 1.0176101625439777e-05, + "loss": 1.2025, + "num_input_tokens_seen": 904929280, + "step": 13810, + "train_runtime": 220418.9013, + "train_tokens_per_second": 4105.498 + }, + { + "epoch": 3.6494123861085432, + "grad_norm": 0.46484375, + "learning_rate": 1.0025694041219501e-05, + "loss": 1.2232, + "num_input_tokens_seen": 905584640, + "step": 13820, + "train_runtime": 220617.1805, + "train_tokens_per_second": 4104.778 + }, + { + "epoch": 3.65205334741846, + "grad_norm": 0.48046875, + "learning_rate": 9.876383549804662e-06, + "loss": 1.2338, + "num_input_tokens_seen": 906240000, + "step": 13830, + "train_runtime": 220816.7237, + "train_tokens_per_second": 4104.037 + }, + { + "epoch": 3.654694308728377, + "grad_norm": 0.478515625, + "learning_rate": 9.72817083379951e-06, + "loss": 1.1752, + "num_input_tokens_seen": 906895360, + "step": 13840, + "train_runtime": 221016.4261, + "train_tokens_per_second": 4103.294 + }, + { + "epoch": 3.6573352700382937, + "grad_norm": 0.466796875, + "learning_rate": 9.581056570789449e-06, + "loss": 1.2227, + "num_input_tokens_seen": 907550720, + "step": 13850, + "train_runtime": 221217.2326, + "train_tokens_per_second": 4102.532 + }, + { + "epoch": 3.6599762313482107, + "grad_norm": 0.4921875, + "learning_rate": 9.435041433338204e-06, + "loss": 1.2033, + "num_input_tokens_seen": 908206080, + "step": 13860, + "train_runtime": 221415.9105, + "train_tokens_per_second": 4101.81 + }, + { + "epoch": 3.6626171926581277, + "grad_norm": 0.5, + "learning_rate": 9.290126088984523e-06, + "loss": 1.1826, + "num_input_tokens_seen": 908861440, + "step": 13870, + "train_runtime": 221615.2648, + "train_tokens_per_second": 4101.078 + }, + { + "epoch": 3.665258153968044, + "grad_norm": 0.4921875, + "learning_rate": 9.146311200239316e-06, + "loss": 1.2178, + "num_input_tokens_seen": 909516800, + "step": 13880, + "train_runtime": 221815.2625, + "train_tokens_per_second": 4100.335 + }, + { + "epoch": 3.667899115277961, + "grad_norm": 0.490234375, + "learning_rate": 9.003597424582427e-06, + "loss": 1.2385, + "num_input_tokens_seen": 910172160, + "step": 13890, + "train_runtime": 222014.2274, + "train_tokens_per_second": 4099.612 + }, + { + "epoch": 3.670540076587878, + "grad_norm": 0.470703125, + "learning_rate": 8.861985414459733e-06, + "loss": 1.1916, + "num_input_tokens_seen": 910827520, + "step": 13900, + "train_runtime": 222212.6787, + "train_tokens_per_second": 4098.9 + }, + { + "epoch": 3.6731810378977947, + "grad_norm": 0.48828125, + "learning_rate": 8.721475817280306e-06, + "loss": 1.1773, + "num_input_tokens_seen": 911482880, + "step": 13910, + "train_runtime": 222409.6941, + "train_tokens_per_second": 4098.216 + }, + { + "epoch": 3.6758219992077117, + "grad_norm": 0.470703125, + "learning_rate": 8.582069275413107e-06, + "loss": 1.178, + "num_input_tokens_seen": 912138240, + "step": 13920, + "train_runtime": 222608.8701, + "train_tokens_per_second": 4097.493 + }, + { + "epoch": 3.6784629605176287, + "grad_norm": 0.51171875, + "learning_rate": 8.443766426184384e-06, + "loss": 1.2265, + "num_input_tokens_seen": 912793600, + "step": 13930, + "train_runtime": 222809.1608, + "train_tokens_per_second": 4096.751 + }, + { + "epoch": 3.681103921827545, + "grad_norm": 0.484375, + "learning_rate": 8.30656790187459e-06, + "loss": 1.2005, + "num_input_tokens_seen": 913448960, + "step": 13940, + "train_runtime": 223009.6557, + "train_tokens_per_second": 4096.006 + }, + { + "epoch": 3.683744883137462, + "grad_norm": 0.44921875, + "learning_rate": 8.170474329715489e-06, + "loss": 1.1958, + "num_input_tokens_seen": 914104320, + "step": 13950, + "train_runtime": 223208.3043, + "train_tokens_per_second": 4095.297 + }, + { + "epoch": 3.6863858444473787, + "grad_norm": 0.466796875, + "learning_rate": 8.03548633188736e-06, + "loss": 1.2196, + "num_input_tokens_seen": 914759680, + "step": 13960, + "train_runtime": 223405.5391, + "train_tokens_per_second": 4094.615 + }, + { + "epoch": 3.6890268057572957, + "grad_norm": 0.49609375, + "learning_rate": 7.901604525516137e-06, + "loss": 1.1927, + "num_input_tokens_seen": 915415040, + "step": 13970, + "train_runtime": 223604.7082, + "train_tokens_per_second": 4093.899 + }, + { + "epoch": 3.6916677670672122, + "grad_norm": 0.5, + "learning_rate": 7.768829522670523e-06, + "loss": 1.2416, + "num_input_tokens_seen": 916070400, + "step": 13980, + "train_runtime": 223803.4688, + "train_tokens_per_second": 4093.191 + }, + { + "epoch": 3.694308728377129, + "grad_norm": 0.46484375, + "learning_rate": 7.637161930359238e-06, + "loss": 1.1839, + "num_input_tokens_seen": 916725760, + "step": 13990, + "train_runtime": 224004.0002, + "train_tokens_per_second": 4092.453 + }, + { + "epoch": 3.696949689687046, + "grad_norm": 0.49609375, + "learning_rate": 7.506602350528302e-06, + "loss": 1.2306, + "num_input_tokens_seen": 917381120, + "step": 14000, + "train_runtime": 224203.0768, + "train_tokens_per_second": 4091.742 + }, + { + "epoch": 3.6995906509969627, + "grad_norm": 0.486328125, + "learning_rate": 7.377151380058095e-06, + "loss": 1.1985, + "num_input_tokens_seen": 918036480, + "step": 14010, + "train_runtime": 224415.321, + "train_tokens_per_second": 4090.792 + }, + { + "epoch": 3.7022316123068797, + "grad_norm": 0.4765625, + "learning_rate": 7.248809610760965e-06, + "loss": 1.2116, + "num_input_tokens_seen": 918691840, + "step": 14020, + "train_runtime": 224616.2308, + "train_tokens_per_second": 4090.051 + }, + { + "epoch": 3.7048725736167967, + "grad_norm": 0.486328125, + "learning_rate": 7.121577629378096e-06, + "loss": 1.1776, + "num_input_tokens_seen": 919347200, + "step": 14030, + "train_runtime": 224817.62, + "train_tokens_per_second": 4089.302 + }, + { + "epoch": 3.707513534926713, + "grad_norm": 0.474609375, + "learning_rate": 6.995456017577173e-06, + "loss": 1.1871, + "num_input_tokens_seen": 920002560, + "step": 14040, + "train_runtime": 225018.0887, + "train_tokens_per_second": 4088.572 + }, + { + "epoch": 3.71015449623663, + "grad_norm": 0.48828125, + "learning_rate": 6.870445351949611e-06, + "loss": 1.2087, + "num_input_tokens_seen": 920657920, + "step": 14050, + "train_runtime": 225218.7085, + "train_tokens_per_second": 4087.839 + }, + { + "epoch": 3.712795457546547, + "grad_norm": 0.482421875, + "learning_rate": 6.746546204007748e-06, + "loss": 1.1506, + "num_input_tokens_seen": 921313280, + "step": 14060, + "train_runtime": 225419.3873, + "train_tokens_per_second": 4087.108 + }, + { + "epoch": 3.7154364188564637, + "grad_norm": 0.49609375, + "learning_rate": 6.6237591401825945e-06, + "loss": 1.1702, + "num_input_tokens_seen": 921968640, + "step": 14070, + "train_runtime": 225621.2595, + "train_tokens_per_second": 4086.355 + }, + { + "epoch": 3.7180773801663807, + "grad_norm": 0.4765625, + "learning_rate": 6.502084721820872e-06, + "loss": 1.2142, + "num_input_tokens_seen": 922624000, + "step": 14080, + "train_runtime": 225824.7316, + "train_tokens_per_second": 4085.576 + }, + { + "epoch": 3.720718341476297, + "grad_norm": 0.494140625, + "learning_rate": 6.3815235051827015e-06, + "loss": 1.1986, + "num_input_tokens_seen": 923279360, + "step": 14090, + "train_runtime": 226027.682, + "train_tokens_per_second": 4084.807 + }, + { + "epoch": 3.723359302786214, + "grad_norm": 0.48046875, + "learning_rate": 6.262076041438913e-06, + "loss": 1.1982, + "num_input_tokens_seen": 923934720, + "step": 14100, + "train_runtime": 226231.419, + "train_tokens_per_second": 4084.025 + }, + { + "epoch": 3.7260002640961307, + "grad_norm": 0.462890625, + "learning_rate": 6.143742876668579e-06, + "loss": 1.2041, + "num_input_tokens_seen": 924590080, + "step": 14110, + "train_runtime": 226433.1677, + "train_tokens_per_second": 4083.28 + }, + { + "epoch": 3.7286412254060477, + "grad_norm": 0.46875, + "learning_rate": 6.026524551856622e-06, + "loss": 1.2097, + "num_input_tokens_seen": 925245440, + "step": 14120, + "train_runtime": 226634.7238, + "train_tokens_per_second": 4082.541 + }, + { + "epoch": 3.7312821867159647, + "grad_norm": 0.4765625, + "learning_rate": 5.910421602891153e-06, + "loss": 1.1972, + "num_input_tokens_seen": 925900800, + "step": 14130, + "train_runtime": 226835.6514, + "train_tokens_per_second": 4081.813 + }, + { + "epoch": 3.733923148025881, + "grad_norm": 0.498046875, + "learning_rate": 5.795434560561086e-06, + "loss": 1.1631, + "num_input_tokens_seen": 926556160, + "step": 14140, + "train_runtime": 227036.2349, + "train_tokens_per_second": 4081.094 + }, + { + "epoch": 3.736564109335798, + "grad_norm": 0.453125, + "learning_rate": 5.681563950553748e-06, + "loss": 1.1805, + "num_input_tokens_seen": 927211520, + "step": 14150, + "train_runtime": 227237.6319, + "train_tokens_per_second": 4080.361 + }, + { + "epoch": 3.739205070645715, + "grad_norm": 0.5078125, + "learning_rate": 5.5688102934525755e-06, + "loss": 1.1834, + "num_input_tokens_seen": 927866880, + "step": 14160, + "train_runtime": 227439.4682, + "train_tokens_per_second": 4079.621 + }, + { + "epoch": 3.7418460319556317, + "grad_norm": 0.490234375, + "learning_rate": 5.457174104734452e-06, + "loss": 1.2132, + "num_input_tokens_seen": 928522240, + "step": 14170, + "train_runtime": 227640.8196, + "train_tokens_per_second": 4078.892 + }, + { + "epoch": 3.7444869932655487, + "grad_norm": 0.478515625, + "learning_rate": 5.346655894767627e-06, + "loss": 1.2065, + "num_input_tokens_seen": 929177600, + "step": 14180, + "train_runtime": 227843.4445, + "train_tokens_per_second": 4078.141 + }, + { + "epoch": 3.7471279545754657, + "grad_norm": 0.478515625, + "learning_rate": 5.23725616880924e-06, + "loss": 1.2355, + "num_input_tokens_seen": 929832960, + "step": 14190, + "train_runtime": 228044.6956, + "train_tokens_per_second": 4077.415 + }, + { + "epoch": 3.749768915885382, + "grad_norm": 0.458984375, + "learning_rate": 5.128975427003052e-06, + "loss": 1.2436, + "num_input_tokens_seen": 930488320, + "step": 14200, + "train_runtime": 228245.4699, + "train_tokens_per_second": 4076.7 + }, + { + "epoch": 3.752409877195299, + "grad_norm": 0.466796875, + "learning_rate": 5.021814164377164e-06, + "loss": 1.2149, + "num_input_tokens_seen": 931143680, + "step": 14210, + "train_runtime": 228448.5168, + "train_tokens_per_second": 4075.945 + }, + { + "epoch": 3.755050838505216, + "grad_norm": 0.53515625, + "learning_rate": 4.9157728708417175e-06, + "loss": 1.187, + "num_input_tokens_seen": 931799040, + "step": 14220, + "train_runtime": 228650.9477, + "train_tokens_per_second": 4075.203 + }, + { + "epoch": 3.7576917998151327, + "grad_norm": 0.484375, + "learning_rate": 4.810852031186724e-06, + "loss": 1.1855, + "num_input_tokens_seen": 932454400, + "step": 14230, + "train_runtime": 228852.2679, + "train_tokens_per_second": 4074.482 + }, + { + "epoch": 3.7603327611250497, + "grad_norm": 0.515625, + "learning_rate": 4.7070521250797415e-06, + "loss": 1.2241, + "num_input_tokens_seen": 933109760, + "step": 14240, + "train_runtime": 229055.8793, + "train_tokens_per_second": 4073.721 + }, + { + "epoch": 3.762973722434966, + "grad_norm": 0.484375, + "learning_rate": 4.6043736270638405e-06, + "loss": 1.2204, + "num_input_tokens_seen": 933765120, + "step": 14250, + "train_runtime": 229257.8004, + "train_tokens_per_second": 4072.992 + }, + { + "epoch": 3.765614683744883, + "grad_norm": 0.486328125, + "learning_rate": 4.502817006555221e-06, + "loss": 1.2021, + "num_input_tokens_seen": 934420480, + "step": 14260, + "train_runtime": 229459.3204, + "train_tokens_per_second": 4072.271 + }, + { + "epoch": 3.7682556450547997, + "grad_norm": 0.494140625, + "learning_rate": 4.402382727841298e-06, + "loss": 1.2272, + "num_input_tokens_seen": 935075840, + "step": 14270, + "train_runtime": 229660.3344, + "train_tokens_per_second": 4071.56 + }, + { + "epoch": 3.7708966063647167, + "grad_norm": 0.4765625, + "learning_rate": 4.303071250078339e-06, + "loss": 1.2002, + "num_input_tokens_seen": 935731200, + "step": 14280, + "train_runtime": 229863.3416, + "train_tokens_per_second": 4070.815 + }, + { + "epoch": 3.7735375676746337, + "grad_norm": 0.462890625, + "learning_rate": 4.204883027289663e-06, + "loss": 1.2122, + "num_input_tokens_seen": 936386560, + "step": 14290, + "train_runtime": 230065.2321, + "train_tokens_per_second": 4070.092 + }, + { + "epoch": 3.77617852898455, + "grad_norm": 0.47265625, + "learning_rate": 4.107818508363226e-06, + "loss": 1.1655, + "num_input_tokens_seen": 937041920, + "step": 14300, + "train_runtime": 230267.3337, + "train_tokens_per_second": 4069.365 + }, + { + "epoch": 3.778819490294467, + "grad_norm": 0.482421875, + "learning_rate": 4.0118781370498406e-06, + "loss": 1.2199, + "num_input_tokens_seen": 937697280, + "step": 14310, + "train_runtime": 230468.4884, + "train_tokens_per_second": 4068.657 + }, + { + "epoch": 3.781460451604384, + "grad_norm": 0.466796875, + "learning_rate": 3.917062351961015e-06, + "loss": 1.2092, + "num_input_tokens_seen": 938352640, + "step": 14320, + "train_runtime": 230670.1688, + "train_tokens_per_second": 4067.941 + }, + { + "epoch": 3.7841014129143007, + "grad_norm": 0.51953125, + "learning_rate": 3.823371586566926e-06, + "loss": 1.2375, + "num_input_tokens_seen": 939008000, + "step": 14330, + "train_runtime": 230872.2466, + "train_tokens_per_second": 4067.219 + }, + { + "epoch": 3.7867423742242177, + "grad_norm": 0.4921875, + "learning_rate": 3.7308062691945864e-06, + "loss": 1.2465, + "num_input_tokens_seen": 939663360, + "step": 14340, + "train_runtime": 231072.487, + "train_tokens_per_second": 4066.531 + }, + { + "epoch": 3.7893833355341346, + "grad_norm": 0.494140625, + "learning_rate": 3.639366823025708e-06, + "loss": 1.2263, + "num_input_tokens_seen": 940318720, + "step": 14350, + "train_runtime": 231272.5489, + "train_tokens_per_second": 4065.847 + }, + { + "epoch": 3.792024296844051, + "grad_norm": 0.453125, + "learning_rate": 3.54905366609487e-06, + "loss": 1.1946, + "num_input_tokens_seen": 940974080, + "step": 14360, + "train_runtime": 231474.7511, + "train_tokens_per_second": 4065.126 + }, + { + "epoch": 3.794665258153968, + "grad_norm": 0.474609375, + "learning_rate": 3.459867211287576e-06, + "loss": 1.2057, + "num_input_tokens_seen": 941629440, + "step": 14370, + "train_runtime": 231676.2195, + "train_tokens_per_second": 4064.42 + }, + { + "epoch": 3.7973062194638847, + "grad_norm": 0.46875, + "learning_rate": 3.3718078663384223e-06, + "loss": 1.2121, + "num_input_tokens_seen": 942284800, + "step": 14380, + "train_runtime": 231877.9644, + "train_tokens_per_second": 4063.71 + }, + { + "epoch": 3.7999471807738017, + "grad_norm": 0.484375, + "learning_rate": 3.284876033829126e-06, + "loss": 1.1891, + "num_input_tokens_seen": 942940160, + "step": 14390, + "train_runtime": 232080.1367, + "train_tokens_per_second": 4062.994 + }, + { + "epoch": 3.802588142083718, + "grad_norm": 0.453125, + "learning_rate": 3.1990721111867514e-06, + "loss": 1.1746, + "num_input_tokens_seen": 943595520, + "step": 14400, + "train_runtime": 232285.0573, + "train_tokens_per_second": 4062.231 + }, + { + "epoch": 3.805229103393635, + "grad_norm": 0.47265625, + "learning_rate": 3.114396490681959e-06, + "loss": 1.1686, + "num_input_tokens_seen": 944250880, + "step": 14410, + "train_runtime": 232486.4408, + "train_tokens_per_second": 4061.531 + }, + { + "epoch": 3.807870064703552, + "grad_norm": 0.4609375, + "learning_rate": 3.0308495594270348e-06, + "loss": 1.175, + "num_input_tokens_seen": 944906240, + "step": 14420, + "train_runtime": 232689.4941, + "train_tokens_per_second": 4060.803 + }, + { + "epoch": 3.8105110260134687, + "grad_norm": 0.455078125, + "learning_rate": 2.948431699374282e-06, + "loss": 1.2303, + "num_input_tokens_seen": 945561600, + "step": 14430, + "train_runtime": 232892.6589, + "train_tokens_per_second": 4060.075 + }, + { + "epoch": 3.8131519873233857, + "grad_norm": 0.47265625, + "learning_rate": 2.8671432873142167e-06, + "loss": 1.2429, + "num_input_tokens_seen": 946216960, + "step": 14440, + "train_runtime": 233096.2378, + "train_tokens_per_second": 4059.34 + }, + { + "epoch": 3.8157929486333027, + "grad_norm": 0.462890625, + "learning_rate": 2.7869846948738453e-06, + "loss": 1.2247, + "num_input_tokens_seen": 946872320, + "step": 14450, + "train_runtime": 233299.6228, + "train_tokens_per_second": 4058.611 + }, + { + "epoch": 3.818433909943219, + "grad_norm": 0.482421875, + "learning_rate": 2.707956288514973e-06, + "loss": 1.2455, + "num_input_tokens_seen": 947527680, + "step": 14460, + "train_runtime": 233502.6646, + "train_tokens_per_second": 4057.888 + }, + { + "epoch": 3.821074871253136, + "grad_norm": 0.486328125, + "learning_rate": 2.6300584295324838e-06, + "loss": 1.1743, + "num_input_tokens_seen": 948183040, + "step": 14470, + "train_runtime": 233707.2456, + "train_tokens_per_second": 4057.14 + }, + { + "epoch": 3.823715832563053, + "grad_norm": 0.478515625, + "learning_rate": 2.5532914740527824e-06, + "loss": 1.1732, + "num_input_tokens_seen": 948838400, + "step": 14480, + "train_runtime": 233910.9349, + "train_tokens_per_second": 4056.409 + }, + { + "epoch": 3.8263567938729697, + "grad_norm": 0.46875, + "learning_rate": 2.477655773032078e-06, + "loss": 1.1558, + "num_input_tokens_seen": 949493760, + "step": 14490, + "train_runtime": 234114.5913, + "train_tokens_per_second": 4055.68 + }, + { + "epoch": 3.8289977551828867, + "grad_norm": 0.46875, + "learning_rate": 2.4031516722548275e-06, + "loss": 1.2516, + "num_input_tokens_seen": 950149120, + "step": 14500, + "train_runtime": 234316.7652, + "train_tokens_per_second": 4054.977 + }, + { + "epoch": 3.8316387164928036, + "grad_norm": 0.484375, + "learning_rate": 2.3297795123320974e-06, + "loss": 1.2092, + "num_input_tokens_seen": 950804480, + "step": 14510, + "train_runtime": 234525.197, + "train_tokens_per_second": 4054.168 + }, + { + "epoch": 3.83427967780272, + "grad_norm": 0.478515625, + "learning_rate": 2.2575396287001504e-06, + "loss": 1.2242, + "num_input_tokens_seen": 951459840, + "step": 14520, + "train_runtime": 234726.1867, + "train_tokens_per_second": 4053.488 + }, + { + "epoch": 3.836920639112637, + "grad_norm": 0.484375, + "learning_rate": 2.1864323516186945e-06, + "loss": 1.1876, + "num_input_tokens_seen": 952115200, + "step": 14530, + "train_runtime": 234926.1034, + "train_tokens_per_second": 4052.828 + }, + { + "epoch": 3.8395616004225537, + "grad_norm": 0.462890625, + "learning_rate": 2.1164580061695526e-06, + "loss": 1.2072, + "num_input_tokens_seen": 952770560, + "step": 14540, + "train_runtime": 235125.3789, + "train_tokens_per_second": 4052.181 + }, + { + "epoch": 3.8422025617324707, + "grad_norm": 0.44921875, + "learning_rate": 2.047616912255107e-06, + "loss": 1.2411, + "num_input_tokens_seen": 953425920, + "step": 14550, + "train_runtime": 235324.7506, + "train_tokens_per_second": 4051.533 + }, + { + "epoch": 3.844843523042387, + "grad_norm": 0.5078125, + "learning_rate": 1.9799093845968288e-06, + "loss": 1.2211, + "num_input_tokens_seen": 954081280, + "step": 14560, + "train_runtime": 235524.5496, + "train_tokens_per_second": 4050.878 + }, + { + "epoch": 3.847484484352304, + "grad_norm": 0.5390625, + "learning_rate": 1.9133357327338897e-06, + "loss": 1.2291, + "num_input_tokens_seen": 954736640, + "step": 14570, + "train_runtime": 235726.1599, + "train_tokens_per_second": 4050.194 + }, + { + "epoch": 3.850125445662221, + "grad_norm": 0.466796875, + "learning_rate": 1.8478962610216644e-06, + "loss": 1.1795, + "num_input_tokens_seen": 955392000, + "step": 14580, + "train_runtime": 235926.1679, + "train_tokens_per_second": 4049.538 + }, + { + "epoch": 3.8527664069721377, + "grad_norm": 0.48046875, + "learning_rate": 1.7835912686303967e-06, + "loss": 1.2069, + "num_input_tokens_seen": 956047360, + "step": 14590, + "train_runtime": 236128.0729, + "train_tokens_per_second": 4048.851 + }, + { + "epoch": 3.8554073682820547, + "grad_norm": 0.52734375, + "learning_rate": 1.720421049543841e-06, + "loss": 1.2321, + "num_input_tokens_seen": 956702720, + "step": 14600, + "train_runtime": 236329.5962, + "train_tokens_per_second": 4048.171 + }, + { + "epoch": 3.8580483295919716, + "grad_norm": 0.4765625, + "learning_rate": 1.6583858925578732e-06, + "loss": 1.1925, + "num_input_tokens_seen": 957358080, + "step": 14610, + "train_runtime": 236530.4196, + "train_tokens_per_second": 4047.505 + }, + { + "epoch": 3.860689290901888, + "grad_norm": 0.46875, + "learning_rate": 1.5974860812792146e-06, + "loss": 1.2331, + "num_input_tokens_seen": 958013440, + "step": 14620, + "train_runtime": 236731.028, + "train_tokens_per_second": 4046.844 + }, + { + "epoch": 3.863330252211805, + "grad_norm": 0.53125, + "learning_rate": 1.5377218941241277e-06, + "loss": 1.2274, + "num_input_tokens_seen": 958668800, + "step": 14630, + "train_runtime": 236932.0351, + "train_tokens_per_second": 4046.176 + }, + { + "epoch": 3.865971213521722, + "grad_norm": 0.46875, + "learning_rate": 1.4790936043170832e-06, + "loss": 1.1883, + "num_input_tokens_seen": 959324160, + "step": 14640, + "train_runtime": 237133.0352, + "train_tokens_per_second": 4045.51 + }, + { + "epoch": 3.8686121748316387, + "grad_norm": 0.4765625, + "learning_rate": 1.4216014798896227e-06, + "loss": 1.18, + "num_input_tokens_seen": 959979520, + "step": 14650, + "train_runtime": 237333.5434, + "train_tokens_per_second": 4044.854 + }, + { + "epoch": 3.8712531361415556, + "grad_norm": 0.486328125, + "learning_rate": 1.3652457836789977e-06, + "loss": 1.2159, + "num_input_tokens_seen": 960634880, + "step": 14660, + "train_runtime": 237534.255, + "train_tokens_per_second": 4044.195 + }, + { + "epoch": 3.873894097451472, + "grad_norm": 0.484375, + "learning_rate": 1.3100267733270887e-06, + "loss": 1.2189, + "num_input_tokens_seen": 961290240, + "step": 14670, + "train_runtime": 237736.0042, + "train_tokens_per_second": 4043.52 + }, + { + "epoch": 3.876535058761389, + "grad_norm": 0.470703125, + "learning_rate": 1.2559447012791824e-06, + "loss": 1.2065, + "num_input_tokens_seen": 961945600, + "step": 14680, + "train_runtime": 237936.4718, + "train_tokens_per_second": 4042.867 + }, + { + "epoch": 3.8791760200713057, + "grad_norm": 0.49609375, + "learning_rate": 1.2029998147827793e-06, + "loss": 1.2207, + "num_input_tokens_seen": 962600960, + "step": 14690, + "train_runtime": 238135.9488, + "train_tokens_per_second": 4042.233 + }, + { + "epoch": 3.8818169813812227, + "grad_norm": 0.451171875, + "learning_rate": 1.1511923558865657e-06, + "loss": 1.2172, + "num_input_tokens_seen": 963256320, + "step": 14700, + "train_runtime": 238334.3027, + "train_tokens_per_second": 4041.618 + }, + { + "epoch": 3.8844579426911396, + "grad_norm": 0.474609375, + "learning_rate": 1.10052256143911e-06, + "loss": 1.1744, + "num_input_tokens_seen": 963911680, + "step": 14710, + "train_runtime": 238533.0939, + "train_tokens_per_second": 4040.998 + }, + { + "epoch": 3.887098904001056, + "grad_norm": 0.478515625, + "learning_rate": 1.0509906630880583e-06, + "loss": 1.2104, + "num_input_tokens_seen": 964567040, + "step": 14720, + "train_runtime": 238732.2492, + "train_tokens_per_second": 4040.372 + }, + { + "epoch": 3.889739865310973, + "grad_norm": 0.48046875, + "learning_rate": 1.0025968872788282e-06, + "loss": 1.2209, + "num_input_tokens_seen": 965222400, + "step": 14730, + "train_runtime": 238930.6916, + "train_tokens_per_second": 4039.759 + }, + { + "epoch": 3.89238082662089, + "grad_norm": 0.462890625, + "learning_rate": 9.55341455253722e-07, + "loss": 1.1996, + "num_input_tokens_seen": 965877760, + "step": 14740, + "train_runtime": 239127.4331, + "train_tokens_per_second": 4039.176 + }, + { + "epoch": 3.8950217879308067, + "grad_norm": 0.443359375, + "learning_rate": 9.092245830508438e-07, + "loss": 1.1773, + "num_input_tokens_seen": 966533120, + "step": 14750, + "train_runtime": 239326.275, + "train_tokens_per_second": 4038.558 + }, + { + "epoch": 3.8976627492407236, + "grad_norm": 0.51171875, + "learning_rate": 8.642464815031004e-07, + "loss": 1.2295, + "num_input_tokens_seen": 967188480, + "step": 14760, + "train_runtime": 239521.3552, + "train_tokens_per_second": 4038.005 + }, + { + "epoch": 3.9003037105506406, + "grad_norm": 0.494140625, + "learning_rate": 8.204073562373404e-07, + "loss": 1.1858, + "num_input_tokens_seen": 967843840, + "step": 14770, + "train_runtime": 239721.737, + "train_tokens_per_second": 4037.364 + }, + { + "epoch": 3.902944671860557, + "grad_norm": 0.51953125, + "learning_rate": 7.777074076733004e-07, + "loss": 1.1794, + "num_input_tokens_seen": 968499200, + "step": 14780, + "train_runtime": 239920.1058, + "train_tokens_per_second": 4036.757 + }, + { + "epoch": 3.905585633170474, + "grad_norm": 0.4765625, + "learning_rate": 7.361468310227159e-07, + "loss": 1.2027, + "num_input_tokens_seen": 969154560, + "step": 14790, + "train_runtime": 240118.6572, + "train_tokens_per_second": 4036.149 + }, + { + "epoch": 3.908226594480391, + "grad_norm": 0.5078125, + "learning_rate": 6.957258162885171e-07, + "loss": 1.2326, + "num_input_tokens_seen": 969809920, + "step": 14800, + "train_runtime": 240318.3329, + "train_tokens_per_second": 4035.522 + }, + { + "epoch": 3.9108675557903076, + "grad_norm": 0.470703125, + "learning_rate": 6.564445482638015e-07, + "loss": 1.1767, + "num_input_tokens_seen": 970465280, + "step": 14810, + "train_runtime": 240516.6053, + "train_tokens_per_second": 4034.92 + }, + { + "epoch": 3.9135085171002246, + "grad_norm": 0.47265625, + "learning_rate": 6.183032065311123e-07, + "loss": 1.2003, + "num_input_tokens_seen": 971120640, + "step": 14820, + "train_runtime": 240716.4572, + "train_tokens_per_second": 4034.293 + }, + { + "epoch": 3.916149478410141, + "grad_norm": 0.4765625, + "learning_rate": 5.813019654615781e-07, + "loss": 1.2024, + "num_input_tokens_seen": 971776000, + "step": 14830, + "train_runtime": 240914.4151, + "train_tokens_per_second": 4033.698 + }, + { + "epoch": 3.918790439720058, + "grad_norm": 0.474609375, + "learning_rate": 5.454409942141636e-07, + "loss": 1.202, + "num_input_tokens_seen": 972431360, + "step": 14840, + "train_runtime": 241113.589, + "train_tokens_per_second": 4033.084 + }, + { + "epoch": 3.9214314010299747, + "grad_norm": 0.474609375, + "learning_rate": 5.107204567347812e-07, + "loss": 1.2099, + "num_input_tokens_seen": 973086720, + "step": 14850, + "train_runtime": 241313.8553, + "train_tokens_per_second": 4032.453 + }, + { + "epoch": 3.9240723623398917, + "grad_norm": 0.494140625, + "learning_rate": 4.771405117556526e-07, + "loss": 1.2373, + "num_input_tokens_seen": 973742080, + "step": 14860, + "train_runtime": 241512.7412, + "train_tokens_per_second": 4031.846 + }, + { + "epoch": 3.9267133236498086, + "grad_norm": 0.49609375, + "learning_rate": 4.447013127945043e-07, + "loss": 1.2142, + "num_input_tokens_seen": 974397440, + "step": 14870, + "train_runtime": 241712.1245, + "train_tokens_per_second": 4031.231 + }, + { + "epoch": 3.929354284959725, + "grad_norm": 0.50390625, + "learning_rate": 4.134030081539564e-07, + "loss": 1.2312, + "num_input_tokens_seen": 975052800, + "step": 14880, + "train_runtime": 241912.1289, + "train_tokens_per_second": 4030.607 + }, + { + "epoch": 3.931995246269642, + "grad_norm": 0.474609375, + "learning_rate": 3.832457409207457e-07, + "loss": 1.2093, + "num_input_tokens_seen": 975708160, + "step": 14890, + "train_runtime": 242113.1223, + "train_tokens_per_second": 4029.968 + }, + { + "epoch": 3.934636207579559, + "grad_norm": 0.482421875, + "learning_rate": 3.5422964896517087e-07, + "loss": 1.1968, + "num_input_tokens_seen": 976363520, + "step": 14900, + "train_runtime": 242311.4676, + "train_tokens_per_second": 4029.374 + }, + { + "epoch": 3.9372771688894757, + "grad_norm": 0.51171875, + "learning_rate": 3.2635486494031475e-07, + "loss": 1.193, + "num_input_tokens_seen": 977018880, + "step": 14910, + "train_runtime": 242511.3522, + "train_tokens_per_second": 4028.755 + }, + { + "epoch": 3.9399181301993926, + "grad_norm": 0.484375, + "learning_rate": 2.996215162816285e-07, + "loss": 1.1974, + "num_input_tokens_seen": 977674240, + "step": 14920, + "train_runtime": 242711.749, + "train_tokens_per_second": 4028.129 + }, + { + "epoch": 3.9425590915093096, + "grad_norm": 0.470703125, + "learning_rate": 2.7402972520623736e-07, + "loss": 1.221, + "num_input_tokens_seen": 978329600, + "step": 14930, + "train_runtime": 242909.5916, + "train_tokens_per_second": 4027.546 + }, + { + "epoch": 3.945200052819226, + "grad_norm": 0.52734375, + "learning_rate": 2.495796087123303e-07, + "loss": 1.2264, + "num_input_tokens_seen": 978984960, + "step": 14940, + "train_runtime": 243108.5553, + "train_tokens_per_second": 4026.946 + }, + { + "epoch": 3.947841014129143, + "grad_norm": 0.478515625, + "learning_rate": 2.2627127857874352e-07, + "loss": 1.1979, + "num_input_tokens_seen": 979640320, + "step": 14950, + "train_runtime": 243307.219, + "train_tokens_per_second": 4026.351 + }, + { + "epoch": 3.9504819754390597, + "grad_norm": 0.49609375, + "learning_rate": 2.0410484136443309e-07, + "loss": 1.2009, + "num_input_tokens_seen": 980295680, + "step": 14960, + "train_runtime": 243505.9871, + "train_tokens_per_second": 4025.756 + }, + { + "epoch": 3.9531229367489766, + "grad_norm": 0.466796875, + "learning_rate": 1.8308039840783663e-07, + "loss": 1.2191, + "num_input_tokens_seen": 980951040, + "step": 14970, + "train_runtime": 243704.8421, + "train_tokens_per_second": 4025.16 + }, + { + "epoch": 3.955763898058893, + "grad_norm": 0.478515625, + "learning_rate": 1.6319804582667907e-07, + "loss": 1.1971, + "num_input_tokens_seen": 981606400, + "step": 14980, + "train_runtime": 243904.6275, + "train_tokens_per_second": 4024.55 + }, + { + "epoch": 3.95840485936881, + "grad_norm": 0.484375, + "learning_rate": 1.444578745172509e-07, + "loss": 1.1879, + "num_input_tokens_seen": 982261760, + "step": 14990, + "train_runtime": 244103.8987, + "train_tokens_per_second": 4023.949 + }, + { + "epoch": 3.961045820678727, + "grad_norm": 0.474609375, + "learning_rate": 1.268599701541584e-07, + "loss": 1.1779, + "num_input_tokens_seen": 982917120, + "step": 15000, + "train_runtime": 244302.6509, + "train_tokens_per_second": 4023.358 + }, + { + "epoch": 3.9636867819886437, + "grad_norm": 0.50390625, + "learning_rate": 1.1040441318996286e-07, + "loss": 1.2384, + "num_input_tokens_seen": 983572480, + "step": 15010, + "train_runtime": 244515.7267, + "train_tokens_per_second": 4022.533 + }, + { + "epoch": 3.9663277432985606, + "grad_norm": 0.48828125, + "learning_rate": 9.509127885462542e-08, + "loss": 1.2202, + "num_input_tokens_seen": 984227840, + "step": 15020, + "train_runtime": 244719.9245, + "train_tokens_per_second": 4021.854 + }, + { + "epoch": 3.9689687046084776, + "grad_norm": 0.5, + "learning_rate": 8.09206371553961e-08, + "loss": 1.21, + "num_input_tokens_seen": 984883200, + "step": 15030, + "train_runtime": 244922.6385, + "train_tokens_per_second": 4021.201 + }, + { + "epoch": 3.971609665918394, + "grad_norm": 0.455078125, + "learning_rate": 6.789255287631412e-08, + "loss": 1.1314, + "num_input_tokens_seen": 985538560, + "step": 15040, + "train_runtime": 245125.8276, + "train_tokens_per_second": 4020.541 + }, + { + "epoch": 3.974250627228311, + "grad_norm": 0.48828125, + "learning_rate": 5.6007085578013705e-08, + "loss": 1.2161, + "num_input_tokens_seen": 986193920, + "step": 15050, + "train_runtime": 245326.2235, + "train_tokens_per_second": 4019.929 + }, + { + "epoch": 3.976891588538228, + "grad_norm": 0.48046875, + "learning_rate": 4.5264289597363174e-08, + "loss": 1.1896, + "num_input_tokens_seen": 986849280, + "step": 15060, + "train_runtime": 245528.3684, + "train_tokens_per_second": 4019.288 + }, + { + "epoch": 3.9795325498481446, + "grad_norm": 0.49609375, + "learning_rate": 3.566421404732623e-08, + "loss": 1.167, + "num_input_tokens_seen": 987504640, + "step": 15070, + "train_runtime": 245728.9485, + "train_tokens_per_second": 4018.674 + }, + { + "epoch": 3.9821735111580616, + "grad_norm": 0.51171875, + "learning_rate": 2.7206902816628854e-08, + "loss": 1.2484, + "num_input_tokens_seen": 988160000, + "step": 15080, + "train_runtime": 245928.8507, + "train_tokens_per_second": 4018.073 + }, + { + "epoch": 3.9848144724679786, + "grad_norm": 0.484375, + "learning_rate": 1.989239456970382e-08, + "loss": 1.1808, + "num_input_tokens_seen": 988815360, + "step": 15090, + "train_runtime": 246132.5042, + "train_tokens_per_second": 4017.411 + }, + { + "epoch": 3.987455433777895, + "grad_norm": 0.47265625, + "learning_rate": 1.3720722746302095e-08, + "loss": 1.2086, + "num_input_tokens_seen": 989470720, + "step": 15100, + "train_runtime": 246332.5248, + "train_tokens_per_second": 4016.809 + }, + { + "epoch": 3.990096395087812, + "grad_norm": 0.490234375, + "learning_rate": 8.691915561520602e-09, + "loss": 1.2707, + "num_input_tokens_seen": 990126080, + "step": 15110, + "train_runtime": 246535.5849, + "train_tokens_per_second": 4016.159 + }, + { + "epoch": 3.9927373563977286, + "grad_norm": 0.46484375, + "learning_rate": 4.805996005635693e-09, + "loss": 1.2222, + "num_input_tokens_seen": 990781440, + "step": 15120, + "train_runtime": 246736.6315, + "train_tokens_per_second": 4015.543 + }, + { + "epoch": 3.9953783177076456, + "grad_norm": 0.478515625, + "learning_rate": 2.0629818439366065e-09, + "loss": 1.1705, + "num_input_tokens_seen": 991436800, + "step": 15130, + "train_runtime": 246936.1816, + "train_tokens_per_second": 4014.952 + }, + { + "epoch": 3.998019279017562, + "grad_norm": 0.474609375, + "learning_rate": 4.6288561664220575e-10, + "loss": 1.1888, + "num_input_tokens_seen": 992092160, + "step": 15140, + "train_runtime": 247137.4018, + "train_tokens_per_second": 4014.334 + }, + { + "epoch": 4.0, + "num_input_tokens_seen": 992575488, + "step": 15148, + "total_flos": 2.1583206982169395e+19, + "train_loss": 2.0365376835735676, + "train_runtime": 247315.1676, + "train_samples_per_second": 1.96, + "train_steps_per_second": 0.061, + "train_tokens_per_second": 4013.403 + } + ], + "logging_steps": 10, + "max_steps": 15148, + "num_input_tokens_seen": 992575488, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1583206982169395e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}