{ "best_global_step": 180, "best_metric": 0.22924575209617615, "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_copa_42_1760623604/checkpoint-180", "epoch": 20.0, "eval_steps": 90, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05555555555555555, "grad_norm": 1.1328125, "learning_rate": 0.0006666666666666666, "loss": 0.1508, "num_input_tokens_seen": 1600, "step": 5, "train_runtime": 3.1038, "train_tokens_per_second": 515.503 }, { "epoch": 0.1111111111111111, "grad_norm": 12.25, "learning_rate": 0.0015, "loss": 0.1225, "num_input_tokens_seen": 3200, "step": 10, "train_runtime": 3.9613, "train_tokens_per_second": 807.808 }, { "epoch": 0.16666666666666666, "grad_norm": 11.625, "learning_rate": 0.002333333333333333, "loss": 0.2189, "num_input_tokens_seen": 4768, "step": 15, "train_runtime": 4.8299, "train_tokens_per_second": 987.182 }, { "epoch": 0.2222222222222222, "grad_norm": 41.25, "learning_rate": 0.0031666666666666666, "loss": 0.2288, "num_input_tokens_seen": 6336, "step": 20, "train_runtime": 5.6824, "train_tokens_per_second": 1115.019 }, { "epoch": 0.2777777777777778, "grad_norm": 100.0, "learning_rate": 0.004, "loss": 4.5867, "num_input_tokens_seen": 7904, "step": 25, "train_runtime": 6.501, "train_tokens_per_second": 1215.811 }, { "epoch": 0.3333333333333333, "grad_norm": 34.25, "learning_rate": 0.004833333333333334, "loss": 3.3424, "num_input_tokens_seen": 9504, "step": 30, "train_runtime": 7.296, "train_tokens_per_second": 1302.633 }, { "epoch": 0.3888888888888889, "grad_norm": 10.0, "learning_rate": 0.005666666666666666, "loss": 1.1922, "num_input_tokens_seen": 11072, "step": 35, "train_runtime": 8.0915, "train_tokens_per_second": 1368.343 }, { "epoch": 0.4444444444444444, "grad_norm": 31.875, "learning_rate": 0.0065, "loss": 0.5121, "num_input_tokens_seen": 12672, "step": 40, "train_runtime": 8.8889, "train_tokens_per_second": 1425.604 }, { "epoch": 0.5, "grad_norm": 0.953125, "learning_rate": 0.007333333333333333, "loss": 0.2904, "num_input_tokens_seen": 14176, "step": 45, "train_runtime": 9.678, "train_tokens_per_second": 1464.762 }, { "epoch": 0.5555555555555556, "grad_norm": 4.21875, "learning_rate": 0.008166666666666666, "loss": 0.3048, "num_input_tokens_seen": 15776, "step": 50, "train_runtime": 10.472, "train_tokens_per_second": 1506.489 }, { "epoch": 0.6111111111111112, "grad_norm": 4.21875, "learning_rate": 0.009, "loss": 0.3677, "num_input_tokens_seen": 17312, "step": 55, "train_runtime": 11.2639, "train_tokens_per_second": 1536.94 }, { "epoch": 0.6666666666666666, "grad_norm": 0.58203125, "learning_rate": 0.009833333333333333, "loss": 0.2604, "num_input_tokens_seen": 18848, "step": 60, "train_runtime": 12.0538, "train_tokens_per_second": 1563.659 }, { "epoch": 0.7222222222222222, "grad_norm": 0.50390625, "learning_rate": 0.010666666666666666, "loss": 0.256, "num_input_tokens_seen": 20448, "step": 65, "train_runtime": 12.8468, "train_tokens_per_second": 1591.68 }, { "epoch": 0.7777777777777778, "grad_norm": 1.484375, "learning_rate": 0.0115, "loss": 0.5609, "num_input_tokens_seen": 22016, "step": 70, "train_runtime": 13.6383, "train_tokens_per_second": 1614.278 }, { "epoch": 0.8333333333333334, "grad_norm": 0.333984375, "learning_rate": 0.012333333333333332, "loss": 1.0653, "num_input_tokens_seen": 23616, "step": 75, "train_runtime": 14.4335, "train_tokens_per_second": 1636.199 }, { "epoch": 0.8888888888888888, "grad_norm": 0.54296875, "learning_rate": 0.013166666666666667, "loss": 0.2765, "num_input_tokens_seen": 25152, "step": 80, "train_runtime": 15.2249, "train_tokens_per_second": 1652.025 }, { "epoch": 0.9444444444444444, "grad_norm": 22.875, "learning_rate": 0.014, "loss": 1.0491, "num_input_tokens_seen": 26688, "step": 85, "train_runtime": 16.0181, "train_tokens_per_second": 1666.117 }, { "epoch": 1.0, "grad_norm": 0.2060546875, "learning_rate": 0.014833333333333334, "loss": 0.26, "num_input_tokens_seen": 28256, "step": 90, "train_runtime": 16.9498, "train_tokens_per_second": 1667.037 }, { "epoch": 1.0, "eval_loss": 0.23620998859405518, "eval_runtime": 1.2704, "eval_samples_per_second": 31.486, "eval_steps_per_second": 7.872, "num_input_tokens_seen": 28256, "step": 90 }, { "epoch": 1.0555555555555556, "grad_norm": 0.91796875, "learning_rate": 0.015666666666666666, "loss": 0.29, "num_input_tokens_seen": 29824, "step": 95, "train_runtime": 19.874, "train_tokens_per_second": 1500.656 }, { "epoch": 1.1111111111111112, "grad_norm": 1.4140625, "learning_rate": 0.0165, "loss": 0.3073, "num_input_tokens_seen": 31360, "step": 100, "train_runtime": 20.6816, "train_tokens_per_second": 1516.326 }, { "epoch": 1.1666666666666667, "grad_norm": 0.251953125, "learning_rate": 0.017333333333333333, "loss": 0.231, "num_input_tokens_seen": 32960, "step": 105, "train_runtime": 21.4794, "train_tokens_per_second": 1534.493 }, { "epoch": 1.2222222222222223, "grad_norm": 0.1591796875, "learning_rate": 0.018166666666666664, "loss": 0.2256, "num_input_tokens_seen": 34464, "step": 110, "train_runtime": 22.2709, "train_tokens_per_second": 1547.491 }, { "epoch": 1.2777777777777777, "grad_norm": 0.375, "learning_rate": 0.019, "loss": 0.7092, "num_input_tokens_seen": 36032, "step": 115, "train_runtime": 23.0649, "train_tokens_per_second": 1562.201 }, { "epoch": 1.3333333333333333, "grad_norm": 3.171875, "learning_rate": 0.01983333333333333, "loss": 0.2771, "num_input_tokens_seen": 37600, "step": 120, "train_runtime": 23.8592, "train_tokens_per_second": 1575.909 }, { "epoch": 1.3888888888888888, "grad_norm": 2.15625, "learning_rate": 0.020666666666666667, "loss": 0.2618, "num_input_tokens_seen": 39168, "step": 125, "train_runtime": 24.6509, "train_tokens_per_second": 1588.905 }, { "epoch": 1.4444444444444444, "grad_norm": 0.07275390625, "learning_rate": 0.0215, "loss": 0.2334, "num_input_tokens_seen": 40736, "step": 130, "train_runtime": 25.4426, "train_tokens_per_second": 1601.092 }, { "epoch": 1.5, "grad_norm": 0.0279541015625, "learning_rate": 0.022333333333333334, "loss": 0.2422, "num_input_tokens_seen": 42240, "step": 135, "train_runtime": 26.2319, "train_tokens_per_second": 1610.255 }, { "epoch": 1.5555555555555556, "grad_norm": 0.103515625, "learning_rate": 0.023166666666666665, "loss": 0.2405, "num_input_tokens_seen": 43840, "step": 140, "train_runtime": 27.0285, "train_tokens_per_second": 1621.993 }, { "epoch": 1.6111111111111112, "grad_norm": 0.04541015625, "learning_rate": 0.024, "loss": 0.2416, "num_input_tokens_seen": 45408, "step": 145, "train_runtime": 27.823, "train_tokens_per_second": 1632.028 }, { "epoch": 1.6666666666666665, "grad_norm": 0.060791015625, "learning_rate": 0.024833333333333332, "loss": 0.2476, "num_input_tokens_seen": 46976, "step": 150, "train_runtime": 28.6144, "train_tokens_per_second": 1641.69 }, { "epoch": 1.7222222222222223, "grad_norm": 0.08740234375, "learning_rate": 0.025666666666666664, "loss": 0.2165, "num_input_tokens_seen": 48512, "step": 155, "train_runtime": 29.4067, "train_tokens_per_second": 1649.691 }, { "epoch": 1.7777777777777777, "grad_norm": 0.07373046875, "learning_rate": 0.0265, "loss": 0.2679, "num_input_tokens_seen": 50112, "step": 160, "train_runtime": 30.2016, "train_tokens_per_second": 1659.249 }, { "epoch": 1.8333333333333335, "grad_norm": 0.0458984375, "learning_rate": 0.02733333333333333, "loss": 0.2291, "num_input_tokens_seen": 51712, "step": 165, "train_runtime": 30.9983, "train_tokens_per_second": 1668.222 }, { "epoch": 1.8888888888888888, "grad_norm": 0.07373046875, "learning_rate": 0.028166666666666666, "loss": 0.235, "num_input_tokens_seen": 53280, "step": 170, "train_runtime": 31.7939, "train_tokens_per_second": 1675.793 }, { "epoch": 1.9444444444444444, "grad_norm": 0.032470703125, "learning_rate": 0.028999999999999998, "loss": 0.241, "num_input_tokens_seen": 54880, "step": 175, "train_runtime": 32.5915, "train_tokens_per_second": 1683.876 }, { "epoch": 2.0, "grad_norm": 0.01507568359375, "learning_rate": 0.029833333333333333, "loss": 0.2246, "num_input_tokens_seen": 56480, "step": 180, "train_runtime": 33.4262, "train_tokens_per_second": 1689.693 }, { "epoch": 2.0, "eval_loss": 0.22924575209617615, "eval_runtime": 0.8256, "eval_samples_per_second": 48.45, "eval_steps_per_second": 12.112, "num_input_tokens_seen": 56480, "step": 180 }, { "epoch": 2.0555555555555554, "grad_norm": 0.0224609375, "learning_rate": 0.02999954871719651, "loss": 0.2275, "num_input_tokens_seen": 58048, "step": 185, "train_runtime": 36.0921, "train_tokens_per_second": 1608.328 }, { "epoch": 2.111111111111111, "grad_norm": 0.01361083984375, "learning_rate": 0.029997715427345868, "loss": 0.2277, "num_input_tokens_seen": 59584, "step": 190, "train_runtime": 36.8907, "train_tokens_per_second": 1615.149 }, { "epoch": 2.1666666666666665, "grad_norm": 0.052734375, "learning_rate": 0.02999447209750064, "loss": 0.2313, "num_input_tokens_seen": 61216, "step": 195, "train_runtime": 37.6876, "train_tokens_per_second": 1624.3 }, { "epoch": 2.2222222222222223, "grad_norm": 0.033203125, "learning_rate": 0.02998981903258893, "loss": 0.2358, "num_input_tokens_seen": 62784, "step": 200, "train_runtime": 38.4845, "train_tokens_per_second": 1631.408 }, { "epoch": 2.2777777777777777, "grad_norm": 0.044921875, "learning_rate": 0.02998375667007787, "loss": 0.2412, "num_input_tokens_seen": 64352, "step": 205, "train_runtime": 39.2774, "train_tokens_per_second": 1638.396 }, { "epoch": 2.3333333333333335, "grad_norm": 0.0220947265625, "learning_rate": 0.029976285579932503, "loss": 0.2008, "num_input_tokens_seen": 65952, "step": 210, "train_runtime": 40.0729, "train_tokens_per_second": 1645.799 }, { "epoch": 2.388888888888889, "grad_norm": 0.12255859375, "learning_rate": 0.029967406464562214, "loss": 0.2465, "num_input_tokens_seen": 67552, "step": 215, "train_runtime": 40.866, "train_tokens_per_second": 1653.013 }, { "epoch": 2.4444444444444446, "grad_norm": 0.0245361328125, "learning_rate": 0.02995712015875466, "loss": 0.2297, "num_input_tokens_seen": 69120, "step": 220, "train_runtime": 41.6581, "train_tokens_per_second": 1659.219 }, { "epoch": 2.5, "grad_norm": 0.04248046875, "learning_rate": 0.029945427629597305, "loss": 0.2409, "num_input_tokens_seen": 70688, "step": 225, "train_runtime": 42.453, "train_tokens_per_second": 1665.087 }, { "epoch": 2.5555555555555554, "grad_norm": 0.07763671875, "learning_rate": 0.029932329976386493, "loss": 0.2373, "num_input_tokens_seen": 72288, "step": 230, "train_runtime": 43.2489, "train_tokens_per_second": 1671.441 }, { "epoch": 2.611111111111111, "grad_norm": 0.10400390625, "learning_rate": 0.0299178284305241, "loss": 0.2425, "num_input_tokens_seen": 73856, "step": 235, "train_runtime": 44.0446, "train_tokens_per_second": 1676.845 }, { "epoch": 2.6666666666666665, "grad_norm": 0.04052734375, "learning_rate": 0.02990192435540175, "loss": 0.2331, "num_input_tokens_seen": 75392, "step": 240, "train_runtime": 44.8352, "train_tokens_per_second": 1681.537 }, { "epoch": 2.7222222222222223, "grad_norm": 0.02197265625, "learning_rate": 0.029884619246272646, "loss": 0.2384, "num_input_tokens_seen": 76960, "step": 245, "train_runtime": 45.6329, "train_tokens_per_second": 1686.502 }, { "epoch": 2.7777777777777777, "grad_norm": 0.03369140625, "learning_rate": 0.02986591473011098, "loss": 0.2255, "num_input_tokens_seen": 78496, "step": 250, "train_runtime": 46.4329, "train_tokens_per_second": 1690.526 }, { "epoch": 2.8333333333333335, "grad_norm": 0.0205078125, "learning_rate": 0.02984581256545898, "loss": 0.2376, "num_input_tokens_seen": 80000, "step": 255, "train_runtime": 47.2277, "train_tokens_per_second": 1693.922 }, { "epoch": 2.888888888888889, "grad_norm": 0.030029296875, "learning_rate": 0.02982431464226157, "loss": 0.2274, "num_input_tokens_seen": 81568, "step": 260, "train_runtime": 48.0201, "train_tokens_per_second": 1698.622 }, { "epoch": 2.9444444444444446, "grad_norm": 0.03125, "learning_rate": 0.02980142298168869, "loss": 0.2306, "num_input_tokens_seen": 83168, "step": 265, "train_runtime": 48.8131, "train_tokens_per_second": 1703.803 }, { "epoch": 3.0, "grad_norm": 0.02099609375, "learning_rate": 0.029777139735945243, "loss": 0.2336, "num_input_tokens_seen": 84736, "step": 270, "train_runtime": 49.7464, "train_tokens_per_second": 1703.361 }, { "epoch": 3.0, "eval_loss": 0.23087672889232635, "eval_runtime": 0.8189, "eval_samples_per_second": 48.846, "eval_steps_per_second": 12.212, "num_input_tokens_seen": 84736, "step": 270 }, { "epoch": 3.0555555555555554, "grad_norm": 0.03759765625, "learning_rate": 0.029751467188068818, "loss": 0.2376, "num_input_tokens_seen": 86304, "step": 275, "train_runtime": 52.2896, "train_tokens_per_second": 1650.5 }, { "epoch": 3.111111111111111, "grad_norm": 0.029541015625, "learning_rate": 0.02972440775171496, "loss": 0.2289, "num_input_tokens_seen": 87904, "step": 280, "train_runtime": 53.1215, "train_tokens_per_second": 1654.773 }, { "epoch": 3.1666666666666665, "grad_norm": 0.041259765625, "learning_rate": 0.029695963970930307, "loss": 0.2255, "num_input_tokens_seen": 89408, "step": 285, "train_runtime": 53.9107, "train_tokens_per_second": 1658.447 }, { "epoch": 3.2222222222222223, "grad_norm": 0.01312255859375, "learning_rate": 0.029666138519913395, "loss": 0.2251, "num_input_tokens_seen": 91008, "step": 290, "train_runtime": 54.7037, "train_tokens_per_second": 1663.654 }, { "epoch": 3.2777777777777777, "grad_norm": 0.0269775390625, "learning_rate": 0.029634934202763214, "loss": 0.2566, "num_input_tokens_seen": 92512, "step": 295, "train_runtime": 55.4911, "train_tokens_per_second": 1667.149 }, { "epoch": 3.3333333333333335, "grad_norm": 0.02490234375, "learning_rate": 0.0296023539532156, "loss": 0.2391, "num_input_tokens_seen": 94080, "step": 300, "train_runtime": 56.2846, "train_tokens_per_second": 1671.505 }, { "epoch": 3.388888888888889, "grad_norm": 0.042236328125, "learning_rate": 0.029568400834367403, "loss": 0.2269, "num_input_tokens_seen": 95680, "step": 305, "train_runtime": 57.0809, "train_tokens_per_second": 1676.218 }, { "epoch": 3.4444444444444446, "grad_norm": 0.033935546875, "learning_rate": 0.02953307803838851, "loss": 0.2325, "num_input_tokens_seen": 97248, "step": 310, "train_runtime": 57.8741, "train_tokens_per_second": 1680.338 }, { "epoch": 3.5, "grad_norm": 0.00994873046875, "learning_rate": 0.02949638888622172, "loss": 0.2369, "num_input_tokens_seen": 98784, "step": 315, "train_runtime": 58.6661, "train_tokens_per_second": 1683.835 }, { "epoch": 3.5555555555555554, "grad_norm": 0.01507568359375, "learning_rate": 0.029458336827270518, "loss": 0.2209, "num_input_tokens_seen": 100384, "step": 320, "train_runtime": 59.4642, "train_tokens_per_second": 1688.142 }, { "epoch": 3.611111111111111, "grad_norm": 0.0086669921875, "learning_rate": 0.029418925439074782, "loss": 0.2318, "num_input_tokens_seen": 101952, "step": 325, "train_runtime": 60.2574, "train_tokens_per_second": 1691.941 }, { "epoch": 3.6666666666666665, "grad_norm": 0.006805419921875, "learning_rate": 0.029378158426974426, "loss": 0.2321, "num_input_tokens_seen": 103520, "step": 330, "train_runtime": 61.0556, "train_tokens_per_second": 1695.503 }, { "epoch": 3.7222222222222223, "grad_norm": 0.003814697265625, "learning_rate": 0.029336039623761044, "loss": 0.2406, "num_input_tokens_seen": 105120, "step": 335, "train_runtime": 61.8554, "train_tokens_per_second": 1699.448 }, { "epoch": 3.7777777777777777, "grad_norm": 0.003692626953125, "learning_rate": 0.02929257298931754, "loss": 0.2309, "num_input_tokens_seen": 106720, "step": 340, "train_runtime": 62.653, "train_tokens_per_second": 1703.349 }, { "epoch": 3.8333333333333335, "grad_norm": 0.0189208984375, "learning_rate": 0.02924776261024586, "loss": 0.2327, "num_input_tokens_seen": 108320, "step": 345, "train_runtime": 63.447, "train_tokens_per_second": 1707.252 }, { "epoch": 3.888888888888889, "grad_norm": 0.0030975341796875, "learning_rate": 0.02920161269948277, "loss": 0.2304, "num_input_tokens_seen": 109888, "step": 350, "train_runtime": 64.242, "train_tokens_per_second": 1710.531 }, { "epoch": 3.9444444444444446, "grad_norm": 0.00421142578125, "learning_rate": 0.029154127595903752, "loss": 0.2293, "num_input_tokens_seen": 111424, "step": 355, "train_runtime": 65.0354, "train_tokens_per_second": 1713.281 }, { "epoch": 4.0, "grad_norm": 0.00347900390625, "learning_rate": 0.029105311763915113, "loss": 0.2347, "num_input_tokens_seen": 113024, "step": 360, "train_runtime": 65.871, "train_tokens_per_second": 1715.839 }, { "epoch": 4.0, "eval_loss": 0.23181450366973877, "eval_runtime": 0.8261, "eval_samples_per_second": 48.422, "eval_steps_per_second": 12.105, "num_input_tokens_seen": 113024, "step": 360 }, { "epoch": 4.055555555555555, "grad_norm": 0.004241943359375, "learning_rate": 0.029055169793034224, "loss": 0.2306, "num_input_tokens_seen": 114624, "step": 365, "train_runtime": 68.3415, "train_tokens_per_second": 1677.223 }, { "epoch": 4.111111111111111, "grad_norm": 0.00628662109375, "learning_rate": 0.029003706397458022, "loss": 0.2349, "num_input_tokens_seen": 116224, "step": 370, "train_runtime": 69.1577, "train_tokens_per_second": 1680.564 }, { "epoch": 4.166666666666667, "grad_norm": 0.00592041015625, "learning_rate": 0.028950926415619846, "loss": 0.2471, "num_input_tokens_seen": 117760, "step": 375, "train_runtime": 69.9474, "train_tokens_per_second": 1683.55 }, { "epoch": 4.222222222222222, "grad_norm": 0.0169677734375, "learning_rate": 0.028896834809734474, "loss": 0.2298, "num_input_tokens_seen": 119360, "step": 380, "train_runtime": 70.7435, "train_tokens_per_second": 1687.223 }, { "epoch": 4.277777777777778, "grad_norm": 0.0302734375, "learning_rate": 0.028841436665331635, "loss": 0.2254, "num_input_tokens_seen": 120960, "step": 385, "train_runtime": 71.5406, "train_tokens_per_second": 1690.787 }, { "epoch": 4.333333333333333, "grad_norm": 0.003662109375, "learning_rate": 0.02878473719077787, "loss": 0.2393, "num_input_tokens_seen": 122528, "step": 390, "train_runtime": 72.3313, "train_tokens_per_second": 1693.983 }, { "epoch": 4.388888888888889, "grad_norm": 0.0174560546875, "learning_rate": 0.028726741716786866, "loss": 0.2317, "num_input_tokens_seen": 124096, "step": 395, "train_runtime": 73.1276, "train_tokens_per_second": 1696.979 }, { "epoch": 4.444444444444445, "grad_norm": 0.0186767578125, "learning_rate": 0.02866745569591825, "loss": 0.2351, "num_input_tokens_seen": 125696, "step": 400, "train_runtime": 73.9263, "train_tokens_per_second": 1700.287 }, { "epoch": 4.5, "grad_norm": 0.017578125, "learning_rate": 0.028606884702065006, "loss": 0.2317, "num_input_tokens_seen": 127264, "step": 405, "train_runtime": 74.7224, "train_tokens_per_second": 1703.157 }, { "epoch": 4.555555555555555, "grad_norm": 0.005462646484375, "learning_rate": 0.028545034429929377, "loss": 0.2264, "num_input_tokens_seen": 128832, "step": 410, "train_runtime": 75.5264, "train_tokens_per_second": 1705.788 }, { "epoch": 4.611111111111111, "grad_norm": 0.0185546875, "learning_rate": 0.028481910694487505, "loss": 0.2396, "num_input_tokens_seen": 130464, "step": 415, "train_runtime": 76.325, "train_tokens_per_second": 1709.321 }, { "epoch": 4.666666666666667, "grad_norm": 0.00421142578125, "learning_rate": 0.02841751943044271, "loss": 0.2336, "num_input_tokens_seen": 132032, "step": 420, "train_runtime": 77.126, "train_tokens_per_second": 1711.901 }, { "epoch": 4.722222222222222, "grad_norm": 0.01361083984375, "learning_rate": 0.028351866691667543, "loss": 0.2314, "num_input_tokens_seen": 133632, "step": 425, "train_runtime": 77.9247, "train_tokens_per_second": 1714.886 }, { "epoch": 4.777777777777778, "grad_norm": 0.01361083984375, "learning_rate": 0.02828495865063459, "loss": 0.2325, "num_input_tokens_seen": 135232, "step": 430, "train_runtime": 78.7259, "train_tokens_per_second": 1717.757 }, { "epoch": 4.833333333333333, "grad_norm": 0.0042724609375, "learning_rate": 0.028216801597836176, "loss": 0.2216, "num_input_tokens_seen": 136768, "step": 435, "train_runtime": 79.5217, "train_tokens_per_second": 1719.883 }, { "epoch": 4.888888888888889, "grad_norm": 0.005462646484375, "learning_rate": 0.028147401941192952, "loss": 0.2297, "num_input_tokens_seen": 138368, "step": 440, "train_runtime": 80.3232, "train_tokens_per_second": 1722.64 }, { "epoch": 4.944444444444445, "grad_norm": 0.02197265625, "learning_rate": 0.028076766205451433, "loss": 0.2443, "num_input_tokens_seen": 139904, "step": 445, "train_runtime": 81.1198, "train_tokens_per_second": 1724.659 }, { "epoch": 5.0, "grad_norm": 0.005706787109375, "learning_rate": 0.028004901031570568, "loss": 0.2277, "num_input_tokens_seen": 141440, "step": 450, "train_runtime": 81.9601, "train_tokens_per_second": 1725.718 }, { "epoch": 5.0, "eval_loss": 0.2365764081478119, "eval_runtime": 0.8277, "eval_samples_per_second": 48.329, "eval_steps_per_second": 12.082, "num_input_tokens_seen": 141440, "step": 450 }, { "epoch": 5.055555555555555, "grad_norm": 0.0167236328125, "learning_rate": 0.027931813176097366, "loss": 0.2361, "num_input_tokens_seen": 142976, "step": 455, "train_runtime": 84.4881, "train_tokens_per_second": 1692.262 }, { "epoch": 5.111111111111111, "grad_norm": 0.00494384765625, "learning_rate": 0.027857509510531685, "loss": 0.2293, "num_input_tokens_seen": 144576, "step": 460, "train_runtime": 85.2893, "train_tokens_per_second": 1695.125 }, { "epoch": 5.166666666666667, "grad_norm": 0.0245361328125, "learning_rate": 0.02778199702068017, "loss": 0.2307, "num_input_tokens_seen": 146144, "step": 465, "train_runtime": 86.0895, "train_tokens_per_second": 1697.581 }, { "epoch": 5.222222222222222, "grad_norm": 0.012451171875, "learning_rate": 0.02770528280599949, "loss": 0.2336, "num_input_tokens_seen": 147712, "step": 470, "train_runtime": 86.8889, "train_tokens_per_second": 1700.009 }, { "epoch": 5.277777777777778, "grad_norm": 0.006683349609375, "learning_rate": 0.02762737407892886, "loss": 0.2294, "num_input_tokens_seen": 149248, "step": 475, "train_runtime": 87.6835, "train_tokens_per_second": 1702.121 }, { "epoch": 5.333333333333333, "grad_norm": 0.013916015625, "learning_rate": 0.02754827816421195, "loss": 0.2357, "num_input_tokens_seen": 150816, "step": 480, "train_runtime": 88.4782, "train_tokens_per_second": 1704.555 }, { "epoch": 5.388888888888889, "grad_norm": 0.01202392578125, "learning_rate": 0.02746800249820822, "loss": 0.2212, "num_input_tokens_seen": 152352, "step": 485, "train_runtime": 89.2722, "train_tokens_per_second": 1706.6 }, { "epoch": 5.444444444444445, "grad_norm": 0.0166015625, "learning_rate": 0.027386554628193813, "loss": 0.2362, "num_input_tokens_seen": 153888, "step": 490, "train_runtime": 90.0625, "train_tokens_per_second": 1708.681 }, { "epoch": 5.5, "grad_norm": 0.00469970703125, "learning_rate": 0.027303942211651937, "loss": 0.2391, "num_input_tokens_seen": 155488, "step": 495, "train_runtime": 90.8589, "train_tokens_per_second": 1711.313 }, { "epoch": 5.555555555555555, "grad_norm": 0.0223388671875, "learning_rate": 0.02722017301555297, "loss": 0.2305, "num_input_tokens_seen": 157024, "step": 500, "train_runtime": 91.6521, "train_tokens_per_second": 1713.261 }, { "epoch": 5.611111111111111, "grad_norm": 0.023193359375, "learning_rate": 0.02713525491562421, "loss": 0.2316, "num_input_tokens_seen": 158528, "step": 505, "train_runtime": 92.4397, "train_tokens_per_second": 1714.934 }, { "epoch": 5.666666666666667, "grad_norm": 0.0115966796875, "learning_rate": 0.027049195895609432, "loss": 0.2305, "num_input_tokens_seen": 160064, "step": 510, "train_runtime": 93.229, "train_tokens_per_second": 1716.89 }, { "epoch": 5.722222222222222, "grad_norm": 0.009765625, "learning_rate": 0.026962004046518273, "loss": 0.2286, "num_input_tokens_seen": 161664, "step": 515, "train_runtime": 94.0222, "train_tokens_per_second": 1719.424 }, { "epoch": 5.777777777777778, "grad_norm": 0.02099609375, "learning_rate": 0.02687368756586555, "loss": 0.2297, "num_input_tokens_seen": 163264, "step": 520, "train_runtime": 94.8159, "train_tokens_per_second": 1721.905 }, { "epoch": 5.833333333333333, "grad_norm": 0.01226806640625, "learning_rate": 0.02678425475690055, "loss": 0.2348, "num_input_tokens_seen": 164864, "step": 525, "train_runtime": 95.6085, "train_tokens_per_second": 1724.365 }, { "epoch": 5.888888888888889, "grad_norm": 0.0126953125, "learning_rate": 0.02669371402782638, "loss": 0.2286, "num_input_tokens_seen": 166432, "step": 530, "train_runtime": 96.4007, "train_tokens_per_second": 1726.461 }, { "epoch": 5.944444444444445, "grad_norm": 0.01153564453125, "learning_rate": 0.026602073891009458, "loss": 0.2308, "num_input_tokens_seen": 168032, "step": 535, "train_runtime": 97.1955, "train_tokens_per_second": 1728.804 }, { "epoch": 6.0, "grad_norm": 0.0030517578125, "learning_rate": 0.0265093429621792, "loss": 0.2348, "num_input_tokens_seen": 169600, "step": 540, "train_runtime": 98.0299, "train_tokens_per_second": 1730.084 }, { "epoch": 6.0, "eval_loss": 0.2344598025083542, "eval_runtime": 0.8191, "eval_samples_per_second": 48.837, "eval_steps_per_second": 12.209, "num_input_tokens_seen": 169600, "step": 540 }, { "epoch": 6.055555555555555, "grad_norm": 0.00494384765625, "learning_rate": 0.026415529959618007, "loss": 0.2308, "num_input_tokens_seen": 171168, "step": 545, "train_runtime": 100.5102, "train_tokens_per_second": 1702.992 }, { "epoch": 6.111111111111111, "grad_norm": 0.0037078857421875, "learning_rate": 0.02632064370334158, "loss": 0.2311, "num_input_tokens_seen": 172672, "step": 550, "train_runtime": 101.3026, "train_tokens_per_second": 1704.517 }, { "epoch": 6.166666666666667, "grad_norm": 0.0038299560546875, "learning_rate": 0.026224693114269705, "loss": 0.233, "num_input_tokens_seen": 174240, "step": 555, "train_runtime": 102.1187, "train_tokens_per_second": 1706.249 }, { "epoch": 6.222222222222222, "grad_norm": 0.011962890625, "learning_rate": 0.02612768721338753, "loss": 0.2279, "num_input_tokens_seen": 175776, "step": 560, "train_runtime": 102.9125, "train_tokens_per_second": 1708.014 }, { "epoch": 6.277777777777778, "grad_norm": 0.0228271484375, "learning_rate": 0.02602963512089743, "loss": 0.232, "num_input_tokens_seen": 177376, "step": 565, "train_runtime": 103.7126, "train_tokens_per_second": 1710.265 }, { "epoch": 6.333333333333333, "grad_norm": 0.0133056640625, "learning_rate": 0.025930546055361575, "loss": 0.231, "num_input_tokens_seen": 178912, "step": 570, "train_runtime": 104.5028, "train_tokens_per_second": 1712.031 }, { "epoch": 6.388888888888889, "grad_norm": 0.01507568359375, "learning_rate": 0.025830429332835202, "loss": 0.2286, "num_input_tokens_seen": 180480, "step": 575, "train_runtime": 105.2944, "train_tokens_per_second": 1714.052 }, { "epoch": 6.444444444444445, "grad_norm": 0.015380859375, "learning_rate": 0.025729294365990772, "loss": 0.231, "num_input_tokens_seen": 182048, "step": 580, "train_runtime": 106.0853, "train_tokens_per_second": 1716.052 }, { "epoch": 6.5, "grad_norm": 0.007720947265625, "learning_rate": 0.025627150663232998, "loss": 0.2408, "num_input_tokens_seen": 183648, "step": 585, "train_runtime": 106.8794, "train_tokens_per_second": 1718.273 }, { "epoch": 6.555555555555555, "grad_norm": 0.00848388671875, "learning_rate": 0.025524007827804902, "loss": 0.2358, "num_input_tokens_seen": 185248, "step": 590, "train_runtime": 107.6729, "train_tokens_per_second": 1720.47 }, { "epoch": 6.611111111111111, "grad_norm": 0.0162353515625, "learning_rate": 0.025419875556884956, "loss": 0.2302, "num_input_tokens_seen": 186720, "step": 595, "train_runtime": 108.46, "train_tokens_per_second": 1721.556 }, { "epoch": 6.666666666666667, "grad_norm": 0.0142822265625, "learning_rate": 0.025314763640675374, "loss": 0.2313, "num_input_tokens_seen": 188288, "step": 600, "train_runtime": 109.2516, "train_tokens_per_second": 1723.435 }, { "epoch": 6.722222222222222, "grad_norm": 0.006683349609375, "learning_rate": 0.025208681961481655, "loss": 0.2359, "num_input_tokens_seen": 189888, "step": 605, "train_runtime": 110.0491, "train_tokens_per_second": 1725.485 }, { "epoch": 6.777777777777778, "grad_norm": 0.006134033203125, "learning_rate": 0.025101640492783503, "loss": 0.238, "num_input_tokens_seen": 191424, "step": 610, "train_runtime": 110.8418, "train_tokens_per_second": 1727.002 }, { "epoch": 6.833333333333333, "grad_norm": 0.02490234375, "learning_rate": 0.024993649298297137, "loss": 0.2306, "num_input_tokens_seen": 193056, "step": 615, "train_runtime": 111.6358, "train_tokens_per_second": 1729.338 }, { "epoch": 6.888888888888889, "grad_norm": 0.01251220703125, "learning_rate": 0.02488471853102912, "loss": 0.2314, "num_input_tokens_seen": 194592, "step": 620, "train_runtime": 112.4269, "train_tokens_per_second": 1730.831 }, { "epoch": 6.944444444444445, "grad_norm": 0.0252685546875, "learning_rate": 0.024774858432321828, "loss": 0.2347, "num_input_tokens_seen": 196192, "step": 625, "train_runtime": 113.2223, "train_tokens_per_second": 1732.803 }, { "epoch": 7.0, "grad_norm": 0.01275634765625, "learning_rate": 0.024664079330890574, "loss": 0.2294, "num_input_tokens_seen": 197792, "step": 630, "train_runtime": 114.0579, "train_tokens_per_second": 1734.137 }, { "epoch": 7.0, "eval_loss": 0.23141007125377655, "eval_runtime": 0.8186, "eval_samples_per_second": 48.864, "eval_steps_per_second": 12.216, "num_input_tokens_seen": 197792, "step": 630 }, { "epoch": 7.055555555555555, "grad_norm": 0.003936767578125, "learning_rate": 0.02455239164185254, "loss": 0.2314, "num_input_tokens_seen": 199392, "step": 635, "train_runtime": 116.6196, "train_tokens_per_second": 1709.764 }, { "epoch": 7.111111111111111, "grad_norm": 0.005767822265625, "learning_rate": 0.024439805865747562, "loss": 0.2286, "num_input_tokens_seen": 200992, "step": 640, "train_runtime": 117.4275, "train_tokens_per_second": 1711.626 }, { "epoch": 7.166666666666667, "grad_norm": 0.0135498046875, "learning_rate": 0.02432633258755093, "loss": 0.236, "num_input_tokens_seen": 202592, "step": 645, "train_runtime": 118.2277, "train_tokens_per_second": 1713.574 }, { "epoch": 7.222222222222222, "grad_norm": 0.01141357421875, "learning_rate": 0.024211982475678205, "loss": 0.2237, "num_input_tokens_seen": 204064, "step": 650, "train_runtime": 119.016, "train_tokens_per_second": 1714.593 }, { "epoch": 7.277777777777778, "grad_norm": 0.0172119140625, "learning_rate": 0.024096766280982205, "loss": 0.2322, "num_input_tokens_seen": 205664, "step": 655, "train_runtime": 119.8138, "train_tokens_per_second": 1716.53 }, { "epoch": 7.333333333333333, "grad_norm": 0.0172119140625, "learning_rate": 0.023980694835742226, "loss": 0.2317, "num_input_tokens_seen": 207264, "step": 660, "train_runtime": 120.6083, "train_tokens_per_second": 1718.488 }, { "epoch": 7.388888888888889, "grad_norm": 0.0106201171875, "learning_rate": 0.023863779052645667, "loss": 0.2301, "num_input_tokens_seen": 208832, "step": 665, "train_runtime": 121.401, "train_tokens_per_second": 1720.184 }, { "epoch": 7.444444444444445, "grad_norm": 0.02587890625, "learning_rate": 0.02374602992376202, "loss": 0.227, "num_input_tokens_seen": 210368, "step": 670, "train_runtime": 122.192, "train_tokens_per_second": 1721.618 }, { "epoch": 7.5, "grad_norm": 0.020751953125, "learning_rate": 0.023627458519509432, "loss": 0.228, "num_input_tokens_seen": 211936, "step": 675, "train_runtime": 122.9844, "train_tokens_per_second": 1723.275 }, { "epoch": 7.555555555555555, "grad_norm": 0.0233154296875, "learning_rate": 0.023508075987613904, "loss": 0.2143, "num_input_tokens_seen": 213536, "step": 680, "train_runtime": 123.7781, "train_tokens_per_second": 1725.151 }, { "epoch": 7.611111111111111, "grad_norm": 0.05078125, "learning_rate": 0.023387893552061202, "loss": 0.2273, "num_input_tokens_seen": 215136, "step": 685, "train_runtime": 124.613, "train_tokens_per_second": 1726.433 }, { "epoch": 7.666666666666667, "grad_norm": 0.02294921875, "learning_rate": 0.023266922512041644, "loss": 0.2513, "num_input_tokens_seen": 216736, "step": 690, "train_runtime": 125.4907, "train_tokens_per_second": 1727.107 }, { "epoch": 7.722222222222222, "grad_norm": 0.029052734375, "learning_rate": 0.023145174240887748, "loss": 0.2378, "num_input_tokens_seen": 218272, "step": 695, "train_runtime": 126.2833, "train_tokens_per_second": 1728.432 }, { "epoch": 7.777777777777778, "grad_norm": 0.031005859375, "learning_rate": 0.023022660185004967, "loss": 0.2316, "num_input_tokens_seen": 219808, "step": 700, "train_runtime": 127.0771, "train_tokens_per_second": 1729.722 }, { "epoch": 7.833333333333333, "grad_norm": 0.02587890625, "learning_rate": 0.02289939186279551, "loss": 0.2331, "num_input_tokens_seen": 221312, "step": 705, "train_runtime": 127.8643, "train_tokens_per_second": 1730.834 }, { "epoch": 7.888888888888889, "grad_norm": 0.023193359375, "learning_rate": 0.022775380863575456, "loss": 0.2339, "num_input_tokens_seen": 222880, "step": 710, "train_runtime": 128.6548, "train_tokens_per_second": 1732.388 }, { "epoch": 7.944444444444445, "grad_norm": 0.03369140625, "learning_rate": 0.02265063884648513, "loss": 0.2344, "num_input_tokens_seen": 224416, "step": 715, "train_runtime": 129.4473, "train_tokens_per_second": 1733.648 }, { "epoch": 8.0, "grad_norm": 0.049072265625, "learning_rate": 0.022525177539392937, "loss": 0.218, "num_input_tokens_seen": 225984, "step": 720, "train_runtime": 130.2822, "train_tokens_per_second": 1734.573 }, { "epoch": 8.0, "eval_loss": 0.2308429777622223, "eval_runtime": 0.8222, "eval_samples_per_second": 48.649, "eval_steps_per_second": 12.162, "num_input_tokens_seen": 225984, "step": 720 }, { "epoch": 8.055555555555555, "grad_norm": 0.052978515625, "learning_rate": 0.02239900873779278, "loss": 0.2506, "num_input_tokens_seen": 227552, "step": 725, "train_runtime": 132.8103, "train_tokens_per_second": 1713.361 }, { "epoch": 8.11111111111111, "grad_norm": 0.03564453125, "learning_rate": 0.022272144303695056, "loss": 0.2338, "num_input_tokens_seen": 229088, "step": 730, "train_runtime": 133.6036, "train_tokens_per_second": 1714.684 }, { "epoch": 8.166666666666666, "grad_norm": 0.1044921875, "learning_rate": 0.02214459616451143, "loss": 0.2381, "num_input_tokens_seen": 230656, "step": 735, "train_runtime": 134.4027, "train_tokens_per_second": 1716.156 }, { "epoch": 8.222222222222221, "grad_norm": 0.047119140625, "learning_rate": 0.02201637631193346, "loss": 0.2288, "num_input_tokens_seen": 232224, "step": 740, "train_runtime": 135.195, "train_tokens_per_second": 1717.697 }, { "epoch": 8.277777777777779, "grad_norm": 0.2265625, "learning_rate": 0.021887496800805175, "loss": 0.2157, "num_input_tokens_seen": 233792, "step": 745, "train_runtime": 135.9862, "train_tokens_per_second": 1719.233 }, { "epoch": 8.333333333333334, "grad_norm": 0.061279296875, "learning_rate": 0.021757969747989707, "loss": 0.2441, "num_input_tokens_seen": 235328, "step": 750, "train_runtime": 136.775, "train_tokens_per_second": 1720.549 }, { "epoch": 8.38888888888889, "grad_norm": 0.0341796875, "learning_rate": 0.02162780733123012, "loss": 0.2362, "num_input_tokens_seen": 236864, "step": 755, "train_runtime": 137.5647, "train_tokens_per_second": 1721.838 }, { "epoch": 8.444444444444445, "grad_norm": 2.125, "learning_rate": 0.021497021788004445, "loss": 0.9504, "num_input_tokens_seen": 238368, "step": 760, "train_runtime": 138.3522, "train_tokens_per_second": 1722.907 }, { "epoch": 8.5, "grad_norm": 0.1669921875, "learning_rate": 0.021365625414375228, "loss": 0.2414, "num_input_tokens_seen": 239936, "step": 765, "train_runtime": 139.1437, "train_tokens_per_second": 1724.375 }, { "epoch": 8.555555555555555, "grad_norm": 0.10400390625, "learning_rate": 0.021233630563833435, "loss": 0.2626, "num_input_tokens_seen": 241536, "step": 770, "train_runtime": 139.9389, "train_tokens_per_second": 1726.01 }, { "epoch": 8.61111111111111, "grad_norm": 0.038330078125, "learning_rate": 0.021101049646137005, "loss": 0.2398, "num_input_tokens_seen": 243136, "step": 775, "train_runtime": 140.7323, "train_tokens_per_second": 1727.648 }, { "epoch": 8.666666666666666, "grad_norm": 0.051025390625, "learning_rate": 0.02096789512614417, "loss": 0.2382, "num_input_tokens_seen": 244704, "step": 780, "train_runtime": 141.5236, "train_tokens_per_second": 1729.069 }, { "epoch": 8.722222222222221, "grad_norm": 0.048095703125, "learning_rate": 0.020834179522641504, "loss": 0.2276, "num_input_tokens_seen": 246272, "step": 785, "train_runtime": 142.3196, "train_tokens_per_second": 1730.416 }, { "epoch": 8.777777777777779, "grad_norm": 0.01165771484375, "learning_rate": 0.020699915407166987, "loss": 0.2446, "num_input_tokens_seen": 247808, "step": 790, "train_runtime": 143.1092, "train_tokens_per_second": 1731.6 }, { "epoch": 8.833333333333334, "grad_norm": 0.0169677734375, "learning_rate": 0.020565115402828002, "loss": 0.2376, "num_input_tokens_seen": 249376, "step": 795, "train_runtime": 143.9049, "train_tokens_per_second": 1732.922 }, { "epoch": 8.88888888888889, "grad_norm": 0.034912109375, "learning_rate": 0.02042979218311462, "loss": 0.2325, "num_input_tokens_seen": 250944, "step": 800, "train_runtime": 144.6957, "train_tokens_per_second": 1734.288 }, { "epoch": 8.944444444444445, "grad_norm": 0.0235595703125, "learning_rate": 0.02029395847070803, "loss": 0.226, "num_input_tokens_seen": 252512, "step": 805, "train_runtime": 145.4907, "train_tokens_per_second": 1735.588 }, { "epoch": 9.0, "grad_norm": 0.01190185546875, "learning_rate": 0.020157627036284417, "loss": 0.238, "num_input_tokens_seen": 254112, "step": 810, "train_runtime": 146.3281, "train_tokens_per_second": 1736.591 }, { "epoch": 9.0, "eval_loss": 0.23274096846580505, "eval_runtime": 0.8192, "eval_samples_per_second": 48.831, "eval_steps_per_second": 12.208, "num_input_tokens_seen": 254112, "step": 810 }, { "epoch": 9.055555555555555, "grad_norm": 0.02197265625, "learning_rate": 0.02002081069731427, "loss": 0.2334, "num_input_tokens_seen": 255680, "step": 815, "train_runtime": 148.7956, "train_tokens_per_second": 1718.33 }, { "epoch": 9.11111111111111, "grad_norm": 0.0185546875, "learning_rate": 0.01988352231685735, "loss": 0.2236, "num_input_tokens_seen": 257216, "step": 820, "train_runtime": 149.5891, "train_tokens_per_second": 1719.483 }, { "epoch": 9.166666666666666, "grad_norm": 0.049560546875, "learning_rate": 0.019745774802353344, "loss": 0.2579, "num_input_tokens_seen": 258816, "step": 825, "train_runtime": 150.3889, "train_tokens_per_second": 1720.978 }, { "epoch": 9.222222222222221, "grad_norm": 0.00970458984375, "learning_rate": 0.019607581104408342, "loss": 0.2457, "num_input_tokens_seen": 260384, "step": 830, "train_runtime": 151.1849, "train_tokens_per_second": 1722.289 }, { "epoch": 9.277777777777779, "grad_norm": 0.0211181640625, "learning_rate": 0.019468954215577226, "loss": 0.2301, "num_input_tokens_seen": 262048, "step": 835, "train_runtime": 151.9844, "train_tokens_per_second": 1724.177 }, { "epoch": 9.333333333333334, "grad_norm": 0.02734375, "learning_rate": 0.01932990716914222, "loss": 0.244, "num_input_tokens_seen": 263616, "step": 840, "train_runtime": 152.7753, "train_tokens_per_second": 1725.514 }, { "epoch": 9.38888888888889, "grad_norm": 0.0093994140625, "learning_rate": 0.019190453037887464, "loss": 0.2323, "num_input_tokens_seen": 265152, "step": 845, "train_runtime": 153.5949, "train_tokens_per_second": 1726.308 }, { "epoch": 9.444444444444445, "grad_norm": 0.01708984375, "learning_rate": 0.019050604932870013, "loss": 0.2314, "num_input_tokens_seen": 266688, "step": 850, "train_runtime": 154.3846, "train_tokens_per_second": 1727.427 }, { "epoch": 9.5, "grad_norm": 0.0322265625, "learning_rate": 0.01891037600218712, "loss": 0.2338, "num_input_tokens_seen": 268256, "step": 855, "train_runtime": 155.1779, "train_tokens_per_second": 1728.7 }, { "epoch": 9.555555555555555, "grad_norm": 0.022705078125, "learning_rate": 0.018769779429740154, "loss": 0.2379, "num_input_tokens_seen": 269824, "step": 860, "train_runtime": 155.971, "train_tokens_per_second": 1729.963 }, { "epoch": 9.61111111111111, "grad_norm": 0.0140380859375, "learning_rate": 0.018628828433995014, "loss": 0.2388, "num_input_tokens_seen": 271424, "step": 865, "train_runtime": 156.7643, "train_tokens_per_second": 1731.414 }, { "epoch": 9.666666666666666, "grad_norm": 0.007415771484375, "learning_rate": 0.018487536266739445, "loss": 0.2359, "num_input_tokens_seen": 272960, "step": 870, "train_runtime": 157.5546, "train_tokens_per_second": 1732.478 }, { "epoch": 9.722222222222221, "grad_norm": 0.01263427734375, "learning_rate": 0.01834591621183709, "loss": 0.229, "num_input_tokens_seen": 274528, "step": 875, "train_runtime": 158.3508, "train_tokens_per_second": 1733.67 }, { "epoch": 9.777777777777779, "grad_norm": 0.00537109375, "learning_rate": 0.018203981583978603, "loss": 0.235, "num_input_tokens_seen": 276128, "step": 880, "train_runtime": 159.1469, "train_tokens_per_second": 1735.051 }, { "epoch": 9.833333333333334, "grad_norm": 0.006561279296875, "learning_rate": 0.018061745727429836, "loss": 0.2284, "num_input_tokens_seen": 277664, "step": 885, "train_runtime": 159.9388, "train_tokens_per_second": 1736.064 }, { "epoch": 9.88888888888889, "grad_norm": 0.0159912109375, "learning_rate": 0.017919222014777265, "loss": 0.2371, "num_input_tokens_seen": 279232, "step": 890, "train_runtime": 160.7334, "train_tokens_per_second": 1737.237 }, { "epoch": 9.944444444444445, "grad_norm": 0.003265380859375, "learning_rate": 0.017776423845670717, "loss": 0.228, "num_input_tokens_seen": 280768, "step": 895, "train_runtime": 161.5242, "train_tokens_per_second": 1738.241 }, { "epoch": 10.0, "grad_norm": 0.01251220703125, "learning_rate": 0.0176333646455636, "loss": 0.2218, "num_input_tokens_seen": 282368, "step": 900, "train_runtime": 162.3604, "train_tokens_per_second": 1739.143 }, { "epoch": 10.0, "eval_loss": 0.23551960289478302, "eval_runtime": 0.817, "eval_samples_per_second": 48.961, "eval_steps_per_second": 12.24, "num_input_tokens_seen": 282368, "step": 900 }, { "epoch": 10.055555555555555, "grad_norm": 0.0185546875, "learning_rate": 0.017490057864450664, "loss": 0.2283, "num_input_tokens_seen": 283936, "step": 905, "train_runtime": 165.2451, "train_tokens_per_second": 1718.272 }, { "epoch": 10.11111111111111, "grad_norm": 0.00946044921875, "learning_rate": 0.017346516975603462, "loss": 0.2199, "num_input_tokens_seen": 285504, "step": 910, "train_runtime": 166.0581, "train_tokens_per_second": 1719.302 }, { "epoch": 10.166666666666666, "grad_norm": 0.01251220703125, "learning_rate": 0.017202755474303683, "loss": 0.2405, "num_input_tokens_seen": 287072, "step": 915, "train_runtime": 166.8565, "train_tokens_per_second": 1720.472 }, { "epoch": 10.222222222222221, "grad_norm": 0.011474609375, "learning_rate": 0.017058786876574313, "loss": 0.2363, "num_input_tokens_seen": 288576, "step": 920, "train_runtime": 167.6452, "train_tokens_per_second": 1721.35 }, { "epoch": 10.277777777777779, "grad_norm": 0.00555419921875, "learning_rate": 0.016914624717908923, "loss": 0.2355, "num_input_tokens_seen": 290144, "step": 925, "train_runtime": 168.4381, "train_tokens_per_second": 1722.555 }, { "epoch": 10.333333333333334, "grad_norm": 0.0235595703125, "learning_rate": 0.016770282551999093, "loss": 0.2278, "num_input_tokens_seen": 291744, "step": 930, "train_runtime": 169.2344, "train_tokens_per_second": 1723.905 }, { "epoch": 10.38888888888889, "grad_norm": 0.01385498046875, "learning_rate": 0.01662577394946016, "loss": 0.2358, "num_input_tokens_seen": 293344, "step": 935, "train_runtime": 170.0296, "train_tokens_per_second": 1725.253 }, { "epoch": 10.444444444444445, "grad_norm": 0.00482177734375, "learning_rate": 0.016481112496555317, "loss": 0.2315, "num_input_tokens_seen": 294912, "step": 940, "train_runtime": 170.8203, "train_tokens_per_second": 1726.446 }, { "epoch": 10.5, "grad_norm": 0.01275634765625, "learning_rate": 0.016336311793918295, "loss": 0.2304, "num_input_tokens_seen": 296480, "step": 945, "train_runtime": 171.6139, "train_tokens_per_second": 1727.599 }, { "epoch": 10.555555555555555, "grad_norm": 0.00665283203125, "learning_rate": 0.016191385455274654, "loss": 0.2347, "num_input_tokens_seen": 298048, "step": 950, "train_runtime": 172.4051, "train_tokens_per_second": 1728.766 }, { "epoch": 10.61111111111111, "grad_norm": 0.0230712890625, "learning_rate": 0.016046347106161877, "loss": 0.2326, "num_input_tokens_seen": 299648, "step": 955, "train_runtime": 173.1986, "train_tokens_per_second": 1730.083 }, { "epoch": 10.666666666666666, "grad_norm": 0.011474609375, "learning_rate": 0.01590121038264835, "loss": 0.2264, "num_input_tokens_seen": 301216, "step": 960, "train_runtime": 173.995, "train_tokens_per_second": 1731.176 }, { "epoch": 10.722222222222221, "grad_norm": 0.022705078125, "learning_rate": 0.015755988930051302, "loss": 0.2329, "num_input_tokens_seen": 302784, "step": 965, "train_runtime": 174.7881, "train_tokens_per_second": 1732.292 }, { "epoch": 10.777777777777779, "grad_norm": 0.01312255859375, "learning_rate": 0.01561069640165394, "loss": 0.2371, "num_input_tokens_seen": 304320, "step": 970, "train_runtime": 175.5852, "train_tokens_per_second": 1733.175 }, { "epoch": 10.833333333333334, "grad_norm": 0.01214599609375, "learning_rate": 0.015465346457421807, "loss": 0.239, "num_input_tokens_seen": 305856, "step": 975, "train_runtime": 176.3792, "train_tokens_per_second": 1734.082 }, { "epoch": 10.88888888888889, "grad_norm": 0.0140380859375, "learning_rate": 0.015319952762718515, "loss": 0.2338, "num_input_tokens_seen": 307424, "step": 980, "train_runtime": 177.1761, "train_tokens_per_second": 1735.132 }, { "epoch": 10.944444444444445, "grad_norm": 0.01434326171875, "learning_rate": 0.015174528987020958, "loss": 0.234, "num_input_tokens_seen": 308992, "step": 985, "train_runtime": 177.9704, "train_tokens_per_second": 1736.198 }, { "epoch": 11.0, "grad_norm": 0.01129150390625, "learning_rate": 0.015029088802634146, "loss": 0.2349, "num_input_tokens_seen": 310560, "step": 990, "train_runtime": 178.804, "train_tokens_per_second": 1736.874 }, { "epoch": 11.0, "eval_loss": 0.23004861176013947, "eval_runtime": 0.8164, "eval_samples_per_second": 48.995, "eval_steps_per_second": 12.249, "num_input_tokens_seen": 310560, "step": 990 }, { "epoch": 11.055555555555555, "grad_norm": 0.0113525390625, "learning_rate": 0.014883645883405797, "loss": 0.2328, "num_input_tokens_seen": 312160, "step": 995, "train_runtime": 181.2905, "train_tokens_per_second": 1721.877 }, { "epoch": 11.11111111111111, "grad_norm": 0.01300048828125, "learning_rate": 0.014738213903440746, "loss": 0.2319, "num_input_tokens_seen": 313728, "step": 1000, "train_runtime": 182.1312, "train_tokens_per_second": 1722.538 }, { "epoch": 11.166666666666666, "grad_norm": 0.01287841796875, "learning_rate": 0.014592806535815357, "loss": 0.2386, "num_input_tokens_seen": 315264, "step": 1005, "train_runtime": 182.9305, "train_tokens_per_second": 1723.409 }, { "epoch": 11.222222222222221, "grad_norm": 0.02099609375, "learning_rate": 0.014447437451291999, "loss": 0.2291, "num_input_tokens_seen": 316864, "step": 1010, "train_runtime": 183.7272, "train_tokens_per_second": 1724.644 }, { "epoch": 11.277777777777779, "grad_norm": 0.01251220703125, "learning_rate": 0.014302120317033798, "loss": 0.2201, "num_input_tokens_seen": 318432, "step": 1015, "train_runtime": 184.5231, "train_tokens_per_second": 1725.703 }, { "epoch": 11.333333333333334, "grad_norm": 0.004974365234375, "learning_rate": 0.014156868795319669, "loss": 0.2403, "num_input_tokens_seen": 320032, "step": 1020, "train_runtime": 185.3161, "train_tokens_per_second": 1726.952 }, { "epoch": 11.38888888888889, "grad_norm": 0.01190185546875, "learning_rate": 0.014011696542259821, "loss": 0.2356, "num_input_tokens_seen": 321536, "step": 1025, "train_runtime": 186.1035, "train_tokens_per_second": 1727.727 }, { "epoch": 11.444444444444445, "grad_norm": 0.01007080078125, "learning_rate": 0.013866617206511882, "loss": 0.235, "num_input_tokens_seen": 323040, "step": 1030, "train_runtime": 186.8909, "train_tokens_per_second": 1728.495 }, { "epoch": 11.5, "grad_norm": 0.00927734375, "learning_rate": 0.013721644427997651, "loss": 0.2268, "num_input_tokens_seen": 324608, "step": 1035, "train_runtime": 187.6849, "train_tokens_per_second": 1729.537 }, { "epoch": 11.555555555555555, "grad_norm": 0.0125732421875, "learning_rate": 0.01357679183662076, "loss": 0.2333, "num_input_tokens_seen": 326144, "step": 1040, "train_runtime": 188.4763, "train_tokens_per_second": 1730.425 }, { "epoch": 11.61111111111111, "grad_norm": 0.0048828125, "learning_rate": 0.0134320730509852, "loss": 0.2322, "num_input_tokens_seen": 327712, "step": 1045, "train_runtime": 189.2669, "train_tokens_per_second": 1731.481 }, { "epoch": 11.666666666666666, "grad_norm": 0.0029296875, "learning_rate": 0.01328750167711494, "loss": 0.2322, "num_input_tokens_seen": 329248, "step": 1050, "train_runtime": 190.0636, "train_tokens_per_second": 1732.304 }, { "epoch": 11.722222222222221, "grad_norm": 0.004974365234375, "learning_rate": 0.013143091307174755, "loss": 0.2413, "num_input_tokens_seen": 330816, "step": 1055, "train_runtime": 190.8551, "train_tokens_per_second": 1733.336 }, { "epoch": 11.777777777777779, "grad_norm": 0.0208740234375, "learning_rate": 0.012998855518192309, "loss": 0.2275, "num_input_tokens_seen": 332416, "step": 1060, "train_runtime": 191.6505, "train_tokens_per_second": 1734.491 }, { "epoch": 11.833333333333334, "grad_norm": 0.010009765625, "learning_rate": 0.012854807870781686, "loss": 0.2338, "num_input_tokens_seen": 334016, "step": 1065, "train_runtime": 192.4488, "train_tokens_per_second": 1735.61 }, { "epoch": 11.88888888888889, "grad_norm": 0.01092529296875, "learning_rate": 0.012710961907868478, "loss": 0.2338, "num_input_tokens_seen": 335616, "step": 1070, "train_runtime": 193.2439, "train_tokens_per_second": 1736.748 }, { "epoch": 11.944444444444445, "grad_norm": 0.0027923583984375, "learning_rate": 0.012567331153416489, "loss": 0.2359, "num_input_tokens_seen": 337152, "step": 1075, "train_runtime": 194.0342, "train_tokens_per_second": 1737.59 }, { "epoch": 12.0, "grad_norm": 0.004852294921875, "learning_rate": 0.012423929111156296, "loss": 0.2315, "num_input_tokens_seen": 338784, "step": 1080, "train_runtime": 194.8731, "train_tokens_per_second": 1738.486 }, { "epoch": 12.0, "eval_loss": 0.23689353466033936, "eval_runtime": 0.8185, "eval_samples_per_second": 48.871, "eval_steps_per_second": 12.218, "num_input_tokens_seen": 338784, "step": 1080 }, { "epoch": 12.055555555555555, "grad_norm": 0.01953125, "learning_rate": 0.012280769263315627, "loss": 0.2296, "num_input_tokens_seen": 340288, "step": 1085, "train_runtime": 197.3733, "train_tokens_per_second": 1724.083 }, { "epoch": 12.11111111111111, "grad_norm": 0.01123046875, "learning_rate": 0.012137865069351828, "loss": 0.2306, "num_input_tokens_seen": 341888, "step": 1090, "train_runtime": 198.1719, "train_tokens_per_second": 1725.209 }, { "epoch": 12.166666666666666, "grad_norm": 0.0205078125, "learning_rate": 0.01199522996468644, "loss": 0.2317, "num_input_tokens_seen": 343488, "step": 1095, "train_runtime": 198.9687, "train_tokens_per_second": 1726.342 }, { "epoch": 12.222222222222221, "grad_norm": 0.0130615234375, "learning_rate": 0.01185287735944204, "loss": 0.2309, "num_input_tokens_seen": 344992, "step": 1100, "train_runtime": 199.7613, "train_tokens_per_second": 1727.021 }, { "epoch": 12.277777777777779, "grad_norm": 0.0029296875, "learning_rate": 0.011710820637181448, "loss": 0.2392, "num_input_tokens_seen": 346560, "step": 1105, "train_runtime": 200.5543, "train_tokens_per_second": 1728.011 }, { "epoch": 12.333333333333334, "grad_norm": 0.00457763671875, "learning_rate": 0.011569073153649483, "loss": 0.2339, "num_input_tokens_seen": 348160, "step": 1110, "train_runtime": 201.3491, "train_tokens_per_second": 1729.136 }, { "epoch": 12.38888888888889, "grad_norm": 0.0118408203125, "learning_rate": 0.01142764823551724, "loss": 0.234, "num_input_tokens_seen": 349760, "step": 1115, "train_runtime": 202.1428, "train_tokens_per_second": 1730.262 }, { "epoch": 12.444444444444445, "grad_norm": 0.01214599609375, "learning_rate": 0.011286559179129213, "loss": 0.2319, "num_input_tokens_seen": 351328, "step": 1120, "train_runtime": 202.9386, "train_tokens_per_second": 1731.204 }, { "epoch": 12.5, "grad_norm": 0.020263671875, "learning_rate": 0.01114581924925317, "loss": 0.2318, "num_input_tokens_seen": 352896, "step": 1125, "train_runtime": 203.734, "train_tokens_per_second": 1732.141 }, { "epoch": 12.555555555555555, "grad_norm": 0.01019287109375, "learning_rate": 0.011005441677833067, "loss": 0.2295, "num_input_tokens_seen": 354464, "step": 1130, "train_runtime": 204.5288, "train_tokens_per_second": 1733.076 }, { "epoch": 12.61111111111111, "grad_norm": 0.00994873046875, "learning_rate": 0.010865439662745013, "loss": 0.2339, "num_input_tokens_seen": 356032, "step": 1135, "train_runtime": 205.321, "train_tokens_per_second": 1734.026 }, { "epoch": 12.666666666666666, "grad_norm": 0.01141357421875, "learning_rate": 0.01072582636655643, "loss": 0.2263, "num_input_tokens_seen": 357632, "step": 1140, "train_runtime": 206.1151, "train_tokens_per_second": 1735.108 }, { "epoch": 12.722222222222221, "grad_norm": 0.0113525390625, "learning_rate": 0.010586614915288572, "loss": 0.2327, "num_input_tokens_seen": 359168, "step": 1145, "train_runtime": 206.9071, "train_tokens_per_second": 1735.89 }, { "epoch": 12.777777777777779, "grad_norm": 0.005645751953125, "learning_rate": 0.010447818397182444, "loss": 0.2337, "num_input_tokens_seen": 360736, "step": 1150, "train_runtime": 207.6979, "train_tokens_per_second": 1736.83 }, { "epoch": 12.833333333333334, "grad_norm": 0.0033721923828125, "learning_rate": 0.010309449861468272, "loss": 0.2317, "num_input_tokens_seen": 362304, "step": 1155, "train_runtime": 208.4895, "train_tokens_per_second": 1737.757 }, { "epoch": 12.88888888888889, "grad_norm": 0.004791259765625, "learning_rate": 0.010171522317138689, "loss": 0.2318, "num_input_tokens_seen": 363872, "step": 1160, "train_runtime": 209.2816, "train_tokens_per_second": 1738.671 }, { "epoch": 12.944444444444445, "grad_norm": 0.00982666015625, "learning_rate": 0.01003404873172563, "loss": 0.2339, "num_input_tokens_seen": 365376, "step": 1165, "train_runtime": 210.0732, "train_tokens_per_second": 1739.28 }, { "epoch": 13.0, "grad_norm": 0.0029296875, "learning_rate": 0.009897042030081191, "loss": 0.2297, "num_input_tokens_seen": 366944, "step": 1170, "train_runtime": 210.9074, "train_tokens_per_second": 1739.834 }, { "epoch": 13.0, "eval_loss": 0.2312408983707428, "eval_runtime": 0.8181, "eval_samples_per_second": 48.893, "eval_steps_per_second": 12.223, "num_input_tokens_seen": 366944, "step": 1170 }, { "epoch": 13.055555555555555, "grad_norm": 0.0034332275390625, "learning_rate": 0.009760515093162463, "loss": 0.2329, "num_input_tokens_seen": 368384, "step": 1175, "train_runtime": 213.385, "train_tokens_per_second": 1726.382 }, { "epoch": 13.11111111111111, "grad_norm": 0.01220703125, "learning_rate": 0.009624480756820496, "loss": 0.2307, "num_input_tokens_seen": 369984, "step": 1180, "train_runtime": 214.2014, "train_tokens_per_second": 1727.272 }, { "epoch": 13.166666666666666, "grad_norm": 0.011474609375, "learning_rate": 0.009488951810593525, "loss": 0.2327, "num_input_tokens_seen": 371520, "step": 1185, "train_runtime": 214.9913, "train_tokens_per_second": 1728.07 }, { "epoch": 13.222222222222221, "grad_norm": 0.01251220703125, "learning_rate": 0.009353940996504537, "loss": 0.2391, "num_input_tokens_seen": 373120, "step": 1190, "train_runtime": 215.7896, "train_tokens_per_second": 1729.092 }, { "epoch": 13.277777777777779, "grad_norm": 0.01239013671875, "learning_rate": 0.009219461007863278, "loss": 0.2317, "num_input_tokens_seen": 374688, "step": 1195, "train_runtime": 216.5862, "train_tokens_per_second": 1729.972 }, { "epoch": 13.333333333333334, "grad_norm": 0.0036163330078125, "learning_rate": 0.009085524488072901, "loss": 0.2347, "num_input_tokens_seen": 376288, "step": 1200, "train_runtime": 217.3817, "train_tokens_per_second": 1731.001 }, { "epoch": 13.38888888888889, "grad_norm": 0.00543212890625, "learning_rate": 0.008952144029441248, "loss": 0.2304, "num_input_tokens_seen": 377888, "step": 1205, "train_runtime": 218.1804, "train_tokens_per_second": 1731.998 }, { "epoch": 13.444444444444445, "grad_norm": 0.0064697265625, "learning_rate": 0.008819332171996975, "loss": 0.2325, "num_input_tokens_seen": 379424, "step": 1210, "train_runtime": 218.971, "train_tokens_per_second": 1732.759 }, { "epoch": 13.5, "grad_norm": 0.01080322265625, "learning_rate": 0.008687101402310564, "loss": 0.2336, "num_input_tokens_seen": 380992, "step": 1215, "train_runtime": 219.765, "train_tokens_per_second": 1733.634 }, { "epoch": 13.555555555555555, "grad_norm": 0.003936767578125, "learning_rate": 0.008555464152320372, "loss": 0.2295, "num_input_tokens_seen": 382592, "step": 1220, "train_runtime": 220.5584, "train_tokens_per_second": 1734.652 }, { "epoch": 13.61111111111111, "grad_norm": 0.01177978515625, "learning_rate": 0.008424432798163836, "loss": 0.2284, "num_input_tokens_seen": 384192, "step": 1225, "train_runtime": 221.3532, "train_tokens_per_second": 1735.651 }, { "epoch": 13.666666666666666, "grad_norm": 0.01123046875, "learning_rate": 0.008294019659013892, "loss": 0.2325, "num_input_tokens_seen": 385760, "step": 1230, "train_runtime": 222.1454, "train_tokens_per_second": 1736.521 }, { "epoch": 13.722222222222221, "grad_norm": 0.0120849609375, "learning_rate": 0.008164236995920735, "loss": 0.2358, "num_input_tokens_seen": 387328, "step": 1235, "train_runtime": 222.9361, "train_tokens_per_second": 1737.395 }, { "epoch": 13.777777777777779, "grad_norm": 0.00482177734375, "learning_rate": 0.008035097010659147, "loss": 0.2295, "num_input_tokens_seen": 388896, "step": 1240, "train_runtime": 223.7293, "train_tokens_per_second": 1738.244 }, { "epoch": 13.833333333333334, "grad_norm": 0.0101318359375, "learning_rate": 0.00790661184458125, "loss": 0.2346, "num_input_tokens_seen": 390496, "step": 1245, "train_runtime": 224.5255, "train_tokens_per_second": 1739.206 }, { "epoch": 13.88888888888889, "grad_norm": 0.0030670166015625, "learning_rate": 0.007778793577475039, "loss": 0.2284, "num_input_tokens_seen": 392064, "step": 1250, "train_runtime": 225.3179, "train_tokens_per_second": 1740.048 }, { "epoch": 13.944444444444445, "grad_norm": 0.0093994140625, "learning_rate": 0.007651654226428696, "loss": 0.2265, "num_input_tokens_seen": 393632, "step": 1255, "train_runtime": 226.1132, "train_tokens_per_second": 1740.862 }, { "epoch": 14.0, "grad_norm": 0.0101318359375, "learning_rate": 0.0075252057447007465, "loss": 0.2276, "num_input_tokens_seen": 395104, "step": 1260, "train_runtime": 226.95, "train_tokens_per_second": 1740.93 }, { "epoch": 14.0, "eval_loss": 0.2316901683807373, "eval_runtime": 0.8178, "eval_samples_per_second": 48.909, "eval_steps_per_second": 12.227, "num_input_tokens_seen": 395104, "step": 1260 }, { "epoch": 14.055555555555555, "grad_norm": 0.00628662109375, "learning_rate": 0.007399460020596265, "loss": 0.2307, "num_input_tokens_seen": 396672, "step": 1265, "train_runtime": 229.4732, "train_tokens_per_second": 1728.62 }, { "epoch": 14.11111111111111, "grad_norm": 0.005584716796875, "learning_rate": 0.007274428876349185, "loss": 0.2348, "num_input_tokens_seen": 398304, "step": 1270, "train_runtime": 230.292, "train_tokens_per_second": 1729.561 }, { "epoch": 14.166666666666666, "grad_norm": 0.01055908203125, "learning_rate": 0.007150124067010788, "loss": 0.2317, "num_input_tokens_seen": 399840, "step": 1275, "train_runtime": 231.085, "train_tokens_per_second": 1730.272 }, { "epoch": 14.222222222222221, "grad_norm": 0.0037689208984375, "learning_rate": 0.007026557279344533, "loss": 0.2286, "num_input_tokens_seen": 401440, "step": 1280, "train_runtime": 231.8835, "train_tokens_per_second": 1731.214 }, { "epoch": 14.277777777777779, "grad_norm": 0.0035400390625, "learning_rate": 0.006903740130727311, "loss": 0.2264, "num_input_tokens_seen": 403040, "step": 1285, "train_runtime": 232.6814, "train_tokens_per_second": 1732.154 }, { "epoch": 14.333333333333334, "grad_norm": 0.01104736328125, "learning_rate": 0.0067816841680572015, "loss": 0.2337, "num_input_tokens_seen": 404640, "step": 1290, "train_runtime": 233.4758, "train_tokens_per_second": 1733.113 }, { "epoch": 14.38888888888889, "grad_norm": 0.004364013671875, "learning_rate": 0.006660400866667899, "loss": 0.2246, "num_input_tokens_seen": 406208, "step": 1295, "train_runtime": 234.2675, "train_tokens_per_second": 1733.95 }, { "epoch": 14.444444444444445, "grad_norm": 0.005584716796875, "learning_rate": 0.006539901629249787, "loss": 0.2322, "num_input_tokens_seen": 407776, "step": 1300, "train_runtime": 235.0597, "train_tokens_per_second": 1734.776 }, { "epoch": 14.5, "grad_norm": 0.004791259765625, "learning_rate": 0.006420197784777924, "loss": 0.2268, "num_input_tokens_seen": 409312, "step": 1305, "train_runtime": 235.8489, "train_tokens_per_second": 1735.484 }, { "epoch": 14.555555555555555, "grad_norm": 0.02587890625, "learning_rate": 0.006301300587446937, "loss": 0.2314, "num_input_tokens_seen": 410816, "step": 1310, "train_runtime": 236.6364, "train_tokens_per_second": 1736.064 }, { "epoch": 14.61111111111111, "grad_norm": 0.0244140625, "learning_rate": 0.006183221215612904, "loss": 0.2415, "num_input_tokens_seen": 412416, "step": 1315, "train_runtime": 237.4299, "train_tokens_per_second": 1737.001 }, { "epoch": 14.666666666666666, "grad_norm": 0.0108642578125, "learning_rate": 0.00606597077074242, "loss": 0.2288, "num_input_tokens_seen": 414016, "step": 1320, "train_runtime": 238.223, "train_tokens_per_second": 1737.935 }, { "epoch": 14.722222222222221, "grad_norm": 0.003570556640625, "learning_rate": 0.005949560276368865, "loss": 0.2402, "num_input_tokens_seen": 415552, "step": 1325, "train_runtime": 239.012, "train_tokens_per_second": 1738.624 }, { "epoch": 14.777777777777779, "grad_norm": 0.005096435546875, "learning_rate": 0.005834000677056003, "loss": 0.2289, "num_input_tokens_seen": 417088, "step": 1330, "train_runtime": 239.8035, "train_tokens_per_second": 1739.291 }, { "epoch": 14.833333333333334, "grad_norm": 0.010009765625, "learning_rate": 0.005719302837369021, "loss": 0.2317, "num_input_tokens_seen": 418656, "step": 1335, "train_runtime": 240.5946, "train_tokens_per_second": 1740.089 }, { "epoch": 14.88888888888889, "grad_norm": 0.01055908203125, "learning_rate": 0.00560547754085305, "loss": 0.2265, "num_input_tokens_seen": 420256, "step": 1340, "train_runtime": 241.3879, "train_tokens_per_second": 1740.999 }, { "epoch": 14.944444444444445, "grad_norm": 0.0203857421875, "learning_rate": 0.005492535489019344, "loss": 0.2245, "num_input_tokens_seen": 421792, "step": 1345, "train_runtime": 242.1774, "train_tokens_per_second": 1741.665 }, { "epoch": 15.0, "grad_norm": 0.0223388671875, "learning_rate": 0.005380487300339167, "loss": 0.2402, "num_input_tokens_seen": 423360, "step": 1350, "train_runtime": 243.0095, "train_tokens_per_second": 1742.154 }, { "epoch": 15.0, "eval_loss": 0.23129186034202576, "eval_runtime": 0.8149, "eval_samples_per_second": 49.088, "eval_steps_per_second": 12.272, "num_input_tokens_seen": 423360, "step": 1350 }, { "epoch": 15.055555555555555, "grad_norm": 0.01123046875, "learning_rate": 0.005269343509245449, "loss": 0.2339, "num_input_tokens_seen": 424992, "step": 1355, "train_runtime": 245.5308, "train_tokens_per_second": 1730.911 }, { "epoch": 15.11111111111111, "grad_norm": 0.01226806640625, "learning_rate": 0.005159114565142392, "loss": 0.2307, "num_input_tokens_seen": 426528, "step": 1360, "train_runtime": 246.3315, "train_tokens_per_second": 1731.52 }, { "epoch": 15.166666666666666, "grad_norm": 0.0106201171875, "learning_rate": 0.0050498108314230425, "loss": 0.2318, "num_input_tokens_seen": 428096, "step": 1365, "train_runtime": 247.1262, "train_tokens_per_second": 1732.297 }, { "epoch": 15.222222222222221, "grad_norm": 0.00732421875, "learning_rate": 0.0049414425844949445, "loss": 0.2307, "num_input_tokens_seen": 429600, "step": 1370, "train_runtime": 247.9142, "train_tokens_per_second": 1732.858 }, { "epoch": 15.277777777777779, "grad_norm": 0.00335693359375, "learning_rate": 0.004834020012814016, "loss": 0.2337, "num_input_tokens_seen": 431200, "step": 1375, "train_runtime": 248.7142, "train_tokens_per_second": 1733.717 }, { "epoch": 15.333333333333334, "grad_norm": 0.01177978515625, "learning_rate": 0.004727553215926623, "loss": 0.2305, "num_input_tokens_seen": 432736, "step": 1380, "train_runtime": 249.5378, "train_tokens_per_second": 1734.15 }, { "epoch": 15.38888888888889, "grad_norm": 0.010498046875, "learning_rate": 0.004622052203520061, "loss": 0.2276, "num_input_tokens_seen": 434336, "step": 1385, "train_runtime": 250.3618, "train_tokens_per_second": 1734.834 }, { "epoch": 15.444444444444445, "grad_norm": 0.0115966796875, "learning_rate": 0.004517526894481498, "loss": 0.2348, "num_input_tokens_seen": 435904, "step": 1390, "train_runtime": 251.1536, "train_tokens_per_second": 1735.607 }, { "epoch": 15.5, "grad_norm": 0.0107421875, "learning_rate": 0.004413987115965404, "loss": 0.2286, "num_input_tokens_seen": 437440, "step": 1395, "train_runtime": 251.9431, "train_tokens_per_second": 1736.265 }, { "epoch": 15.555555555555555, "grad_norm": 0.01171875, "learning_rate": 0.004311442602469636, "loss": 0.2347, "num_input_tokens_seen": 438976, "step": 1400, "train_runtime": 252.733, "train_tokens_per_second": 1736.916 }, { "epoch": 15.61111111111111, "grad_norm": 0.005950927734375, "learning_rate": 0.004209902994920235, "loss": 0.2255, "num_input_tokens_seen": 440512, "step": 1405, "train_runtime": 253.5249, "train_tokens_per_second": 1737.549 }, { "epoch": 15.666666666666666, "grad_norm": 0.01190185546875, "learning_rate": 0.004109377839765016, "loss": 0.2295, "num_input_tokens_seen": 442112, "step": 1410, "train_runtime": 254.3181, "train_tokens_per_second": 1738.421 }, { "epoch": 15.722222222222221, "grad_norm": 0.012451171875, "learning_rate": 0.004009876588076046, "loss": 0.2339, "num_input_tokens_seen": 443616, "step": 1415, "train_runtime": 255.1075, "train_tokens_per_second": 1738.938 }, { "epoch": 15.777777777777779, "grad_norm": 0.022705078125, "learning_rate": 0.003911408594661061, "loss": 0.2316, "num_input_tokens_seen": 445184, "step": 1420, "train_runtime": 255.8999, "train_tokens_per_second": 1739.68 }, { "epoch": 15.833333333333334, "grad_norm": 0.012451171875, "learning_rate": 0.0038139831171839726, "loss": 0.2308, "num_input_tokens_seen": 446752, "step": 1425, "train_runtime": 256.6958, "train_tokens_per_second": 1740.394 }, { "epoch": 15.88888888888889, "grad_norm": 0.01324462890625, "learning_rate": 0.0037176093152944947, "loss": 0.2318, "num_input_tokens_seen": 448352, "step": 1430, "train_runtime": 257.491, "train_tokens_per_second": 1741.234 }, { "epoch": 15.944444444444445, "grad_norm": 0.0101318359375, "learning_rate": 0.0036222962497669668, "loss": 0.2276, "num_input_tokens_seen": 449888, "step": 1435, "train_runtime": 258.2827, "train_tokens_per_second": 1741.843 }, { "epoch": 16.0, "grad_norm": 0.00531005859375, "learning_rate": 0.003528052881648488, "loss": 0.2338, "num_input_tokens_seen": 451424, "step": 1440, "train_runtime": 259.1151, "train_tokens_per_second": 1742.176 }, { "epoch": 16.0, "eval_loss": 0.23337697982788086, "eval_runtime": 0.8216, "eval_samples_per_second": 48.684, "eval_steps_per_second": 12.171, "num_input_tokens_seen": 451424, "step": 1440 }, { "epoch": 16.055555555555557, "grad_norm": 0.0111083984375, "learning_rate": 0.0034348880714164414, "loss": 0.2306, "num_input_tokens_seen": 452992, "step": 1445, "train_runtime": 261.5987, "train_tokens_per_second": 1731.629 }, { "epoch": 16.11111111111111, "grad_norm": 0.00958251953125, "learning_rate": 0.0033428105781454364, "loss": 0.2266, "num_input_tokens_seen": 454496, "step": 1450, "train_runtime": 262.4288, "train_tokens_per_second": 1731.883 }, { "epoch": 16.166666666666668, "grad_norm": 0.005096435546875, "learning_rate": 0.0032518290586838377, "loss": 0.2359, "num_input_tokens_seen": 456096, "step": 1455, "train_runtime": 263.2247, "train_tokens_per_second": 1732.725 }, { "epoch": 16.22222222222222, "grad_norm": 0.0118408203125, "learning_rate": 0.0031619520668398388, "loss": 0.2308, "num_input_tokens_seen": 457696, "step": 1460, "train_runtime": 264.0196, "train_tokens_per_second": 1733.568 }, { "epoch": 16.27777777777778, "grad_norm": 0.00433349609375, "learning_rate": 0.003073188052577281, "loss": 0.2318, "num_input_tokens_seen": 459232, "step": 1465, "train_runtime": 264.8136, "train_tokens_per_second": 1734.171 }, { "epoch": 16.333333333333332, "grad_norm": 0.0062255859375, "learning_rate": 0.00298554536122122, "loss": 0.2337, "num_input_tokens_seen": 460832, "step": 1470, "train_runtime": 265.6148, "train_tokens_per_second": 1734.964 }, { "epoch": 16.38888888888889, "grad_norm": 0.0040283203125, "learning_rate": 0.0028990322326732957, "loss": 0.2329, "num_input_tokens_seen": 462432, "step": 1475, "train_runtime": 266.4104, "train_tokens_per_second": 1735.788 }, { "epoch": 16.444444444444443, "grad_norm": 0.00653076171875, "learning_rate": 0.0028136568006370643, "loss": 0.2245, "num_input_tokens_seen": 464000, "step": 1480, "train_runtime": 267.2022, "train_tokens_per_second": 1736.513 }, { "epoch": 16.5, "grad_norm": 0.01025390625, "learning_rate": 0.0027294270918532875, "loss": 0.2256, "num_input_tokens_seen": 465536, "step": 1485, "train_runtime": 267.991, "train_tokens_per_second": 1737.133 }, { "epoch": 16.555555555555557, "grad_norm": 0.02001953125, "learning_rate": 0.0026463510253452744, "loss": 0.2255, "num_input_tokens_seen": 467136, "step": 1490, "train_runtime": 268.784, "train_tokens_per_second": 1737.96 }, { "epoch": 16.61111111111111, "grad_norm": 0.002838134765625, "learning_rate": 0.0025644364116743754, "loss": 0.2308, "num_input_tokens_seen": 468672, "step": 1495, "train_runtime": 269.575, "train_tokens_per_second": 1738.559 }, { "epoch": 16.666666666666668, "grad_norm": 0.003387451171875, "learning_rate": 0.002483690952205637, "loss": 0.235, "num_input_tokens_seen": 470272, "step": 1500, "train_runtime": 270.3683, "train_tokens_per_second": 1739.376 }, { "epoch": 16.72222222222222, "grad_norm": 0.0108642578125, "learning_rate": 0.0024041222383837536, "loss": 0.2306, "num_input_tokens_seen": 471872, "step": 1505, "train_runtime": 271.1607, "train_tokens_per_second": 1740.193 }, { "epoch": 16.77777777777778, "grad_norm": 0.011962890625, "learning_rate": 0.002325737751019347, "loss": 0.2276, "num_input_tokens_seen": 473440, "step": 1510, "train_runtime": 271.9576, "train_tokens_per_second": 1740.859 }, { "epoch": 16.833333333333332, "grad_norm": 0.01055908203125, "learning_rate": 0.00224854485958563, "loss": 0.2308, "num_input_tokens_seen": 475008, "step": 1515, "train_runtime": 272.7521, "train_tokens_per_second": 1741.537 }, { "epoch": 16.88888888888889, "grad_norm": 0.0120849609375, "learning_rate": 0.0021725508215255634, "loss": 0.234, "num_input_tokens_seen": 476608, "step": 1520, "train_runtime": 273.5491, "train_tokens_per_second": 1742.312 }, { "epoch": 16.944444444444443, "grad_norm": 0.01226806640625, "learning_rate": 0.0020977627815695213, "loss": 0.2286, "num_input_tokens_seen": 478176, "step": 1525, "train_runtime": 274.3439, "train_tokens_per_second": 1742.98 }, { "epoch": 17.0, "grad_norm": 0.0130615234375, "learning_rate": 0.0020241877710635747, "loss": 0.2339, "num_input_tokens_seen": 479744, "step": 1530, "train_runtime": 275.1758, "train_tokens_per_second": 1743.409 }, { "epoch": 17.0, "eval_loss": 0.2328735888004303, "eval_runtime": 0.8173, "eval_samples_per_second": 48.943, "eval_steps_per_second": 12.236, "num_input_tokens_seen": 479744, "step": 1530 }, { "epoch": 17.055555555555557, "grad_norm": 0.0213623046875, "learning_rate": 0.0019518327073084285, "loss": 0.2328, "num_input_tokens_seen": 481344, "step": 1535, "train_runtime": 277.7336, "train_tokens_per_second": 1733.114 }, { "epoch": 17.11111111111111, "grad_norm": 0.01092529296875, "learning_rate": 0.0018807043929090638, "loss": 0.2328, "num_input_tokens_seen": 482944, "step": 1540, "train_runtime": 278.5559, "train_tokens_per_second": 1733.742 }, { "epoch": 17.166666666666668, "grad_norm": 0.01220703125, "learning_rate": 0.0018108095151351837, "loss": 0.2275, "num_input_tokens_seen": 484480, "step": 1545, "train_runtime": 279.3468, "train_tokens_per_second": 1734.332 }, { "epoch": 17.22222222222222, "grad_norm": 0.01348876953125, "learning_rate": 0.001742154645292508, "loss": 0.2381, "num_input_tokens_seen": 486016, "step": 1550, "train_runtime": 280.1392, "train_tokens_per_second": 1734.909 }, { "epoch": 17.27777777777778, "grad_norm": 0.0045166015625, "learning_rate": 0.0016747462381049415, "loss": 0.2307, "num_input_tokens_seen": 487584, "step": 1555, "train_runtime": 280.9345, "train_tokens_per_second": 1735.579 }, { "epoch": 17.333333333333332, "grad_norm": 0.006256103515625, "learning_rate": 0.0016085906311077212, "loss": 0.2339, "num_input_tokens_seen": 489088, "step": 1560, "train_runtime": 281.7263, "train_tokens_per_second": 1736.04 }, { "epoch": 17.38888888888889, "grad_norm": 0.01171875, "learning_rate": 0.0015436940440516017, "loss": 0.2306, "num_input_tokens_seen": 490688, "step": 1565, "train_runtime": 282.5227, "train_tokens_per_second": 1736.809 }, { "epoch": 17.444444444444443, "grad_norm": 0.022216796875, "learning_rate": 0.0014800625783180658, "loss": 0.237, "num_input_tokens_seen": 492288, "step": 1570, "train_runtime": 283.316, "train_tokens_per_second": 1737.593 }, { "epoch": 17.5, "grad_norm": 0.011474609375, "learning_rate": 0.0014177022163457135, "loss": 0.2308, "num_input_tokens_seen": 493824, "step": 1575, "train_runtime": 284.1065, "train_tokens_per_second": 1738.165 }, { "epoch": 17.555555555555557, "grad_norm": 0.01409912109375, "learning_rate": 0.0013566188210677903, "loss": 0.2338, "num_input_tokens_seen": 495456, "step": 1580, "train_runtime": 284.9046, "train_tokens_per_second": 1739.024 }, { "epoch": 17.61111111111111, "grad_norm": 0.004791259765625, "learning_rate": 0.0012968181353609854, "loss": 0.2307, "num_input_tokens_seen": 497024, "step": 1585, "train_runtime": 285.6964, "train_tokens_per_second": 1739.693 }, { "epoch": 17.666666666666668, "grad_norm": 0.01092529296875, "learning_rate": 0.0012383057815055082, "loss": 0.2266, "num_input_tokens_seen": 498592, "step": 1590, "train_runtime": 286.4968, "train_tokens_per_second": 1740.306 }, { "epoch": 17.72222222222222, "grad_norm": 0.01214599609375, "learning_rate": 0.001181087260656487, "loss": 0.2308, "num_input_tokens_seen": 500128, "step": 1595, "train_runtime": 287.2884, "train_tokens_per_second": 1740.857 }, { "epoch": 17.77777777777778, "grad_norm": 0.0223388671875, "learning_rate": 0.0011251679523267587, "loss": 0.2297, "num_input_tokens_seen": 501696, "step": 1600, "train_runtime": 288.0865, "train_tokens_per_second": 1741.477 }, { "epoch": 17.833333333333332, "grad_norm": 0.003814697265625, "learning_rate": 0.0010705531138811369, "loss": 0.2327, "num_input_tokens_seen": 503232, "step": 1605, "train_runtime": 288.877, "train_tokens_per_second": 1742.029 }, { "epoch": 17.88888888888889, "grad_norm": 0.01214599609375, "learning_rate": 0.0010172478800420954, "loss": 0.2296, "num_input_tokens_seen": 504736, "step": 1610, "train_runtime": 289.6642, "train_tokens_per_second": 1742.487 }, { "epoch": 17.944444444444443, "grad_norm": 0.021484375, "learning_rate": 0.0009652572624070293, "loss": 0.2256, "num_input_tokens_seen": 506304, "step": 1615, "train_runtime": 290.4568, "train_tokens_per_second": 1743.13 }, { "epoch": 18.0, "grad_norm": 0.00579833984375, "learning_rate": 0.0009145861489770912, "loss": 0.2307, "num_input_tokens_seen": 507872, "step": 1620, "train_runtime": 291.2951, "train_tokens_per_second": 1743.497 }, { "epoch": 18.0, "eval_loss": 0.23335394263267517, "eval_runtime": 0.8176, "eval_samples_per_second": 48.921, "eval_steps_per_second": 12.23, "num_input_tokens_seen": 507872, "step": 1620 }, { "epoch": 18.055555555555557, "grad_norm": 0.00408935546875, "learning_rate": 0.0008652393036976157, "loss": 0.2286, "num_input_tokens_seen": 509408, "step": 1625, "train_runtime": 293.8353, "train_tokens_per_second": 1733.651 }, { "epoch": 18.11111111111111, "grad_norm": 0.020751953125, "learning_rate": 0.0008172213660102473, "loss": 0.2267, "num_input_tokens_seen": 510912, "step": 1630, "train_runtime": 294.6455, "train_tokens_per_second": 1733.989 }, { "epoch": 18.166666666666668, "grad_norm": 0.0113525390625, "learning_rate": 0.0007705368504167398, "loss": 0.2329, "num_input_tokens_seen": 512384, "step": 1635, "train_runtime": 295.4336, "train_tokens_per_second": 1734.346 }, { "epoch": 18.22222222222222, "grad_norm": 0.01153564453125, "learning_rate": 0.0007251901460545118, "loss": 0.2307, "num_input_tokens_seen": 513952, "step": 1640, "train_runtime": 296.2304, "train_tokens_per_second": 1734.974 }, { "epoch": 18.27777777777778, "grad_norm": 0.003753662109375, "learning_rate": 0.0006811855162840213, "loss": 0.238, "num_input_tokens_seen": 515520, "step": 1645, "train_runtime": 297.0246, "train_tokens_per_second": 1735.614 }, { "epoch": 18.333333333333332, "grad_norm": 0.003143310546875, "learning_rate": 0.0006385270982879065, "loss": 0.236, "num_input_tokens_seen": 517120, "step": 1650, "train_runtime": 297.8218, "train_tokens_per_second": 1736.34 }, { "epoch": 18.38888888888889, "grad_norm": 0.0034332275390625, "learning_rate": 0.0005972189026820351, "loss": 0.2276, "num_input_tokens_seen": 518688, "step": 1655, "train_runtime": 298.6166, "train_tokens_per_second": 1736.97 }, { "epoch": 18.444444444444443, "grad_norm": 0.01287841796875, "learning_rate": 0.0005572648131384361, "loss": 0.2358, "num_input_tokens_seen": 520224, "step": 1660, "train_runtime": 299.4059, "train_tokens_per_second": 1737.521 }, { "epoch": 18.5, "grad_norm": 0.01214599609375, "learning_rate": 0.0005186685860201717, "loss": 0.2255, "num_input_tokens_seen": 521824, "step": 1665, "train_runtime": 300.2061, "train_tokens_per_second": 1738.219 }, { "epoch": 18.555555555555557, "grad_norm": 0.00579833984375, "learning_rate": 0.0004814338500281634, "loss": 0.2297, "num_input_tokens_seen": 523424, "step": 1670, "train_runtime": 301.0084, "train_tokens_per_second": 1738.902 }, { "epoch": 18.61111111111111, "grad_norm": 0.004425048828125, "learning_rate": 0.0004455641058600529, "loss": 0.2307, "num_input_tokens_seen": 524960, "step": 1675, "train_runtime": 301.8068, "train_tokens_per_second": 1739.391 }, { "epoch": 18.666666666666668, "grad_norm": 0.020751953125, "learning_rate": 0.00041106272588105564, "loss": 0.2255, "num_input_tokens_seen": 526496, "step": 1680, "train_runtime": 302.6028, "train_tokens_per_second": 1739.891 }, { "epoch": 18.72222222222222, "grad_norm": 0.01141357421875, "learning_rate": 0.0003779329538069159, "loss": 0.2317, "num_input_tokens_seen": 528064, "step": 1685, "train_runtime": 303.3978, "train_tokens_per_second": 1740.5 }, { "epoch": 18.77777777777778, "grad_norm": 0.00982666015625, "learning_rate": 0.00034617790439893603, "loss": 0.2255, "num_input_tokens_seen": 529632, "step": 1690, "train_runtime": 304.1914, "train_tokens_per_second": 1741.114 }, { "epoch": 18.833333333333332, "grad_norm": 0.011474609375, "learning_rate": 0.00031580056317113525, "loss": 0.2327, "num_input_tokens_seen": 531232, "step": 1695, "train_runtime": 304.9886, "train_tokens_per_second": 1741.809 }, { "epoch": 18.88888888888889, "grad_norm": 0.0033721923828125, "learning_rate": 0.00028680378610956793, "loss": 0.2338, "num_input_tokens_seen": 532800, "step": 1700, "train_runtime": 305.7795, "train_tokens_per_second": 1742.432 }, { "epoch": 18.944444444444443, "grad_norm": 0.00628662109375, "learning_rate": 0.00025919029940380146, "loss": 0.2245, "num_input_tokens_seen": 534400, "step": 1705, "train_runtime": 306.5763, "train_tokens_per_second": 1743.123 }, { "epoch": 19.0, "grad_norm": 0.005096435546875, "learning_rate": 0.0002329626991906164, "loss": 0.2307, "num_input_tokens_seen": 535968, "step": 1710, "train_runtime": 307.4089, "train_tokens_per_second": 1743.502 }, { "epoch": 19.0, "eval_loss": 0.23493099212646484, "eval_runtime": 0.8178, "eval_samples_per_second": 48.914, "eval_steps_per_second": 12.229, "num_input_tokens_seen": 535968, "step": 1710 }, { "epoch": 19.055555555555557, "grad_norm": 0.0224609375, "learning_rate": 0.00020812345130992503, "loss": 0.2327, "num_input_tokens_seen": 537536, "step": 1715, "train_runtime": 309.9741, "train_tokens_per_second": 1734.132 }, { "epoch": 19.11111111111111, "grad_norm": 0.006378173828125, "learning_rate": 0.0001846748910729351, "loss": 0.2297, "num_input_tokens_seen": 539072, "step": 1720, "train_runtime": 310.7692, "train_tokens_per_second": 1734.638 }, { "epoch": 19.166666666666668, "grad_norm": 0.0224609375, "learning_rate": 0.0001626192230425938, "loss": 0.2286, "num_input_tokens_seen": 540608, "step": 1725, "train_runtime": 311.5604, "train_tokens_per_second": 1735.163 }, { "epoch": 19.22222222222222, "grad_norm": 0.00372314453125, "learning_rate": 0.00014195852082632686, "loss": 0.2339, "num_input_tokens_seen": 542208, "step": 1730, "train_runtime": 312.3593, "train_tokens_per_second": 1735.847 }, { "epoch": 19.27777777777778, "grad_norm": 0.013671875, "learning_rate": 0.00012269472688107463, "loss": 0.2328, "num_input_tokens_seen": 543776, "step": 1735, "train_runtime": 313.1529, "train_tokens_per_second": 1736.455 }, { "epoch": 19.333333333333332, "grad_norm": 0.0050048828125, "learning_rate": 0.00010482965233067298, "loss": 0.2287, "num_input_tokens_seen": 545280, "step": 1740, "train_runtime": 313.9486, "train_tokens_per_second": 1736.845 }, { "epoch": 19.38888888888889, "grad_norm": 0.01202392578125, "learning_rate": 8.836497679557964e-05, "loss": 0.2422, "num_input_tokens_seen": 546848, "step": 1745, "train_runtime": 314.7443, "train_tokens_per_second": 1737.436 }, { "epoch": 19.444444444444443, "grad_norm": 0.00311279296875, "learning_rate": 7.330224823495379e-05, "loss": 0.2369, "num_input_tokens_seen": 548416, "step": 1750, "train_runtime": 315.5381, "train_tokens_per_second": 1738.034 }, { "epoch": 19.5, "grad_norm": 0.01019287109375, "learning_rate": 5.96428828011325e-05, "loss": 0.2306, "num_input_tokens_seen": 550016, "step": 1755, "train_runtime": 316.3398, "train_tokens_per_second": 1738.687 }, { "epoch": 19.555555555555557, "grad_norm": 0.0026092529296875, "learning_rate": 4.738816470647389e-05, "loss": 0.236, "num_input_tokens_seen": 551584, "step": 1760, "train_runtime": 317.1375, "train_tokens_per_second": 1739.258 }, { "epoch": 19.61111111111111, "grad_norm": 0.01263427734375, "learning_rate": 3.653924610263703e-05, "loss": 0.2297, "num_input_tokens_seen": 553152, "step": 1765, "train_runtime": 317.9345, "train_tokens_per_second": 1739.83 }, { "epoch": 19.666666666666668, "grad_norm": 0.0211181640625, "learning_rate": 2.7097146972240305e-05, "loss": 0.2276, "num_input_tokens_seen": 554752, "step": 1770, "train_runtime": 318.7296, "train_tokens_per_second": 1740.51 }, { "epoch": 19.72222222222222, "grad_norm": 0.004180908203125, "learning_rate": 1.9062755032984713e-05, "loss": 0.2235, "num_input_tokens_seen": 556288, "step": 1775, "train_runtime": 319.5234, "train_tokens_per_second": 1740.993 }, { "epoch": 19.77777777777778, "grad_norm": 0.00506591796875, "learning_rate": 1.2436825654180693e-05, "loss": 0.2308, "num_input_tokens_seen": 557888, "step": 1780, "train_runtime": 320.3196, "train_tokens_per_second": 1741.661 }, { "epoch": 19.833333333333332, "grad_norm": 0.0048828125, "learning_rate": 7.219981785733242e-06, "loss": 0.2307, "num_input_tokens_seen": 559424, "step": 1785, "train_runtime": 321.1115, "train_tokens_per_second": 1742.149 }, { "epoch": 19.88888888888889, "grad_norm": 0.007659912109375, "learning_rate": 3.4127138995787565e-06, "loss": 0.2297, "num_input_tokens_seen": 560960, "step": 1790, "train_runtime": 321.902, "train_tokens_per_second": 1742.642 }, { "epoch": 19.944444444444443, "grad_norm": 0.01220703125, "learning_rate": 1.0153799435669298e-06, "loss": 0.2234, "num_input_tokens_seen": 562592, "step": 1795, "train_runtime": 322.6996, "train_tokens_per_second": 1743.393 }, { "epoch": 20.0, "grad_norm": 0.00958251953125, "learning_rate": 2.820530780767161e-08, "loss": 0.2288, "num_input_tokens_seen": 564096, "step": 1800, "train_runtime": 323.5286, "train_tokens_per_second": 1743.574 }, { "epoch": 20.0, "eval_loss": 0.2323770523071289, "eval_runtime": 0.8134, "eval_samples_per_second": 49.178, "eval_steps_per_second": 12.295, "num_input_tokens_seen": 564096, "step": 1800 }, { "epoch": 20.0, "num_input_tokens_seen": 564096, "step": 1800, "total_flos": 2.540098792665907e+16, "train_loss": 0.2664620706770155, "train_runtime": 325.1935, "train_samples_per_second": 22.141, "train_steps_per_second": 5.535 } ], "logging_steps": 5, "max_steps": 1800, "num_input_tokens_seen": 564096, "num_train_epochs": 20, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.540098792665907e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }