| { | |
| "best_global_step": 180, | |
| "best_metric": 0.22924575209617615, | |
| "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_copa_42_1760623604/checkpoint-180", | |
| "epoch": 20.0, | |
| "eval_steps": 90, | |
| "global_step": 1800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05555555555555555, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 0.0006666666666666666, | |
| "loss": 0.1508, | |
| "num_input_tokens_seen": 1600, | |
| "step": 5, | |
| "train_runtime": 3.1038, | |
| "train_tokens_per_second": 515.503 | |
| }, | |
| { | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 12.25, | |
| "learning_rate": 0.0015, | |
| "loss": 0.1225, | |
| "num_input_tokens_seen": 3200, | |
| "step": 10, | |
| "train_runtime": 3.9613, | |
| "train_tokens_per_second": 807.808 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 11.625, | |
| "learning_rate": 0.002333333333333333, | |
| "loss": 0.2189, | |
| "num_input_tokens_seen": 4768, | |
| "step": 15, | |
| "train_runtime": 4.8299, | |
| "train_tokens_per_second": 987.182 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0031666666666666666, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 6336, | |
| "step": 20, | |
| "train_runtime": 5.6824, | |
| "train_tokens_per_second": 1115.019 | |
| }, | |
| { | |
| "epoch": 0.2777777777777778, | |
| "grad_norm": 100.0, | |
| "learning_rate": 0.004, | |
| "loss": 4.5867, | |
| "num_input_tokens_seen": 7904, | |
| "step": 25, | |
| "train_runtime": 6.501, | |
| "train_tokens_per_second": 1215.811 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 34.25, | |
| "learning_rate": 0.004833333333333334, | |
| "loss": 3.3424, | |
| "num_input_tokens_seen": 9504, | |
| "step": 30, | |
| "train_runtime": 7.296, | |
| "train_tokens_per_second": 1302.633 | |
| }, | |
| { | |
| "epoch": 0.3888888888888889, | |
| "grad_norm": 10.0, | |
| "learning_rate": 0.005666666666666666, | |
| "loss": 1.1922, | |
| "num_input_tokens_seen": 11072, | |
| "step": 35, | |
| "train_runtime": 8.0915, | |
| "train_tokens_per_second": 1368.343 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 31.875, | |
| "learning_rate": 0.0065, | |
| "loss": 0.5121, | |
| "num_input_tokens_seen": 12672, | |
| "step": 40, | |
| "train_runtime": 8.8889, | |
| "train_tokens_per_second": 1425.604 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 0.007333333333333333, | |
| "loss": 0.2904, | |
| "num_input_tokens_seen": 14176, | |
| "step": 45, | |
| "train_runtime": 9.678, | |
| "train_tokens_per_second": 1464.762 | |
| }, | |
| { | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 0.008166666666666666, | |
| "loss": 0.3048, | |
| "num_input_tokens_seen": 15776, | |
| "step": 50, | |
| "train_runtime": 10.472, | |
| "train_tokens_per_second": 1506.489 | |
| }, | |
| { | |
| "epoch": 0.6111111111111112, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 0.009, | |
| "loss": 0.3677, | |
| "num_input_tokens_seen": 17312, | |
| "step": 55, | |
| "train_runtime": 11.2639, | |
| "train_tokens_per_second": 1536.94 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.009833333333333333, | |
| "loss": 0.2604, | |
| "num_input_tokens_seen": 18848, | |
| "step": 60, | |
| "train_runtime": 12.0538, | |
| "train_tokens_per_second": 1563.659 | |
| }, | |
| { | |
| "epoch": 0.7222222222222222, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.010666666666666666, | |
| "loss": 0.256, | |
| "num_input_tokens_seen": 20448, | |
| "step": 65, | |
| "train_runtime": 12.8468, | |
| "train_tokens_per_second": 1591.68 | |
| }, | |
| { | |
| "epoch": 0.7777777777777778, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 0.0115, | |
| "loss": 0.5609, | |
| "num_input_tokens_seen": 22016, | |
| "step": 70, | |
| "train_runtime": 13.6383, | |
| "train_tokens_per_second": 1614.278 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 0.012333333333333332, | |
| "loss": 1.0653, | |
| "num_input_tokens_seen": 23616, | |
| "step": 75, | |
| "train_runtime": 14.4335, | |
| "train_tokens_per_second": 1636.199 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.013166666666666667, | |
| "loss": 0.2765, | |
| "num_input_tokens_seen": 25152, | |
| "step": 80, | |
| "train_runtime": 15.2249, | |
| "train_tokens_per_second": 1652.025 | |
| }, | |
| { | |
| "epoch": 0.9444444444444444, | |
| "grad_norm": 22.875, | |
| "learning_rate": 0.014, | |
| "loss": 1.0491, | |
| "num_input_tokens_seen": 26688, | |
| "step": 85, | |
| "train_runtime": 16.0181, | |
| "train_tokens_per_second": 1666.117 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.014833333333333334, | |
| "loss": 0.26, | |
| "num_input_tokens_seen": 28256, | |
| "step": 90, | |
| "train_runtime": 16.9498, | |
| "train_tokens_per_second": 1667.037 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.23620998859405518, | |
| "eval_runtime": 1.2704, | |
| "eval_samples_per_second": 31.486, | |
| "eval_steps_per_second": 7.872, | |
| "num_input_tokens_seen": 28256, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0555555555555556, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 0.015666666666666666, | |
| "loss": 0.29, | |
| "num_input_tokens_seen": 29824, | |
| "step": 95, | |
| "train_runtime": 19.874, | |
| "train_tokens_per_second": 1500.656 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 0.0165, | |
| "loss": 0.3073, | |
| "num_input_tokens_seen": 31360, | |
| "step": 100, | |
| "train_runtime": 20.6816, | |
| "train_tokens_per_second": 1516.326 | |
| }, | |
| { | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 0.017333333333333333, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 32960, | |
| "step": 105, | |
| "train_runtime": 21.4794, | |
| "train_tokens_per_second": 1534.493 | |
| }, | |
| { | |
| "epoch": 1.2222222222222223, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 0.018166666666666664, | |
| "loss": 0.2256, | |
| "num_input_tokens_seen": 34464, | |
| "step": 110, | |
| "train_runtime": 22.2709, | |
| "train_tokens_per_second": 1547.491 | |
| }, | |
| { | |
| "epoch": 1.2777777777777777, | |
| "grad_norm": 0.375, | |
| "learning_rate": 0.019, | |
| "loss": 0.7092, | |
| "num_input_tokens_seen": 36032, | |
| "step": 115, | |
| "train_runtime": 23.0649, | |
| "train_tokens_per_second": 1562.201 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 0.01983333333333333, | |
| "loss": 0.2771, | |
| "num_input_tokens_seen": 37600, | |
| "step": 120, | |
| "train_runtime": 23.8592, | |
| "train_tokens_per_second": 1575.909 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 0.020666666666666667, | |
| "loss": 0.2618, | |
| "num_input_tokens_seen": 39168, | |
| "step": 125, | |
| "train_runtime": 24.6509, | |
| "train_tokens_per_second": 1588.905 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.0215, | |
| "loss": 0.2334, | |
| "num_input_tokens_seen": 40736, | |
| "step": 130, | |
| "train_runtime": 25.4426, | |
| "train_tokens_per_second": 1601.092 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.0279541015625, | |
| "learning_rate": 0.022333333333333334, | |
| "loss": 0.2422, | |
| "num_input_tokens_seen": 42240, | |
| "step": 135, | |
| "train_runtime": 26.2319, | |
| "train_tokens_per_second": 1610.255 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 0.103515625, | |
| "learning_rate": 0.023166666666666665, | |
| "loss": 0.2405, | |
| "num_input_tokens_seen": 43840, | |
| "step": 140, | |
| "train_runtime": 27.0285, | |
| "train_tokens_per_second": 1621.993 | |
| }, | |
| { | |
| "epoch": 1.6111111111111112, | |
| "grad_norm": 0.04541015625, | |
| "learning_rate": 0.024, | |
| "loss": 0.2416, | |
| "num_input_tokens_seen": 45408, | |
| "step": 145, | |
| "train_runtime": 27.823, | |
| "train_tokens_per_second": 1632.028 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.060791015625, | |
| "learning_rate": 0.024833333333333332, | |
| "loss": 0.2476, | |
| "num_input_tokens_seen": 46976, | |
| "step": 150, | |
| "train_runtime": 28.6144, | |
| "train_tokens_per_second": 1641.69 | |
| }, | |
| { | |
| "epoch": 1.7222222222222223, | |
| "grad_norm": 0.08740234375, | |
| "learning_rate": 0.025666666666666664, | |
| "loss": 0.2165, | |
| "num_input_tokens_seen": 48512, | |
| "step": 155, | |
| "train_runtime": 29.4067, | |
| "train_tokens_per_second": 1649.691 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0265, | |
| "loss": 0.2679, | |
| "num_input_tokens_seen": 50112, | |
| "step": 160, | |
| "train_runtime": 30.2016, | |
| "train_tokens_per_second": 1659.249 | |
| }, | |
| { | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 0.0458984375, | |
| "learning_rate": 0.02733333333333333, | |
| "loss": 0.2291, | |
| "num_input_tokens_seen": 51712, | |
| "step": 165, | |
| "train_runtime": 30.9983, | |
| "train_tokens_per_second": 1668.222 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.028166666666666666, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 53280, | |
| "step": 170, | |
| "train_runtime": 31.7939, | |
| "train_tokens_per_second": 1675.793 | |
| }, | |
| { | |
| "epoch": 1.9444444444444444, | |
| "grad_norm": 0.032470703125, | |
| "learning_rate": 0.028999999999999998, | |
| "loss": 0.241, | |
| "num_input_tokens_seen": 54880, | |
| "step": 175, | |
| "train_runtime": 32.5915, | |
| "train_tokens_per_second": 1683.876 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.01507568359375, | |
| "learning_rate": 0.029833333333333333, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 56480, | |
| "step": 180, | |
| "train_runtime": 33.4262, | |
| "train_tokens_per_second": 1689.693 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.22924575209617615, | |
| "eval_runtime": 0.8256, | |
| "eval_samples_per_second": 48.45, | |
| "eval_steps_per_second": 12.112, | |
| "num_input_tokens_seen": 56480, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0555555555555554, | |
| "grad_norm": 0.0224609375, | |
| "learning_rate": 0.02999954871719651, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 58048, | |
| "step": 185, | |
| "train_runtime": 36.0921, | |
| "train_tokens_per_second": 1608.328 | |
| }, | |
| { | |
| "epoch": 2.111111111111111, | |
| "grad_norm": 0.01361083984375, | |
| "learning_rate": 0.029997715427345868, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 59584, | |
| "step": 190, | |
| "train_runtime": 36.8907, | |
| "train_tokens_per_second": 1615.149 | |
| }, | |
| { | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 0.052734375, | |
| "learning_rate": 0.02999447209750064, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 61216, | |
| "step": 195, | |
| "train_runtime": 37.6876, | |
| "train_tokens_per_second": 1624.3 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 0.033203125, | |
| "learning_rate": 0.02998981903258893, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 62784, | |
| "step": 200, | |
| "train_runtime": 38.4845, | |
| "train_tokens_per_second": 1631.408 | |
| }, | |
| { | |
| "epoch": 2.2777777777777777, | |
| "grad_norm": 0.044921875, | |
| "learning_rate": 0.02998375667007787, | |
| "loss": 0.2412, | |
| "num_input_tokens_seen": 64352, | |
| "step": 205, | |
| "train_runtime": 39.2774, | |
| "train_tokens_per_second": 1638.396 | |
| }, | |
| { | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.0220947265625, | |
| "learning_rate": 0.029976285579932503, | |
| "loss": 0.2008, | |
| "num_input_tokens_seen": 65952, | |
| "step": 210, | |
| "train_runtime": 40.0729, | |
| "train_tokens_per_second": 1645.799 | |
| }, | |
| { | |
| "epoch": 2.388888888888889, | |
| "grad_norm": 0.12255859375, | |
| "learning_rate": 0.029967406464562214, | |
| "loss": 0.2465, | |
| "num_input_tokens_seen": 67552, | |
| "step": 215, | |
| "train_runtime": 40.866, | |
| "train_tokens_per_second": 1653.013 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.0245361328125, | |
| "learning_rate": 0.02995712015875466, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 69120, | |
| "step": 220, | |
| "train_runtime": 41.6581, | |
| "train_tokens_per_second": 1659.219 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.04248046875, | |
| "learning_rate": 0.029945427629597305, | |
| "loss": 0.2409, | |
| "num_input_tokens_seen": 70688, | |
| "step": 225, | |
| "train_runtime": 42.453, | |
| "train_tokens_per_second": 1665.087 | |
| }, | |
| { | |
| "epoch": 2.5555555555555554, | |
| "grad_norm": 0.07763671875, | |
| "learning_rate": 0.029932329976386493, | |
| "loss": 0.2373, | |
| "num_input_tokens_seen": 72288, | |
| "step": 230, | |
| "train_runtime": 43.2489, | |
| "train_tokens_per_second": 1671.441 | |
| }, | |
| { | |
| "epoch": 2.611111111111111, | |
| "grad_norm": 0.10400390625, | |
| "learning_rate": 0.0299178284305241, | |
| "loss": 0.2425, | |
| "num_input_tokens_seen": 73856, | |
| "step": 235, | |
| "train_runtime": 44.0446, | |
| "train_tokens_per_second": 1676.845 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.04052734375, | |
| "learning_rate": 0.02990192435540175, | |
| "loss": 0.2331, | |
| "num_input_tokens_seen": 75392, | |
| "step": 240, | |
| "train_runtime": 44.8352, | |
| "train_tokens_per_second": 1681.537 | |
| }, | |
| { | |
| "epoch": 2.7222222222222223, | |
| "grad_norm": 0.02197265625, | |
| "learning_rate": 0.029884619246272646, | |
| "loss": 0.2384, | |
| "num_input_tokens_seen": 76960, | |
| "step": 245, | |
| "train_runtime": 45.6329, | |
| "train_tokens_per_second": 1686.502 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 0.03369140625, | |
| "learning_rate": 0.02986591473011098, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 78496, | |
| "step": 250, | |
| "train_runtime": 46.4329, | |
| "train_tokens_per_second": 1690.526 | |
| }, | |
| { | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 0.0205078125, | |
| "learning_rate": 0.02984581256545898, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 80000, | |
| "step": 255, | |
| "train_runtime": 47.2277, | |
| "train_tokens_per_second": 1693.922 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.030029296875, | |
| "learning_rate": 0.02982431464226157, | |
| "loss": 0.2274, | |
| "num_input_tokens_seen": 81568, | |
| "step": 260, | |
| "train_runtime": 48.0201, | |
| "train_tokens_per_second": 1698.622 | |
| }, | |
| { | |
| "epoch": 2.9444444444444446, | |
| "grad_norm": 0.03125, | |
| "learning_rate": 0.02980142298168869, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 83168, | |
| "step": 265, | |
| "train_runtime": 48.8131, | |
| "train_tokens_per_second": 1703.803 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.02099609375, | |
| "learning_rate": 0.029777139735945243, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 84736, | |
| "step": 270, | |
| "train_runtime": 49.7464, | |
| "train_tokens_per_second": 1703.361 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.23087672889232635, | |
| "eval_runtime": 0.8189, | |
| "eval_samples_per_second": 48.846, | |
| "eval_steps_per_second": 12.212, | |
| "num_input_tokens_seen": 84736, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0555555555555554, | |
| "grad_norm": 0.03759765625, | |
| "learning_rate": 0.029751467188068818, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 86304, | |
| "step": 275, | |
| "train_runtime": 52.2896, | |
| "train_tokens_per_second": 1650.5 | |
| }, | |
| { | |
| "epoch": 3.111111111111111, | |
| "grad_norm": 0.029541015625, | |
| "learning_rate": 0.02972440775171496, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 87904, | |
| "step": 280, | |
| "train_runtime": 53.1215, | |
| "train_tokens_per_second": 1654.773 | |
| }, | |
| { | |
| "epoch": 3.1666666666666665, | |
| "grad_norm": 0.041259765625, | |
| "learning_rate": 0.029695963970930307, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 89408, | |
| "step": 285, | |
| "train_runtime": 53.9107, | |
| "train_tokens_per_second": 1658.447 | |
| }, | |
| { | |
| "epoch": 3.2222222222222223, | |
| "grad_norm": 0.01312255859375, | |
| "learning_rate": 0.029666138519913395, | |
| "loss": 0.2251, | |
| "num_input_tokens_seen": 91008, | |
| "step": 290, | |
| "train_runtime": 54.7037, | |
| "train_tokens_per_second": 1663.654 | |
| }, | |
| { | |
| "epoch": 3.2777777777777777, | |
| "grad_norm": 0.0269775390625, | |
| "learning_rate": 0.029634934202763214, | |
| "loss": 0.2566, | |
| "num_input_tokens_seen": 92512, | |
| "step": 295, | |
| "train_runtime": 55.4911, | |
| "train_tokens_per_second": 1667.149 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.02490234375, | |
| "learning_rate": 0.0296023539532156, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 94080, | |
| "step": 300, | |
| "train_runtime": 56.2846, | |
| "train_tokens_per_second": 1671.505 | |
| }, | |
| { | |
| "epoch": 3.388888888888889, | |
| "grad_norm": 0.042236328125, | |
| "learning_rate": 0.029568400834367403, | |
| "loss": 0.2269, | |
| "num_input_tokens_seen": 95680, | |
| "step": 305, | |
| "train_runtime": 57.0809, | |
| "train_tokens_per_second": 1676.218 | |
| }, | |
| { | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 0.033935546875, | |
| "learning_rate": 0.02953307803838851, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 97248, | |
| "step": 310, | |
| "train_runtime": 57.8741, | |
| "train_tokens_per_second": 1680.338 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 0.02949638888622172, | |
| "loss": 0.2369, | |
| "num_input_tokens_seen": 98784, | |
| "step": 315, | |
| "train_runtime": 58.6661, | |
| "train_tokens_per_second": 1683.835 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 0.01507568359375, | |
| "learning_rate": 0.029458336827270518, | |
| "loss": 0.2209, | |
| "num_input_tokens_seen": 100384, | |
| "step": 320, | |
| "train_runtime": 59.4642, | |
| "train_tokens_per_second": 1688.142 | |
| }, | |
| { | |
| "epoch": 3.611111111111111, | |
| "grad_norm": 0.0086669921875, | |
| "learning_rate": 0.029418925439074782, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 101952, | |
| "step": 325, | |
| "train_runtime": 60.2574, | |
| "train_tokens_per_second": 1691.941 | |
| }, | |
| { | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 0.006805419921875, | |
| "learning_rate": 0.029378158426974426, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 103520, | |
| "step": 330, | |
| "train_runtime": 61.0556, | |
| "train_tokens_per_second": 1695.503 | |
| }, | |
| { | |
| "epoch": 3.7222222222222223, | |
| "grad_norm": 0.003814697265625, | |
| "learning_rate": 0.029336039623761044, | |
| "loss": 0.2406, | |
| "num_input_tokens_seen": 105120, | |
| "step": 335, | |
| "train_runtime": 61.8554, | |
| "train_tokens_per_second": 1699.448 | |
| }, | |
| { | |
| "epoch": 3.7777777777777777, | |
| "grad_norm": 0.003692626953125, | |
| "learning_rate": 0.02929257298931754, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 106720, | |
| "step": 340, | |
| "train_runtime": 62.653, | |
| "train_tokens_per_second": 1703.349 | |
| }, | |
| { | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 0.0189208984375, | |
| "learning_rate": 0.02924776261024586, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 108320, | |
| "step": 345, | |
| "train_runtime": 63.447, | |
| "train_tokens_per_second": 1707.252 | |
| }, | |
| { | |
| "epoch": 3.888888888888889, | |
| "grad_norm": 0.0030975341796875, | |
| "learning_rate": 0.02920161269948277, | |
| "loss": 0.2304, | |
| "num_input_tokens_seen": 109888, | |
| "step": 350, | |
| "train_runtime": 64.242, | |
| "train_tokens_per_second": 1710.531 | |
| }, | |
| { | |
| "epoch": 3.9444444444444446, | |
| "grad_norm": 0.00421142578125, | |
| "learning_rate": 0.029154127595903752, | |
| "loss": 0.2293, | |
| "num_input_tokens_seen": 111424, | |
| "step": 355, | |
| "train_runtime": 65.0354, | |
| "train_tokens_per_second": 1713.281 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.00347900390625, | |
| "learning_rate": 0.029105311763915113, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 113024, | |
| "step": 360, | |
| "train_runtime": 65.871, | |
| "train_tokens_per_second": 1715.839 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.23181450366973877, | |
| "eval_runtime": 0.8261, | |
| "eval_samples_per_second": 48.422, | |
| "eval_steps_per_second": 12.105, | |
| "num_input_tokens_seen": 113024, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.055555555555555, | |
| "grad_norm": 0.004241943359375, | |
| "learning_rate": 0.029055169793034224, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 114624, | |
| "step": 365, | |
| "train_runtime": 68.3415, | |
| "train_tokens_per_second": 1677.223 | |
| }, | |
| { | |
| "epoch": 4.111111111111111, | |
| "grad_norm": 0.00628662109375, | |
| "learning_rate": 0.029003706397458022, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 116224, | |
| "step": 370, | |
| "train_runtime": 69.1577, | |
| "train_tokens_per_second": 1680.564 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.00592041015625, | |
| "learning_rate": 0.028950926415619846, | |
| "loss": 0.2471, | |
| "num_input_tokens_seen": 117760, | |
| "step": 375, | |
| "train_runtime": 69.9474, | |
| "train_tokens_per_second": 1683.55 | |
| }, | |
| { | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.028896834809734474, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 119360, | |
| "step": 380, | |
| "train_runtime": 70.7435, | |
| "train_tokens_per_second": 1687.223 | |
| }, | |
| { | |
| "epoch": 4.277777777777778, | |
| "grad_norm": 0.0302734375, | |
| "learning_rate": 0.028841436665331635, | |
| "loss": 0.2254, | |
| "num_input_tokens_seen": 120960, | |
| "step": 385, | |
| "train_runtime": 71.5406, | |
| "train_tokens_per_second": 1690.787 | |
| }, | |
| { | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 0.003662109375, | |
| "learning_rate": 0.02878473719077787, | |
| "loss": 0.2393, | |
| "num_input_tokens_seen": 122528, | |
| "step": 390, | |
| "train_runtime": 72.3313, | |
| "train_tokens_per_second": 1693.983 | |
| }, | |
| { | |
| "epoch": 4.388888888888889, | |
| "grad_norm": 0.0174560546875, | |
| "learning_rate": 0.028726741716786866, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 124096, | |
| "step": 395, | |
| "train_runtime": 73.1276, | |
| "train_tokens_per_second": 1696.979 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 0.0186767578125, | |
| "learning_rate": 0.02866745569591825, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 125696, | |
| "step": 400, | |
| "train_runtime": 73.9263, | |
| "train_tokens_per_second": 1700.287 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.017578125, | |
| "learning_rate": 0.028606884702065006, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 127264, | |
| "step": 405, | |
| "train_runtime": 74.7224, | |
| "train_tokens_per_second": 1703.157 | |
| }, | |
| { | |
| "epoch": 4.555555555555555, | |
| "grad_norm": 0.005462646484375, | |
| "learning_rate": 0.028545034429929377, | |
| "loss": 0.2264, | |
| "num_input_tokens_seen": 128832, | |
| "step": 410, | |
| "train_runtime": 75.5264, | |
| "train_tokens_per_second": 1705.788 | |
| }, | |
| { | |
| "epoch": 4.611111111111111, | |
| "grad_norm": 0.0185546875, | |
| "learning_rate": 0.028481910694487505, | |
| "loss": 0.2396, | |
| "num_input_tokens_seen": 130464, | |
| "step": 415, | |
| "train_runtime": 76.325, | |
| "train_tokens_per_second": 1709.321 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 0.00421142578125, | |
| "learning_rate": 0.02841751943044271, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 132032, | |
| "step": 420, | |
| "train_runtime": 77.126, | |
| "train_tokens_per_second": 1711.901 | |
| }, | |
| { | |
| "epoch": 4.722222222222222, | |
| "grad_norm": 0.01361083984375, | |
| "learning_rate": 0.028351866691667543, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 133632, | |
| "step": 425, | |
| "train_runtime": 77.9247, | |
| "train_tokens_per_second": 1714.886 | |
| }, | |
| { | |
| "epoch": 4.777777777777778, | |
| "grad_norm": 0.01361083984375, | |
| "learning_rate": 0.02828495865063459, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 135232, | |
| "step": 430, | |
| "train_runtime": 78.7259, | |
| "train_tokens_per_second": 1717.757 | |
| }, | |
| { | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 0.0042724609375, | |
| "learning_rate": 0.028216801597836176, | |
| "loss": 0.2216, | |
| "num_input_tokens_seen": 136768, | |
| "step": 435, | |
| "train_runtime": 79.5217, | |
| "train_tokens_per_second": 1719.883 | |
| }, | |
| { | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 0.005462646484375, | |
| "learning_rate": 0.028147401941192952, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 138368, | |
| "step": 440, | |
| "train_runtime": 80.3232, | |
| "train_tokens_per_second": 1722.64 | |
| }, | |
| { | |
| "epoch": 4.944444444444445, | |
| "grad_norm": 0.02197265625, | |
| "learning_rate": 0.028076766205451433, | |
| "loss": 0.2443, | |
| "num_input_tokens_seen": 139904, | |
| "step": 445, | |
| "train_runtime": 81.1198, | |
| "train_tokens_per_second": 1724.659 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.005706787109375, | |
| "learning_rate": 0.028004901031570568, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 141440, | |
| "step": 450, | |
| "train_runtime": 81.9601, | |
| "train_tokens_per_second": 1725.718 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.2365764081478119, | |
| "eval_runtime": 0.8277, | |
| "eval_samples_per_second": 48.329, | |
| "eval_steps_per_second": 12.082, | |
| "num_input_tokens_seen": 141440, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.055555555555555, | |
| "grad_norm": 0.0167236328125, | |
| "learning_rate": 0.027931813176097366, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 142976, | |
| "step": 455, | |
| "train_runtime": 84.4881, | |
| "train_tokens_per_second": 1692.262 | |
| }, | |
| { | |
| "epoch": 5.111111111111111, | |
| "grad_norm": 0.00494384765625, | |
| "learning_rate": 0.027857509510531685, | |
| "loss": 0.2293, | |
| "num_input_tokens_seen": 144576, | |
| "step": 460, | |
| "train_runtime": 85.2893, | |
| "train_tokens_per_second": 1695.125 | |
| }, | |
| { | |
| "epoch": 5.166666666666667, | |
| "grad_norm": 0.0245361328125, | |
| "learning_rate": 0.02778199702068017, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 146144, | |
| "step": 465, | |
| "train_runtime": 86.0895, | |
| "train_tokens_per_second": 1697.581 | |
| }, | |
| { | |
| "epoch": 5.222222222222222, | |
| "grad_norm": 0.012451171875, | |
| "learning_rate": 0.02770528280599949, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 147712, | |
| "step": 470, | |
| "train_runtime": 86.8889, | |
| "train_tokens_per_second": 1700.009 | |
| }, | |
| { | |
| "epoch": 5.277777777777778, | |
| "grad_norm": 0.006683349609375, | |
| "learning_rate": 0.02762737407892886, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 149248, | |
| "step": 475, | |
| "train_runtime": 87.6835, | |
| "train_tokens_per_second": 1702.121 | |
| }, | |
| { | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 0.013916015625, | |
| "learning_rate": 0.02754827816421195, | |
| "loss": 0.2357, | |
| "num_input_tokens_seen": 150816, | |
| "step": 480, | |
| "train_runtime": 88.4782, | |
| "train_tokens_per_second": 1704.555 | |
| }, | |
| { | |
| "epoch": 5.388888888888889, | |
| "grad_norm": 0.01202392578125, | |
| "learning_rate": 0.02746800249820822, | |
| "loss": 0.2212, | |
| "num_input_tokens_seen": 152352, | |
| "step": 485, | |
| "train_runtime": 89.2722, | |
| "train_tokens_per_second": 1706.6 | |
| }, | |
| { | |
| "epoch": 5.444444444444445, | |
| "grad_norm": 0.0166015625, | |
| "learning_rate": 0.027386554628193813, | |
| "loss": 0.2362, | |
| "num_input_tokens_seen": 153888, | |
| "step": 490, | |
| "train_runtime": 90.0625, | |
| "train_tokens_per_second": 1708.681 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 0.00469970703125, | |
| "learning_rate": 0.027303942211651937, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 155488, | |
| "step": 495, | |
| "train_runtime": 90.8589, | |
| "train_tokens_per_second": 1711.313 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.0223388671875, | |
| "learning_rate": 0.02722017301555297, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 157024, | |
| "step": 500, | |
| "train_runtime": 91.6521, | |
| "train_tokens_per_second": 1713.261 | |
| }, | |
| { | |
| "epoch": 5.611111111111111, | |
| "grad_norm": 0.023193359375, | |
| "learning_rate": 0.02713525491562421, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 158528, | |
| "step": 505, | |
| "train_runtime": 92.4397, | |
| "train_tokens_per_second": 1714.934 | |
| }, | |
| { | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 0.0115966796875, | |
| "learning_rate": 0.027049195895609432, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 160064, | |
| "step": 510, | |
| "train_runtime": 93.229, | |
| "train_tokens_per_second": 1716.89 | |
| }, | |
| { | |
| "epoch": 5.722222222222222, | |
| "grad_norm": 0.009765625, | |
| "learning_rate": 0.026962004046518273, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 161664, | |
| "step": 515, | |
| "train_runtime": 94.0222, | |
| "train_tokens_per_second": 1719.424 | |
| }, | |
| { | |
| "epoch": 5.777777777777778, | |
| "grad_norm": 0.02099609375, | |
| "learning_rate": 0.02687368756586555, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 163264, | |
| "step": 520, | |
| "train_runtime": 94.8159, | |
| "train_tokens_per_second": 1721.905 | |
| }, | |
| { | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 0.01226806640625, | |
| "learning_rate": 0.02678425475690055, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 164864, | |
| "step": 525, | |
| "train_runtime": 95.6085, | |
| "train_tokens_per_second": 1724.365 | |
| }, | |
| { | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 0.0126953125, | |
| "learning_rate": 0.02669371402782638, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 166432, | |
| "step": 530, | |
| "train_runtime": 96.4007, | |
| "train_tokens_per_second": 1726.461 | |
| }, | |
| { | |
| "epoch": 5.944444444444445, | |
| "grad_norm": 0.01153564453125, | |
| "learning_rate": 0.026602073891009458, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 168032, | |
| "step": 535, | |
| "train_runtime": 97.1955, | |
| "train_tokens_per_second": 1728.804 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.0030517578125, | |
| "learning_rate": 0.0265093429621792, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 169600, | |
| "step": 540, | |
| "train_runtime": 98.0299, | |
| "train_tokens_per_second": 1730.084 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.2344598025083542, | |
| "eval_runtime": 0.8191, | |
| "eval_samples_per_second": 48.837, | |
| "eval_steps_per_second": 12.209, | |
| "num_input_tokens_seen": 169600, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.055555555555555, | |
| "grad_norm": 0.00494384765625, | |
| "learning_rate": 0.026415529959618007, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 171168, | |
| "step": 545, | |
| "train_runtime": 100.5102, | |
| "train_tokens_per_second": 1702.992 | |
| }, | |
| { | |
| "epoch": 6.111111111111111, | |
| "grad_norm": 0.0037078857421875, | |
| "learning_rate": 0.02632064370334158, | |
| "loss": 0.2311, | |
| "num_input_tokens_seen": 172672, | |
| "step": 550, | |
| "train_runtime": 101.3026, | |
| "train_tokens_per_second": 1704.517 | |
| }, | |
| { | |
| "epoch": 6.166666666666667, | |
| "grad_norm": 0.0038299560546875, | |
| "learning_rate": 0.026224693114269705, | |
| "loss": 0.233, | |
| "num_input_tokens_seen": 174240, | |
| "step": 555, | |
| "train_runtime": 102.1187, | |
| "train_tokens_per_second": 1706.249 | |
| }, | |
| { | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 0.011962890625, | |
| "learning_rate": 0.02612768721338753, | |
| "loss": 0.2279, | |
| "num_input_tokens_seen": 175776, | |
| "step": 560, | |
| "train_runtime": 102.9125, | |
| "train_tokens_per_second": 1708.014 | |
| }, | |
| { | |
| "epoch": 6.277777777777778, | |
| "grad_norm": 0.0228271484375, | |
| "learning_rate": 0.02602963512089743, | |
| "loss": 0.232, | |
| "num_input_tokens_seen": 177376, | |
| "step": 565, | |
| "train_runtime": 103.7126, | |
| "train_tokens_per_second": 1710.265 | |
| }, | |
| { | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 0.0133056640625, | |
| "learning_rate": 0.025930546055361575, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 178912, | |
| "step": 570, | |
| "train_runtime": 104.5028, | |
| "train_tokens_per_second": 1712.031 | |
| }, | |
| { | |
| "epoch": 6.388888888888889, | |
| "grad_norm": 0.01507568359375, | |
| "learning_rate": 0.025830429332835202, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 180480, | |
| "step": 575, | |
| "train_runtime": 105.2944, | |
| "train_tokens_per_second": 1714.052 | |
| }, | |
| { | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 0.015380859375, | |
| "learning_rate": 0.025729294365990772, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 182048, | |
| "step": 580, | |
| "train_runtime": 106.0853, | |
| "train_tokens_per_second": 1716.052 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 0.007720947265625, | |
| "learning_rate": 0.025627150663232998, | |
| "loss": 0.2408, | |
| "num_input_tokens_seen": 183648, | |
| "step": 585, | |
| "train_runtime": 106.8794, | |
| "train_tokens_per_second": 1718.273 | |
| }, | |
| { | |
| "epoch": 6.555555555555555, | |
| "grad_norm": 0.00848388671875, | |
| "learning_rate": 0.025524007827804902, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 185248, | |
| "step": 590, | |
| "train_runtime": 107.6729, | |
| "train_tokens_per_second": 1720.47 | |
| }, | |
| { | |
| "epoch": 6.611111111111111, | |
| "grad_norm": 0.0162353515625, | |
| "learning_rate": 0.025419875556884956, | |
| "loss": 0.2302, | |
| "num_input_tokens_seen": 186720, | |
| "step": 595, | |
| "train_runtime": 108.46, | |
| "train_tokens_per_second": 1721.556 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.0142822265625, | |
| "learning_rate": 0.025314763640675374, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 188288, | |
| "step": 600, | |
| "train_runtime": 109.2516, | |
| "train_tokens_per_second": 1723.435 | |
| }, | |
| { | |
| "epoch": 6.722222222222222, | |
| "grad_norm": 0.006683349609375, | |
| "learning_rate": 0.025208681961481655, | |
| "loss": 0.2359, | |
| "num_input_tokens_seen": 189888, | |
| "step": 605, | |
| "train_runtime": 110.0491, | |
| "train_tokens_per_second": 1725.485 | |
| }, | |
| { | |
| "epoch": 6.777777777777778, | |
| "grad_norm": 0.006134033203125, | |
| "learning_rate": 0.025101640492783503, | |
| "loss": 0.238, | |
| "num_input_tokens_seen": 191424, | |
| "step": 610, | |
| "train_runtime": 110.8418, | |
| "train_tokens_per_second": 1727.002 | |
| }, | |
| { | |
| "epoch": 6.833333333333333, | |
| "grad_norm": 0.02490234375, | |
| "learning_rate": 0.024993649298297137, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 193056, | |
| "step": 615, | |
| "train_runtime": 111.6358, | |
| "train_tokens_per_second": 1729.338 | |
| }, | |
| { | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.02488471853102912, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 194592, | |
| "step": 620, | |
| "train_runtime": 112.4269, | |
| "train_tokens_per_second": 1730.831 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 0.0252685546875, | |
| "learning_rate": 0.024774858432321828, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 196192, | |
| "step": 625, | |
| "train_runtime": 113.2223, | |
| "train_tokens_per_second": 1732.803 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.01275634765625, | |
| "learning_rate": 0.024664079330890574, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 197792, | |
| "step": 630, | |
| "train_runtime": 114.0579, | |
| "train_tokens_per_second": 1734.137 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.23141007125377655, | |
| "eval_runtime": 0.8186, | |
| "eval_samples_per_second": 48.864, | |
| "eval_steps_per_second": 12.216, | |
| "num_input_tokens_seen": 197792, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.055555555555555, | |
| "grad_norm": 0.003936767578125, | |
| "learning_rate": 0.02455239164185254, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 199392, | |
| "step": 635, | |
| "train_runtime": 116.6196, | |
| "train_tokens_per_second": 1709.764 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 0.005767822265625, | |
| "learning_rate": 0.024439805865747562, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 200992, | |
| "step": 640, | |
| "train_runtime": 117.4275, | |
| "train_tokens_per_second": 1711.626 | |
| }, | |
| { | |
| "epoch": 7.166666666666667, | |
| "grad_norm": 0.0135498046875, | |
| "learning_rate": 0.02432633258755093, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 202592, | |
| "step": 645, | |
| "train_runtime": 118.2277, | |
| "train_tokens_per_second": 1713.574 | |
| }, | |
| { | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 0.01141357421875, | |
| "learning_rate": 0.024211982475678205, | |
| "loss": 0.2237, | |
| "num_input_tokens_seen": 204064, | |
| "step": 650, | |
| "train_runtime": 119.016, | |
| "train_tokens_per_second": 1714.593 | |
| }, | |
| { | |
| "epoch": 7.277777777777778, | |
| "grad_norm": 0.0172119140625, | |
| "learning_rate": 0.024096766280982205, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 205664, | |
| "step": 655, | |
| "train_runtime": 119.8138, | |
| "train_tokens_per_second": 1716.53 | |
| }, | |
| { | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 0.0172119140625, | |
| "learning_rate": 0.023980694835742226, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 207264, | |
| "step": 660, | |
| "train_runtime": 120.6083, | |
| "train_tokens_per_second": 1718.488 | |
| }, | |
| { | |
| "epoch": 7.388888888888889, | |
| "grad_norm": 0.0106201171875, | |
| "learning_rate": 0.023863779052645667, | |
| "loss": 0.2301, | |
| "num_input_tokens_seen": 208832, | |
| "step": 665, | |
| "train_runtime": 121.401, | |
| "train_tokens_per_second": 1720.184 | |
| }, | |
| { | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 0.02587890625, | |
| "learning_rate": 0.02374602992376202, | |
| "loss": 0.227, | |
| "num_input_tokens_seen": 210368, | |
| "step": 670, | |
| "train_runtime": 122.192, | |
| "train_tokens_per_second": 1721.618 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.020751953125, | |
| "learning_rate": 0.023627458519509432, | |
| "loss": 0.228, | |
| "num_input_tokens_seen": 211936, | |
| "step": 675, | |
| "train_runtime": 122.9844, | |
| "train_tokens_per_second": 1723.275 | |
| }, | |
| { | |
| "epoch": 7.555555555555555, | |
| "grad_norm": 0.0233154296875, | |
| "learning_rate": 0.023508075987613904, | |
| "loss": 0.2143, | |
| "num_input_tokens_seen": 213536, | |
| "step": 680, | |
| "train_runtime": 123.7781, | |
| "train_tokens_per_second": 1725.151 | |
| }, | |
| { | |
| "epoch": 7.611111111111111, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.023387893552061202, | |
| "loss": 0.2273, | |
| "num_input_tokens_seen": 215136, | |
| "step": 685, | |
| "train_runtime": 124.613, | |
| "train_tokens_per_second": 1726.433 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 0.02294921875, | |
| "learning_rate": 0.023266922512041644, | |
| "loss": 0.2513, | |
| "num_input_tokens_seen": 216736, | |
| "step": 690, | |
| "train_runtime": 125.4907, | |
| "train_tokens_per_second": 1727.107 | |
| }, | |
| { | |
| "epoch": 7.722222222222222, | |
| "grad_norm": 0.029052734375, | |
| "learning_rate": 0.023145174240887748, | |
| "loss": 0.2378, | |
| "num_input_tokens_seen": 218272, | |
| "step": 695, | |
| "train_runtime": 126.2833, | |
| "train_tokens_per_second": 1728.432 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.031005859375, | |
| "learning_rate": 0.023022660185004967, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 219808, | |
| "step": 700, | |
| "train_runtime": 127.0771, | |
| "train_tokens_per_second": 1729.722 | |
| }, | |
| { | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 0.02587890625, | |
| "learning_rate": 0.02289939186279551, | |
| "loss": 0.2331, | |
| "num_input_tokens_seen": 221312, | |
| "step": 705, | |
| "train_runtime": 127.8643, | |
| "train_tokens_per_second": 1730.834 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.023193359375, | |
| "learning_rate": 0.022775380863575456, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 222880, | |
| "step": 710, | |
| "train_runtime": 128.6548, | |
| "train_tokens_per_second": 1732.388 | |
| }, | |
| { | |
| "epoch": 7.944444444444445, | |
| "grad_norm": 0.03369140625, | |
| "learning_rate": 0.02265063884648513, | |
| "loss": 0.2344, | |
| "num_input_tokens_seen": 224416, | |
| "step": 715, | |
| "train_runtime": 129.4473, | |
| "train_tokens_per_second": 1733.648 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.049072265625, | |
| "learning_rate": 0.022525177539392937, | |
| "loss": 0.218, | |
| "num_input_tokens_seen": 225984, | |
| "step": 720, | |
| "train_runtime": 130.2822, | |
| "train_tokens_per_second": 1734.573 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.2308429777622223, | |
| "eval_runtime": 0.8222, | |
| "eval_samples_per_second": 48.649, | |
| "eval_steps_per_second": 12.162, | |
| "num_input_tokens_seen": 225984, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.055555555555555, | |
| "grad_norm": 0.052978515625, | |
| "learning_rate": 0.02239900873779278, | |
| "loss": 0.2506, | |
| "num_input_tokens_seen": 227552, | |
| "step": 725, | |
| "train_runtime": 132.8103, | |
| "train_tokens_per_second": 1713.361 | |
| }, | |
| { | |
| "epoch": 8.11111111111111, | |
| "grad_norm": 0.03564453125, | |
| "learning_rate": 0.022272144303695056, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 229088, | |
| "step": 730, | |
| "train_runtime": 133.6036, | |
| "train_tokens_per_second": 1714.684 | |
| }, | |
| { | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 0.1044921875, | |
| "learning_rate": 0.02214459616451143, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 230656, | |
| "step": 735, | |
| "train_runtime": 134.4027, | |
| "train_tokens_per_second": 1716.156 | |
| }, | |
| { | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 0.047119140625, | |
| "learning_rate": 0.02201637631193346, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 232224, | |
| "step": 740, | |
| "train_runtime": 135.195, | |
| "train_tokens_per_second": 1717.697 | |
| }, | |
| { | |
| "epoch": 8.277777777777779, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 0.021887496800805175, | |
| "loss": 0.2157, | |
| "num_input_tokens_seen": 233792, | |
| "step": 745, | |
| "train_runtime": 135.9862, | |
| "train_tokens_per_second": 1719.233 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.061279296875, | |
| "learning_rate": 0.021757969747989707, | |
| "loss": 0.2441, | |
| "num_input_tokens_seen": 235328, | |
| "step": 750, | |
| "train_runtime": 136.775, | |
| "train_tokens_per_second": 1720.549 | |
| }, | |
| { | |
| "epoch": 8.38888888888889, | |
| "grad_norm": 0.0341796875, | |
| "learning_rate": 0.02162780733123012, | |
| "loss": 0.2362, | |
| "num_input_tokens_seen": 236864, | |
| "step": 755, | |
| "train_runtime": 137.5647, | |
| "train_tokens_per_second": 1721.838 | |
| }, | |
| { | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.021497021788004445, | |
| "loss": 0.9504, | |
| "num_input_tokens_seen": 238368, | |
| "step": 760, | |
| "train_runtime": 138.3522, | |
| "train_tokens_per_second": 1722.907 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 0.021365625414375228, | |
| "loss": 0.2414, | |
| "num_input_tokens_seen": 239936, | |
| "step": 765, | |
| "train_runtime": 139.1437, | |
| "train_tokens_per_second": 1724.375 | |
| }, | |
| { | |
| "epoch": 8.555555555555555, | |
| "grad_norm": 0.10400390625, | |
| "learning_rate": 0.021233630563833435, | |
| "loss": 0.2626, | |
| "num_input_tokens_seen": 241536, | |
| "step": 770, | |
| "train_runtime": 139.9389, | |
| "train_tokens_per_second": 1726.01 | |
| }, | |
| { | |
| "epoch": 8.61111111111111, | |
| "grad_norm": 0.038330078125, | |
| "learning_rate": 0.021101049646137005, | |
| "loss": 0.2398, | |
| "num_input_tokens_seen": 243136, | |
| "step": 775, | |
| "train_runtime": 140.7323, | |
| "train_tokens_per_second": 1727.648 | |
| }, | |
| { | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 0.051025390625, | |
| "learning_rate": 0.02096789512614417, | |
| "loss": 0.2382, | |
| "num_input_tokens_seen": 244704, | |
| "step": 780, | |
| "train_runtime": 141.5236, | |
| "train_tokens_per_second": 1729.069 | |
| }, | |
| { | |
| "epoch": 8.722222222222221, | |
| "grad_norm": 0.048095703125, | |
| "learning_rate": 0.020834179522641504, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 246272, | |
| "step": 785, | |
| "train_runtime": 142.3196, | |
| "train_tokens_per_second": 1730.416 | |
| }, | |
| { | |
| "epoch": 8.777777777777779, | |
| "grad_norm": 0.01165771484375, | |
| "learning_rate": 0.020699915407166987, | |
| "loss": 0.2446, | |
| "num_input_tokens_seen": 247808, | |
| "step": 790, | |
| "train_runtime": 143.1092, | |
| "train_tokens_per_second": 1731.6 | |
| }, | |
| { | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.020565115402828002, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 249376, | |
| "step": 795, | |
| "train_runtime": 143.9049, | |
| "train_tokens_per_second": 1732.922 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 0.034912109375, | |
| "learning_rate": 0.02042979218311462, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 250944, | |
| "step": 800, | |
| "train_runtime": 144.6957, | |
| "train_tokens_per_second": 1734.288 | |
| }, | |
| { | |
| "epoch": 8.944444444444445, | |
| "grad_norm": 0.0235595703125, | |
| "learning_rate": 0.02029395847070803, | |
| "loss": 0.226, | |
| "num_input_tokens_seen": 252512, | |
| "step": 805, | |
| "train_runtime": 145.4907, | |
| "train_tokens_per_second": 1735.588 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.01190185546875, | |
| "learning_rate": 0.020157627036284417, | |
| "loss": 0.238, | |
| "num_input_tokens_seen": 254112, | |
| "step": 810, | |
| "train_runtime": 146.3281, | |
| "train_tokens_per_second": 1736.591 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.23274096846580505, | |
| "eval_runtime": 0.8192, | |
| "eval_samples_per_second": 48.831, | |
| "eval_steps_per_second": 12.208, | |
| "num_input_tokens_seen": 254112, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.055555555555555, | |
| "grad_norm": 0.02197265625, | |
| "learning_rate": 0.02002081069731427, | |
| "loss": 0.2334, | |
| "num_input_tokens_seen": 255680, | |
| "step": 815, | |
| "train_runtime": 148.7956, | |
| "train_tokens_per_second": 1718.33 | |
| }, | |
| { | |
| "epoch": 9.11111111111111, | |
| "grad_norm": 0.0185546875, | |
| "learning_rate": 0.01988352231685735, | |
| "loss": 0.2236, | |
| "num_input_tokens_seen": 257216, | |
| "step": 820, | |
| "train_runtime": 149.5891, | |
| "train_tokens_per_second": 1719.483 | |
| }, | |
| { | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 0.049560546875, | |
| "learning_rate": 0.019745774802353344, | |
| "loss": 0.2579, | |
| "num_input_tokens_seen": 258816, | |
| "step": 825, | |
| "train_runtime": 150.3889, | |
| "train_tokens_per_second": 1720.978 | |
| }, | |
| { | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 0.00970458984375, | |
| "learning_rate": 0.019607581104408342, | |
| "loss": 0.2457, | |
| "num_input_tokens_seen": 260384, | |
| "step": 830, | |
| "train_runtime": 151.1849, | |
| "train_tokens_per_second": 1722.289 | |
| }, | |
| { | |
| "epoch": 9.277777777777779, | |
| "grad_norm": 0.0211181640625, | |
| "learning_rate": 0.019468954215577226, | |
| "loss": 0.2301, | |
| "num_input_tokens_seen": 262048, | |
| "step": 835, | |
| "train_runtime": 151.9844, | |
| "train_tokens_per_second": 1724.177 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 0.02734375, | |
| "learning_rate": 0.01932990716914222, | |
| "loss": 0.244, | |
| "num_input_tokens_seen": 263616, | |
| "step": 840, | |
| "train_runtime": 152.7753, | |
| "train_tokens_per_second": 1725.514 | |
| }, | |
| { | |
| "epoch": 9.38888888888889, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.019190453037887464, | |
| "loss": 0.2323, | |
| "num_input_tokens_seen": 265152, | |
| "step": 845, | |
| "train_runtime": 153.5949, | |
| "train_tokens_per_second": 1726.308 | |
| }, | |
| { | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 0.01708984375, | |
| "learning_rate": 0.019050604932870013, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 266688, | |
| "step": 850, | |
| "train_runtime": 154.3846, | |
| "train_tokens_per_second": 1727.427 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 0.0322265625, | |
| "learning_rate": 0.01891037600218712, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 268256, | |
| "step": 855, | |
| "train_runtime": 155.1779, | |
| "train_tokens_per_second": 1728.7 | |
| }, | |
| { | |
| "epoch": 9.555555555555555, | |
| "grad_norm": 0.022705078125, | |
| "learning_rate": 0.018769779429740154, | |
| "loss": 0.2379, | |
| "num_input_tokens_seen": 269824, | |
| "step": 860, | |
| "train_runtime": 155.971, | |
| "train_tokens_per_second": 1729.963 | |
| }, | |
| { | |
| "epoch": 9.61111111111111, | |
| "grad_norm": 0.0140380859375, | |
| "learning_rate": 0.018628828433995014, | |
| "loss": 0.2388, | |
| "num_input_tokens_seen": 271424, | |
| "step": 865, | |
| "train_runtime": 156.7643, | |
| "train_tokens_per_second": 1731.414 | |
| }, | |
| { | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 0.007415771484375, | |
| "learning_rate": 0.018487536266739445, | |
| "loss": 0.2359, | |
| "num_input_tokens_seen": 272960, | |
| "step": 870, | |
| "train_runtime": 157.5546, | |
| "train_tokens_per_second": 1732.478 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 0.01263427734375, | |
| "learning_rate": 0.01834591621183709, | |
| "loss": 0.229, | |
| "num_input_tokens_seen": 274528, | |
| "step": 875, | |
| "train_runtime": 158.3508, | |
| "train_tokens_per_second": 1733.67 | |
| }, | |
| { | |
| "epoch": 9.777777777777779, | |
| "grad_norm": 0.00537109375, | |
| "learning_rate": 0.018203981583978603, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 276128, | |
| "step": 880, | |
| "train_runtime": 159.1469, | |
| "train_tokens_per_second": 1735.051 | |
| }, | |
| { | |
| "epoch": 9.833333333333334, | |
| "grad_norm": 0.006561279296875, | |
| "learning_rate": 0.018061745727429836, | |
| "loss": 0.2284, | |
| "num_input_tokens_seen": 277664, | |
| "step": 885, | |
| "train_runtime": 159.9388, | |
| "train_tokens_per_second": 1736.064 | |
| }, | |
| { | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 0.0159912109375, | |
| "learning_rate": 0.017919222014777265, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 279232, | |
| "step": 890, | |
| "train_runtime": 160.7334, | |
| "train_tokens_per_second": 1737.237 | |
| }, | |
| { | |
| "epoch": 9.944444444444445, | |
| "grad_norm": 0.003265380859375, | |
| "learning_rate": 0.017776423845670717, | |
| "loss": 0.228, | |
| "num_input_tokens_seen": 280768, | |
| "step": 895, | |
| "train_runtime": 161.5242, | |
| "train_tokens_per_second": 1738.241 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.0176333646455636, | |
| "loss": 0.2218, | |
| "num_input_tokens_seen": 282368, | |
| "step": 900, | |
| "train_runtime": 162.3604, | |
| "train_tokens_per_second": 1739.143 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.23551960289478302, | |
| "eval_runtime": 0.817, | |
| "eval_samples_per_second": 48.961, | |
| "eval_steps_per_second": 12.24, | |
| "num_input_tokens_seen": 282368, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.055555555555555, | |
| "grad_norm": 0.0185546875, | |
| "learning_rate": 0.017490057864450664, | |
| "loss": 0.2283, | |
| "num_input_tokens_seen": 283936, | |
| "step": 905, | |
| "train_runtime": 165.2451, | |
| "train_tokens_per_second": 1718.272 | |
| }, | |
| { | |
| "epoch": 10.11111111111111, | |
| "grad_norm": 0.00946044921875, | |
| "learning_rate": 0.017346516975603462, | |
| "loss": 0.2199, | |
| "num_input_tokens_seen": 285504, | |
| "step": 910, | |
| "train_runtime": 166.0581, | |
| "train_tokens_per_second": 1719.302 | |
| }, | |
| { | |
| "epoch": 10.166666666666666, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.017202755474303683, | |
| "loss": 0.2405, | |
| "num_input_tokens_seen": 287072, | |
| "step": 915, | |
| "train_runtime": 166.8565, | |
| "train_tokens_per_second": 1720.472 | |
| }, | |
| { | |
| "epoch": 10.222222222222221, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.017058786876574313, | |
| "loss": 0.2363, | |
| "num_input_tokens_seen": 288576, | |
| "step": 920, | |
| "train_runtime": 167.6452, | |
| "train_tokens_per_second": 1721.35 | |
| }, | |
| { | |
| "epoch": 10.277777777777779, | |
| "grad_norm": 0.00555419921875, | |
| "learning_rate": 0.016914624717908923, | |
| "loss": 0.2355, | |
| "num_input_tokens_seen": 290144, | |
| "step": 925, | |
| "train_runtime": 168.4381, | |
| "train_tokens_per_second": 1722.555 | |
| }, | |
| { | |
| "epoch": 10.333333333333334, | |
| "grad_norm": 0.0235595703125, | |
| "learning_rate": 0.016770282551999093, | |
| "loss": 0.2278, | |
| "num_input_tokens_seen": 291744, | |
| "step": 930, | |
| "train_runtime": 169.2344, | |
| "train_tokens_per_second": 1723.905 | |
| }, | |
| { | |
| "epoch": 10.38888888888889, | |
| "grad_norm": 0.01385498046875, | |
| "learning_rate": 0.01662577394946016, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 293344, | |
| "step": 935, | |
| "train_runtime": 170.0296, | |
| "train_tokens_per_second": 1725.253 | |
| }, | |
| { | |
| "epoch": 10.444444444444445, | |
| "grad_norm": 0.00482177734375, | |
| "learning_rate": 0.016481112496555317, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 294912, | |
| "step": 940, | |
| "train_runtime": 170.8203, | |
| "train_tokens_per_second": 1726.446 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "grad_norm": 0.01275634765625, | |
| "learning_rate": 0.016336311793918295, | |
| "loss": 0.2304, | |
| "num_input_tokens_seen": 296480, | |
| "step": 945, | |
| "train_runtime": 171.6139, | |
| "train_tokens_per_second": 1727.599 | |
| }, | |
| { | |
| "epoch": 10.555555555555555, | |
| "grad_norm": 0.00665283203125, | |
| "learning_rate": 0.016191385455274654, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 298048, | |
| "step": 950, | |
| "train_runtime": 172.4051, | |
| "train_tokens_per_second": 1728.766 | |
| }, | |
| { | |
| "epoch": 10.61111111111111, | |
| "grad_norm": 0.0230712890625, | |
| "learning_rate": 0.016046347106161877, | |
| "loss": 0.2326, | |
| "num_input_tokens_seen": 299648, | |
| "step": 955, | |
| "train_runtime": 173.1986, | |
| "train_tokens_per_second": 1730.083 | |
| }, | |
| { | |
| "epoch": 10.666666666666666, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.01590121038264835, | |
| "loss": 0.2264, | |
| "num_input_tokens_seen": 301216, | |
| "step": 960, | |
| "train_runtime": 173.995, | |
| "train_tokens_per_second": 1731.176 | |
| }, | |
| { | |
| "epoch": 10.722222222222221, | |
| "grad_norm": 0.022705078125, | |
| "learning_rate": 0.015755988930051302, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 302784, | |
| "step": 965, | |
| "train_runtime": 174.7881, | |
| "train_tokens_per_second": 1732.292 | |
| }, | |
| { | |
| "epoch": 10.777777777777779, | |
| "grad_norm": 0.01312255859375, | |
| "learning_rate": 0.01561069640165394, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 304320, | |
| "step": 970, | |
| "train_runtime": 175.5852, | |
| "train_tokens_per_second": 1733.175 | |
| }, | |
| { | |
| "epoch": 10.833333333333334, | |
| "grad_norm": 0.01214599609375, | |
| "learning_rate": 0.015465346457421807, | |
| "loss": 0.239, | |
| "num_input_tokens_seen": 305856, | |
| "step": 975, | |
| "train_runtime": 176.3792, | |
| "train_tokens_per_second": 1734.082 | |
| }, | |
| { | |
| "epoch": 10.88888888888889, | |
| "grad_norm": 0.0140380859375, | |
| "learning_rate": 0.015319952762718515, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 307424, | |
| "step": 980, | |
| "train_runtime": 177.1761, | |
| "train_tokens_per_second": 1735.132 | |
| }, | |
| { | |
| "epoch": 10.944444444444445, | |
| "grad_norm": 0.01434326171875, | |
| "learning_rate": 0.015174528987020958, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 308992, | |
| "step": 985, | |
| "train_runtime": 177.9704, | |
| "train_tokens_per_second": 1736.198 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.01129150390625, | |
| "learning_rate": 0.015029088802634146, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 310560, | |
| "step": 990, | |
| "train_runtime": 178.804, | |
| "train_tokens_per_second": 1736.874 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.23004861176013947, | |
| "eval_runtime": 0.8164, | |
| "eval_samples_per_second": 48.995, | |
| "eval_steps_per_second": 12.249, | |
| "num_input_tokens_seen": 310560, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 11.055555555555555, | |
| "grad_norm": 0.0113525390625, | |
| "learning_rate": 0.014883645883405797, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 312160, | |
| "step": 995, | |
| "train_runtime": 181.2905, | |
| "train_tokens_per_second": 1721.877 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 0.01300048828125, | |
| "learning_rate": 0.014738213903440746, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 313728, | |
| "step": 1000, | |
| "train_runtime": 182.1312, | |
| "train_tokens_per_second": 1722.538 | |
| }, | |
| { | |
| "epoch": 11.166666666666666, | |
| "grad_norm": 0.01287841796875, | |
| "learning_rate": 0.014592806535815357, | |
| "loss": 0.2386, | |
| "num_input_tokens_seen": 315264, | |
| "step": 1005, | |
| "train_runtime": 182.9305, | |
| "train_tokens_per_second": 1723.409 | |
| }, | |
| { | |
| "epoch": 11.222222222222221, | |
| "grad_norm": 0.02099609375, | |
| "learning_rate": 0.014447437451291999, | |
| "loss": 0.2291, | |
| "num_input_tokens_seen": 316864, | |
| "step": 1010, | |
| "train_runtime": 183.7272, | |
| "train_tokens_per_second": 1724.644 | |
| }, | |
| { | |
| "epoch": 11.277777777777779, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.014302120317033798, | |
| "loss": 0.2201, | |
| "num_input_tokens_seen": 318432, | |
| "step": 1015, | |
| "train_runtime": 184.5231, | |
| "train_tokens_per_second": 1725.703 | |
| }, | |
| { | |
| "epoch": 11.333333333333334, | |
| "grad_norm": 0.004974365234375, | |
| "learning_rate": 0.014156868795319669, | |
| "loss": 0.2403, | |
| "num_input_tokens_seen": 320032, | |
| "step": 1020, | |
| "train_runtime": 185.3161, | |
| "train_tokens_per_second": 1726.952 | |
| }, | |
| { | |
| "epoch": 11.38888888888889, | |
| "grad_norm": 0.01190185546875, | |
| "learning_rate": 0.014011696542259821, | |
| "loss": 0.2356, | |
| "num_input_tokens_seen": 321536, | |
| "step": 1025, | |
| "train_runtime": 186.1035, | |
| "train_tokens_per_second": 1727.727 | |
| }, | |
| { | |
| "epoch": 11.444444444444445, | |
| "grad_norm": 0.01007080078125, | |
| "learning_rate": 0.013866617206511882, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 323040, | |
| "step": 1030, | |
| "train_runtime": 186.8909, | |
| "train_tokens_per_second": 1728.495 | |
| }, | |
| { | |
| "epoch": 11.5, | |
| "grad_norm": 0.00927734375, | |
| "learning_rate": 0.013721644427997651, | |
| "loss": 0.2268, | |
| "num_input_tokens_seen": 324608, | |
| "step": 1035, | |
| "train_runtime": 187.6849, | |
| "train_tokens_per_second": 1729.537 | |
| }, | |
| { | |
| "epoch": 11.555555555555555, | |
| "grad_norm": 0.0125732421875, | |
| "learning_rate": 0.01357679183662076, | |
| "loss": 0.2333, | |
| "num_input_tokens_seen": 326144, | |
| "step": 1040, | |
| "train_runtime": 188.4763, | |
| "train_tokens_per_second": 1730.425 | |
| }, | |
| { | |
| "epoch": 11.61111111111111, | |
| "grad_norm": 0.0048828125, | |
| "learning_rate": 0.0134320730509852, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 327712, | |
| "step": 1045, | |
| "train_runtime": 189.2669, | |
| "train_tokens_per_second": 1731.481 | |
| }, | |
| { | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 0.0029296875, | |
| "learning_rate": 0.01328750167711494, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 329248, | |
| "step": 1050, | |
| "train_runtime": 190.0636, | |
| "train_tokens_per_second": 1732.304 | |
| }, | |
| { | |
| "epoch": 11.722222222222221, | |
| "grad_norm": 0.004974365234375, | |
| "learning_rate": 0.013143091307174755, | |
| "loss": 0.2413, | |
| "num_input_tokens_seen": 330816, | |
| "step": 1055, | |
| "train_runtime": 190.8551, | |
| "train_tokens_per_second": 1733.336 | |
| }, | |
| { | |
| "epoch": 11.777777777777779, | |
| "grad_norm": 0.0208740234375, | |
| "learning_rate": 0.012998855518192309, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 332416, | |
| "step": 1060, | |
| "train_runtime": 191.6505, | |
| "train_tokens_per_second": 1734.491 | |
| }, | |
| { | |
| "epoch": 11.833333333333334, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 0.012854807870781686, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 334016, | |
| "step": 1065, | |
| "train_runtime": 192.4488, | |
| "train_tokens_per_second": 1735.61 | |
| }, | |
| { | |
| "epoch": 11.88888888888889, | |
| "grad_norm": 0.01092529296875, | |
| "learning_rate": 0.012710961907868478, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 335616, | |
| "step": 1070, | |
| "train_runtime": 193.2439, | |
| "train_tokens_per_second": 1736.748 | |
| }, | |
| { | |
| "epoch": 11.944444444444445, | |
| "grad_norm": 0.0027923583984375, | |
| "learning_rate": 0.012567331153416489, | |
| "loss": 0.2359, | |
| "num_input_tokens_seen": 337152, | |
| "step": 1075, | |
| "train_runtime": 194.0342, | |
| "train_tokens_per_second": 1737.59 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.004852294921875, | |
| "learning_rate": 0.012423929111156296, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 338784, | |
| "step": 1080, | |
| "train_runtime": 194.8731, | |
| "train_tokens_per_second": 1738.486 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.23689353466033936, | |
| "eval_runtime": 0.8185, | |
| "eval_samples_per_second": 48.871, | |
| "eval_steps_per_second": 12.218, | |
| "num_input_tokens_seen": 338784, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 12.055555555555555, | |
| "grad_norm": 0.01953125, | |
| "learning_rate": 0.012280769263315627, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 340288, | |
| "step": 1085, | |
| "train_runtime": 197.3733, | |
| "train_tokens_per_second": 1724.083 | |
| }, | |
| { | |
| "epoch": 12.11111111111111, | |
| "grad_norm": 0.01123046875, | |
| "learning_rate": 0.012137865069351828, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 341888, | |
| "step": 1090, | |
| "train_runtime": 198.1719, | |
| "train_tokens_per_second": 1725.209 | |
| }, | |
| { | |
| "epoch": 12.166666666666666, | |
| "grad_norm": 0.0205078125, | |
| "learning_rate": 0.01199522996468644, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 343488, | |
| "step": 1095, | |
| "train_runtime": 198.9687, | |
| "train_tokens_per_second": 1726.342 | |
| }, | |
| { | |
| "epoch": 12.222222222222221, | |
| "grad_norm": 0.0130615234375, | |
| "learning_rate": 0.01185287735944204, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 344992, | |
| "step": 1100, | |
| "train_runtime": 199.7613, | |
| "train_tokens_per_second": 1727.021 | |
| }, | |
| { | |
| "epoch": 12.277777777777779, | |
| "grad_norm": 0.0029296875, | |
| "learning_rate": 0.011710820637181448, | |
| "loss": 0.2392, | |
| "num_input_tokens_seen": 346560, | |
| "step": 1105, | |
| "train_runtime": 200.5543, | |
| "train_tokens_per_second": 1728.011 | |
| }, | |
| { | |
| "epoch": 12.333333333333334, | |
| "grad_norm": 0.00457763671875, | |
| "learning_rate": 0.011569073153649483, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 348160, | |
| "step": 1110, | |
| "train_runtime": 201.3491, | |
| "train_tokens_per_second": 1729.136 | |
| }, | |
| { | |
| "epoch": 12.38888888888889, | |
| "grad_norm": 0.0118408203125, | |
| "learning_rate": 0.01142764823551724, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 349760, | |
| "step": 1115, | |
| "train_runtime": 202.1428, | |
| "train_tokens_per_second": 1730.262 | |
| }, | |
| { | |
| "epoch": 12.444444444444445, | |
| "grad_norm": 0.01214599609375, | |
| "learning_rate": 0.011286559179129213, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 351328, | |
| "step": 1120, | |
| "train_runtime": 202.9386, | |
| "train_tokens_per_second": 1731.204 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.020263671875, | |
| "learning_rate": 0.01114581924925317, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 352896, | |
| "step": 1125, | |
| "train_runtime": 203.734, | |
| "train_tokens_per_second": 1732.141 | |
| }, | |
| { | |
| "epoch": 12.555555555555555, | |
| "grad_norm": 0.01019287109375, | |
| "learning_rate": 0.011005441677833067, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 354464, | |
| "step": 1130, | |
| "train_runtime": 204.5288, | |
| "train_tokens_per_second": 1733.076 | |
| }, | |
| { | |
| "epoch": 12.61111111111111, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 0.010865439662745013, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 356032, | |
| "step": 1135, | |
| "train_runtime": 205.321, | |
| "train_tokens_per_second": 1734.026 | |
| }, | |
| { | |
| "epoch": 12.666666666666666, | |
| "grad_norm": 0.01141357421875, | |
| "learning_rate": 0.01072582636655643, | |
| "loss": 0.2263, | |
| "num_input_tokens_seen": 357632, | |
| "step": 1140, | |
| "train_runtime": 206.1151, | |
| "train_tokens_per_second": 1735.108 | |
| }, | |
| { | |
| "epoch": 12.722222222222221, | |
| "grad_norm": 0.0113525390625, | |
| "learning_rate": 0.010586614915288572, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 359168, | |
| "step": 1145, | |
| "train_runtime": 206.9071, | |
| "train_tokens_per_second": 1735.89 | |
| }, | |
| { | |
| "epoch": 12.777777777777779, | |
| "grad_norm": 0.005645751953125, | |
| "learning_rate": 0.010447818397182444, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 360736, | |
| "step": 1150, | |
| "train_runtime": 207.6979, | |
| "train_tokens_per_second": 1736.83 | |
| }, | |
| { | |
| "epoch": 12.833333333333334, | |
| "grad_norm": 0.0033721923828125, | |
| "learning_rate": 0.010309449861468272, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 362304, | |
| "step": 1155, | |
| "train_runtime": 208.4895, | |
| "train_tokens_per_second": 1737.757 | |
| }, | |
| { | |
| "epoch": 12.88888888888889, | |
| "grad_norm": 0.004791259765625, | |
| "learning_rate": 0.010171522317138689, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 363872, | |
| "step": 1160, | |
| "train_runtime": 209.2816, | |
| "train_tokens_per_second": 1738.671 | |
| }, | |
| { | |
| "epoch": 12.944444444444445, | |
| "grad_norm": 0.00982666015625, | |
| "learning_rate": 0.01003404873172563, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 365376, | |
| "step": 1165, | |
| "train_runtime": 210.0732, | |
| "train_tokens_per_second": 1739.28 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.0029296875, | |
| "learning_rate": 0.009897042030081191, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 366944, | |
| "step": 1170, | |
| "train_runtime": 210.9074, | |
| "train_tokens_per_second": 1739.834 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.2312408983707428, | |
| "eval_runtime": 0.8181, | |
| "eval_samples_per_second": 48.893, | |
| "eval_steps_per_second": 12.223, | |
| "num_input_tokens_seen": 366944, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 13.055555555555555, | |
| "grad_norm": 0.0034332275390625, | |
| "learning_rate": 0.009760515093162463, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 368384, | |
| "step": 1175, | |
| "train_runtime": 213.385, | |
| "train_tokens_per_second": 1726.382 | |
| }, | |
| { | |
| "epoch": 13.11111111111111, | |
| "grad_norm": 0.01220703125, | |
| "learning_rate": 0.009624480756820496, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 369984, | |
| "step": 1180, | |
| "train_runtime": 214.2014, | |
| "train_tokens_per_second": 1727.272 | |
| }, | |
| { | |
| "epoch": 13.166666666666666, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.009488951810593525, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 371520, | |
| "step": 1185, | |
| "train_runtime": 214.9913, | |
| "train_tokens_per_second": 1728.07 | |
| }, | |
| { | |
| "epoch": 13.222222222222221, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.009353940996504537, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 373120, | |
| "step": 1190, | |
| "train_runtime": 215.7896, | |
| "train_tokens_per_second": 1729.092 | |
| }, | |
| { | |
| "epoch": 13.277777777777779, | |
| "grad_norm": 0.01239013671875, | |
| "learning_rate": 0.009219461007863278, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 374688, | |
| "step": 1195, | |
| "train_runtime": 216.5862, | |
| "train_tokens_per_second": 1729.972 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 0.0036163330078125, | |
| "learning_rate": 0.009085524488072901, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 376288, | |
| "step": 1200, | |
| "train_runtime": 217.3817, | |
| "train_tokens_per_second": 1731.001 | |
| }, | |
| { | |
| "epoch": 13.38888888888889, | |
| "grad_norm": 0.00543212890625, | |
| "learning_rate": 0.008952144029441248, | |
| "loss": 0.2304, | |
| "num_input_tokens_seen": 377888, | |
| "step": 1205, | |
| "train_runtime": 218.1804, | |
| "train_tokens_per_second": 1731.998 | |
| }, | |
| { | |
| "epoch": 13.444444444444445, | |
| "grad_norm": 0.0064697265625, | |
| "learning_rate": 0.008819332171996975, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 379424, | |
| "step": 1210, | |
| "train_runtime": 218.971, | |
| "train_tokens_per_second": 1732.759 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 0.01080322265625, | |
| "learning_rate": 0.008687101402310564, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 380992, | |
| "step": 1215, | |
| "train_runtime": 219.765, | |
| "train_tokens_per_second": 1733.634 | |
| }, | |
| { | |
| "epoch": 13.555555555555555, | |
| "grad_norm": 0.003936767578125, | |
| "learning_rate": 0.008555464152320372, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 382592, | |
| "step": 1220, | |
| "train_runtime": 220.5584, | |
| "train_tokens_per_second": 1734.652 | |
| }, | |
| { | |
| "epoch": 13.61111111111111, | |
| "grad_norm": 0.01177978515625, | |
| "learning_rate": 0.008424432798163836, | |
| "loss": 0.2284, | |
| "num_input_tokens_seen": 384192, | |
| "step": 1225, | |
| "train_runtime": 221.3532, | |
| "train_tokens_per_second": 1735.651 | |
| }, | |
| { | |
| "epoch": 13.666666666666666, | |
| "grad_norm": 0.01123046875, | |
| "learning_rate": 0.008294019659013892, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 385760, | |
| "step": 1230, | |
| "train_runtime": 222.1454, | |
| "train_tokens_per_second": 1736.521 | |
| }, | |
| { | |
| "epoch": 13.722222222222221, | |
| "grad_norm": 0.0120849609375, | |
| "learning_rate": 0.008164236995920735, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 387328, | |
| "step": 1235, | |
| "train_runtime": 222.9361, | |
| "train_tokens_per_second": 1737.395 | |
| }, | |
| { | |
| "epoch": 13.777777777777779, | |
| "grad_norm": 0.00482177734375, | |
| "learning_rate": 0.008035097010659147, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 388896, | |
| "step": 1240, | |
| "train_runtime": 223.7293, | |
| "train_tokens_per_second": 1738.244 | |
| }, | |
| { | |
| "epoch": 13.833333333333334, | |
| "grad_norm": 0.0101318359375, | |
| "learning_rate": 0.00790661184458125, | |
| "loss": 0.2346, | |
| "num_input_tokens_seen": 390496, | |
| "step": 1245, | |
| "train_runtime": 224.5255, | |
| "train_tokens_per_second": 1739.206 | |
| }, | |
| { | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 0.0030670166015625, | |
| "learning_rate": 0.007778793577475039, | |
| "loss": 0.2284, | |
| "num_input_tokens_seen": 392064, | |
| "step": 1250, | |
| "train_runtime": 225.3179, | |
| "train_tokens_per_second": 1740.048 | |
| }, | |
| { | |
| "epoch": 13.944444444444445, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.007651654226428696, | |
| "loss": 0.2265, | |
| "num_input_tokens_seen": 393632, | |
| "step": 1255, | |
| "train_runtime": 226.1132, | |
| "train_tokens_per_second": 1740.862 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.0101318359375, | |
| "learning_rate": 0.0075252057447007465, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 395104, | |
| "step": 1260, | |
| "train_runtime": 226.95, | |
| "train_tokens_per_second": 1740.93 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.2316901683807373, | |
| "eval_runtime": 0.8178, | |
| "eval_samples_per_second": 48.909, | |
| "eval_steps_per_second": 12.227, | |
| "num_input_tokens_seen": 395104, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 14.055555555555555, | |
| "grad_norm": 0.00628662109375, | |
| "learning_rate": 0.007399460020596265, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 396672, | |
| "step": 1265, | |
| "train_runtime": 229.4732, | |
| "train_tokens_per_second": 1728.62 | |
| }, | |
| { | |
| "epoch": 14.11111111111111, | |
| "grad_norm": 0.005584716796875, | |
| "learning_rate": 0.007274428876349185, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 398304, | |
| "step": 1270, | |
| "train_runtime": 230.292, | |
| "train_tokens_per_second": 1729.561 | |
| }, | |
| { | |
| "epoch": 14.166666666666666, | |
| "grad_norm": 0.01055908203125, | |
| "learning_rate": 0.007150124067010788, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 399840, | |
| "step": 1275, | |
| "train_runtime": 231.085, | |
| "train_tokens_per_second": 1730.272 | |
| }, | |
| { | |
| "epoch": 14.222222222222221, | |
| "grad_norm": 0.0037689208984375, | |
| "learning_rate": 0.007026557279344533, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 401440, | |
| "step": 1280, | |
| "train_runtime": 231.8835, | |
| "train_tokens_per_second": 1731.214 | |
| }, | |
| { | |
| "epoch": 14.277777777777779, | |
| "grad_norm": 0.0035400390625, | |
| "learning_rate": 0.006903740130727311, | |
| "loss": 0.2264, | |
| "num_input_tokens_seen": 403040, | |
| "step": 1285, | |
| "train_runtime": 232.6814, | |
| "train_tokens_per_second": 1732.154 | |
| }, | |
| { | |
| "epoch": 14.333333333333334, | |
| "grad_norm": 0.01104736328125, | |
| "learning_rate": 0.0067816841680572015, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 404640, | |
| "step": 1290, | |
| "train_runtime": 233.4758, | |
| "train_tokens_per_second": 1733.113 | |
| }, | |
| { | |
| "epoch": 14.38888888888889, | |
| "grad_norm": 0.004364013671875, | |
| "learning_rate": 0.006660400866667899, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 406208, | |
| "step": 1295, | |
| "train_runtime": 234.2675, | |
| "train_tokens_per_second": 1733.95 | |
| }, | |
| { | |
| "epoch": 14.444444444444445, | |
| "grad_norm": 0.005584716796875, | |
| "learning_rate": 0.006539901629249787, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 407776, | |
| "step": 1300, | |
| "train_runtime": 235.0597, | |
| "train_tokens_per_second": 1734.776 | |
| }, | |
| { | |
| "epoch": 14.5, | |
| "grad_norm": 0.004791259765625, | |
| "learning_rate": 0.006420197784777924, | |
| "loss": 0.2268, | |
| "num_input_tokens_seen": 409312, | |
| "step": 1305, | |
| "train_runtime": 235.8489, | |
| "train_tokens_per_second": 1735.484 | |
| }, | |
| { | |
| "epoch": 14.555555555555555, | |
| "grad_norm": 0.02587890625, | |
| "learning_rate": 0.006301300587446937, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 410816, | |
| "step": 1310, | |
| "train_runtime": 236.6364, | |
| "train_tokens_per_second": 1736.064 | |
| }, | |
| { | |
| "epoch": 14.61111111111111, | |
| "grad_norm": 0.0244140625, | |
| "learning_rate": 0.006183221215612904, | |
| "loss": 0.2415, | |
| "num_input_tokens_seen": 412416, | |
| "step": 1315, | |
| "train_runtime": 237.4299, | |
| "train_tokens_per_second": 1737.001 | |
| }, | |
| { | |
| "epoch": 14.666666666666666, | |
| "grad_norm": 0.0108642578125, | |
| "learning_rate": 0.00606597077074242, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 414016, | |
| "step": 1320, | |
| "train_runtime": 238.223, | |
| "train_tokens_per_second": 1737.935 | |
| }, | |
| { | |
| "epoch": 14.722222222222221, | |
| "grad_norm": 0.003570556640625, | |
| "learning_rate": 0.005949560276368865, | |
| "loss": 0.2402, | |
| "num_input_tokens_seen": 415552, | |
| "step": 1325, | |
| "train_runtime": 239.012, | |
| "train_tokens_per_second": 1738.624 | |
| }, | |
| { | |
| "epoch": 14.777777777777779, | |
| "grad_norm": 0.005096435546875, | |
| "learning_rate": 0.005834000677056003, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 417088, | |
| "step": 1330, | |
| "train_runtime": 239.8035, | |
| "train_tokens_per_second": 1739.291 | |
| }, | |
| { | |
| "epoch": 14.833333333333334, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 0.005719302837369021, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 418656, | |
| "step": 1335, | |
| "train_runtime": 240.5946, | |
| "train_tokens_per_second": 1740.089 | |
| }, | |
| { | |
| "epoch": 14.88888888888889, | |
| "grad_norm": 0.01055908203125, | |
| "learning_rate": 0.00560547754085305, | |
| "loss": 0.2265, | |
| "num_input_tokens_seen": 420256, | |
| "step": 1340, | |
| "train_runtime": 241.3879, | |
| "train_tokens_per_second": 1740.999 | |
| }, | |
| { | |
| "epoch": 14.944444444444445, | |
| "grad_norm": 0.0203857421875, | |
| "learning_rate": 0.005492535489019344, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 421792, | |
| "step": 1345, | |
| "train_runtime": 242.1774, | |
| "train_tokens_per_second": 1741.665 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.0223388671875, | |
| "learning_rate": 0.005380487300339167, | |
| "loss": 0.2402, | |
| "num_input_tokens_seen": 423360, | |
| "step": 1350, | |
| "train_runtime": 243.0095, | |
| "train_tokens_per_second": 1742.154 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.23129186034202576, | |
| "eval_runtime": 0.8149, | |
| "eval_samples_per_second": 49.088, | |
| "eval_steps_per_second": 12.272, | |
| "num_input_tokens_seen": 423360, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 15.055555555555555, | |
| "grad_norm": 0.01123046875, | |
| "learning_rate": 0.005269343509245449, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 424992, | |
| "step": 1355, | |
| "train_runtime": 245.5308, | |
| "train_tokens_per_second": 1730.911 | |
| }, | |
| { | |
| "epoch": 15.11111111111111, | |
| "grad_norm": 0.01226806640625, | |
| "learning_rate": 0.005159114565142392, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 426528, | |
| "step": 1360, | |
| "train_runtime": 246.3315, | |
| "train_tokens_per_second": 1731.52 | |
| }, | |
| { | |
| "epoch": 15.166666666666666, | |
| "grad_norm": 0.0106201171875, | |
| "learning_rate": 0.0050498108314230425, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 428096, | |
| "step": 1365, | |
| "train_runtime": 247.1262, | |
| "train_tokens_per_second": 1732.297 | |
| }, | |
| { | |
| "epoch": 15.222222222222221, | |
| "grad_norm": 0.00732421875, | |
| "learning_rate": 0.0049414425844949445, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 429600, | |
| "step": 1370, | |
| "train_runtime": 247.9142, | |
| "train_tokens_per_second": 1732.858 | |
| }, | |
| { | |
| "epoch": 15.277777777777779, | |
| "grad_norm": 0.00335693359375, | |
| "learning_rate": 0.004834020012814016, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 431200, | |
| "step": 1375, | |
| "train_runtime": 248.7142, | |
| "train_tokens_per_second": 1733.717 | |
| }, | |
| { | |
| "epoch": 15.333333333333334, | |
| "grad_norm": 0.01177978515625, | |
| "learning_rate": 0.004727553215926623, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 432736, | |
| "step": 1380, | |
| "train_runtime": 249.5378, | |
| "train_tokens_per_second": 1734.15 | |
| }, | |
| { | |
| "epoch": 15.38888888888889, | |
| "grad_norm": 0.010498046875, | |
| "learning_rate": 0.004622052203520061, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 434336, | |
| "step": 1385, | |
| "train_runtime": 250.3618, | |
| "train_tokens_per_second": 1734.834 | |
| }, | |
| { | |
| "epoch": 15.444444444444445, | |
| "grad_norm": 0.0115966796875, | |
| "learning_rate": 0.004517526894481498, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 435904, | |
| "step": 1390, | |
| "train_runtime": 251.1536, | |
| "train_tokens_per_second": 1735.607 | |
| }, | |
| { | |
| "epoch": 15.5, | |
| "grad_norm": 0.0107421875, | |
| "learning_rate": 0.004413987115965404, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 437440, | |
| "step": 1395, | |
| "train_runtime": 251.9431, | |
| "train_tokens_per_second": 1736.265 | |
| }, | |
| { | |
| "epoch": 15.555555555555555, | |
| "grad_norm": 0.01171875, | |
| "learning_rate": 0.004311442602469636, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 438976, | |
| "step": 1400, | |
| "train_runtime": 252.733, | |
| "train_tokens_per_second": 1736.916 | |
| }, | |
| { | |
| "epoch": 15.61111111111111, | |
| "grad_norm": 0.005950927734375, | |
| "learning_rate": 0.004209902994920235, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 440512, | |
| "step": 1405, | |
| "train_runtime": 253.5249, | |
| "train_tokens_per_second": 1737.549 | |
| }, | |
| { | |
| "epoch": 15.666666666666666, | |
| "grad_norm": 0.01190185546875, | |
| "learning_rate": 0.004109377839765016, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 442112, | |
| "step": 1410, | |
| "train_runtime": 254.3181, | |
| "train_tokens_per_second": 1738.421 | |
| }, | |
| { | |
| "epoch": 15.722222222222221, | |
| "grad_norm": 0.012451171875, | |
| "learning_rate": 0.004009876588076046, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 443616, | |
| "step": 1415, | |
| "train_runtime": 255.1075, | |
| "train_tokens_per_second": 1738.938 | |
| }, | |
| { | |
| "epoch": 15.777777777777779, | |
| "grad_norm": 0.022705078125, | |
| "learning_rate": 0.003911408594661061, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 445184, | |
| "step": 1420, | |
| "train_runtime": 255.8999, | |
| "train_tokens_per_second": 1739.68 | |
| }, | |
| { | |
| "epoch": 15.833333333333334, | |
| "grad_norm": 0.012451171875, | |
| "learning_rate": 0.0038139831171839726, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 446752, | |
| "step": 1425, | |
| "train_runtime": 256.6958, | |
| "train_tokens_per_second": 1740.394 | |
| }, | |
| { | |
| "epoch": 15.88888888888889, | |
| "grad_norm": 0.01324462890625, | |
| "learning_rate": 0.0037176093152944947, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 448352, | |
| "step": 1430, | |
| "train_runtime": 257.491, | |
| "train_tokens_per_second": 1741.234 | |
| }, | |
| { | |
| "epoch": 15.944444444444445, | |
| "grad_norm": 0.0101318359375, | |
| "learning_rate": 0.0036222962497669668, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 449888, | |
| "step": 1435, | |
| "train_runtime": 258.2827, | |
| "train_tokens_per_second": 1741.843 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.00531005859375, | |
| "learning_rate": 0.003528052881648488, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 451424, | |
| "step": 1440, | |
| "train_runtime": 259.1151, | |
| "train_tokens_per_second": 1742.176 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.23337697982788086, | |
| "eval_runtime": 0.8216, | |
| "eval_samples_per_second": 48.684, | |
| "eval_steps_per_second": 12.171, | |
| "num_input_tokens_seen": 451424, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 16.055555555555557, | |
| "grad_norm": 0.0111083984375, | |
| "learning_rate": 0.0034348880714164414, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 452992, | |
| "step": 1445, | |
| "train_runtime": 261.5987, | |
| "train_tokens_per_second": 1731.629 | |
| }, | |
| { | |
| "epoch": 16.11111111111111, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.0033428105781454364, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 454496, | |
| "step": 1450, | |
| "train_runtime": 262.4288, | |
| "train_tokens_per_second": 1731.883 | |
| }, | |
| { | |
| "epoch": 16.166666666666668, | |
| "grad_norm": 0.005096435546875, | |
| "learning_rate": 0.0032518290586838377, | |
| "loss": 0.2359, | |
| "num_input_tokens_seen": 456096, | |
| "step": 1455, | |
| "train_runtime": 263.2247, | |
| "train_tokens_per_second": 1732.725 | |
| }, | |
| { | |
| "epoch": 16.22222222222222, | |
| "grad_norm": 0.0118408203125, | |
| "learning_rate": 0.0031619520668398388, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 457696, | |
| "step": 1460, | |
| "train_runtime": 264.0196, | |
| "train_tokens_per_second": 1733.568 | |
| }, | |
| { | |
| "epoch": 16.27777777777778, | |
| "grad_norm": 0.00433349609375, | |
| "learning_rate": 0.003073188052577281, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 459232, | |
| "step": 1465, | |
| "train_runtime": 264.8136, | |
| "train_tokens_per_second": 1734.171 | |
| }, | |
| { | |
| "epoch": 16.333333333333332, | |
| "grad_norm": 0.0062255859375, | |
| "learning_rate": 0.00298554536122122, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 460832, | |
| "step": 1470, | |
| "train_runtime": 265.6148, | |
| "train_tokens_per_second": 1734.964 | |
| }, | |
| { | |
| "epoch": 16.38888888888889, | |
| "grad_norm": 0.0040283203125, | |
| "learning_rate": 0.0028990322326732957, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 462432, | |
| "step": 1475, | |
| "train_runtime": 266.4104, | |
| "train_tokens_per_second": 1735.788 | |
| }, | |
| { | |
| "epoch": 16.444444444444443, | |
| "grad_norm": 0.00653076171875, | |
| "learning_rate": 0.0028136568006370643, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 464000, | |
| "step": 1480, | |
| "train_runtime": 267.2022, | |
| "train_tokens_per_second": 1736.513 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "grad_norm": 0.01025390625, | |
| "learning_rate": 0.0027294270918532875, | |
| "loss": 0.2256, | |
| "num_input_tokens_seen": 465536, | |
| "step": 1485, | |
| "train_runtime": 267.991, | |
| "train_tokens_per_second": 1737.133 | |
| }, | |
| { | |
| "epoch": 16.555555555555557, | |
| "grad_norm": 0.02001953125, | |
| "learning_rate": 0.0026463510253452744, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 467136, | |
| "step": 1490, | |
| "train_runtime": 268.784, | |
| "train_tokens_per_second": 1737.96 | |
| }, | |
| { | |
| "epoch": 16.61111111111111, | |
| "grad_norm": 0.002838134765625, | |
| "learning_rate": 0.0025644364116743754, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 468672, | |
| "step": 1495, | |
| "train_runtime": 269.575, | |
| "train_tokens_per_second": 1738.559 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.003387451171875, | |
| "learning_rate": 0.002483690952205637, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 470272, | |
| "step": 1500, | |
| "train_runtime": 270.3683, | |
| "train_tokens_per_second": 1739.376 | |
| }, | |
| { | |
| "epoch": 16.72222222222222, | |
| "grad_norm": 0.0108642578125, | |
| "learning_rate": 0.0024041222383837536, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 471872, | |
| "step": 1505, | |
| "train_runtime": 271.1607, | |
| "train_tokens_per_second": 1740.193 | |
| }, | |
| { | |
| "epoch": 16.77777777777778, | |
| "grad_norm": 0.011962890625, | |
| "learning_rate": 0.002325737751019347, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 473440, | |
| "step": 1510, | |
| "train_runtime": 271.9576, | |
| "train_tokens_per_second": 1740.859 | |
| }, | |
| { | |
| "epoch": 16.833333333333332, | |
| "grad_norm": 0.01055908203125, | |
| "learning_rate": 0.00224854485958563, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 475008, | |
| "step": 1515, | |
| "train_runtime": 272.7521, | |
| "train_tokens_per_second": 1741.537 | |
| }, | |
| { | |
| "epoch": 16.88888888888889, | |
| "grad_norm": 0.0120849609375, | |
| "learning_rate": 0.0021725508215255634, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 476608, | |
| "step": 1520, | |
| "train_runtime": 273.5491, | |
| "train_tokens_per_second": 1742.312 | |
| }, | |
| { | |
| "epoch": 16.944444444444443, | |
| "grad_norm": 0.01226806640625, | |
| "learning_rate": 0.0020977627815695213, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 478176, | |
| "step": 1525, | |
| "train_runtime": 274.3439, | |
| "train_tokens_per_second": 1742.98 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.0130615234375, | |
| "learning_rate": 0.0020241877710635747, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 479744, | |
| "step": 1530, | |
| "train_runtime": 275.1758, | |
| "train_tokens_per_second": 1743.409 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.2328735888004303, | |
| "eval_runtime": 0.8173, | |
| "eval_samples_per_second": 48.943, | |
| "eval_steps_per_second": 12.236, | |
| "num_input_tokens_seen": 479744, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 17.055555555555557, | |
| "grad_norm": 0.0213623046875, | |
| "learning_rate": 0.0019518327073084285, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 481344, | |
| "step": 1535, | |
| "train_runtime": 277.7336, | |
| "train_tokens_per_second": 1733.114 | |
| }, | |
| { | |
| "epoch": 17.11111111111111, | |
| "grad_norm": 0.01092529296875, | |
| "learning_rate": 0.0018807043929090638, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 482944, | |
| "step": 1540, | |
| "train_runtime": 278.5559, | |
| "train_tokens_per_second": 1733.742 | |
| }, | |
| { | |
| "epoch": 17.166666666666668, | |
| "grad_norm": 0.01220703125, | |
| "learning_rate": 0.0018108095151351837, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 484480, | |
| "step": 1545, | |
| "train_runtime": 279.3468, | |
| "train_tokens_per_second": 1734.332 | |
| }, | |
| { | |
| "epoch": 17.22222222222222, | |
| "grad_norm": 0.01348876953125, | |
| "learning_rate": 0.001742154645292508, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 486016, | |
| "step": 1550, | |
| "train_runtime": 280.1392, | |
| "train_tokens_per_second": 1734.909 | |
| }, | |
| { | |
| "epoch": 17.27777777777778, | |
| "grad_norm": 0.0045166015625, | |
| "learning_rate": 0.0016747462381049415, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 487584, | |
| "step": 1555, | |
| "train_runtime": 280.9345, | |
| "train_tokens_per_second": 1735.579 | |
| }, | |
| { | |
| "epoch": 17.333333333333332, | |
| "grad_norm": 0.006256103515625, | |
| "learning_rate": 0.0016085906311077212, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 489088, | |
| "step": 1560, | |
| "train_runtime": 281.7263, | |
| "train_tokens_per_second": 1736.04 | |
| }, | |
| { | |
| "epoch": 17.38888888888889, | |
| "grad_norm": 0.01171875, | |
| "learning_rate": 0.0015436940440516017, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 490688, | |
| "step": 1565, | |
| "train_runtime": 282.5227, | |
| "train_tokens_per_second": 1736.809 | |
| }, | |
| { | |
| "epoch": 17.444444444444443, | |
| "grad_norm": 0.022216796875, | |
| "learning_rate": 0.0014800625783180658, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 492288, | |
| "step": 1570, | |
| "train_runtime": 283.316, | |
| "train_tokens_per_second": 1737.593 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.0014177022163457135, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 493824, | |
| "step": 1575, | |
| "train_runtime": 284.1065, | |
| "train_tokens_per_second": 1738.165 | |
| }, | |
| { | |
| "epoch": 17.555555555555557, | |
| "grad_norm": 0.01409912109375, | |
| "learning_rate": 0.0013566188210677903, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 495456, | |
| "step": 1580, | |
| "train_runtime": 284.9046, | |
| "train_tokens_per_second": 1739.024 | |
| }, | |
| { | |
| "epoch": 17.61111111111111, | |
| "grad_norm": 0.004791259765625, | |
| "learning_rate": 0.0012968181353609854, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 497024, | |
| "step": 1585, | |
| "train_runtime": 285.6964, | |
| "train_tokens_per_second": 1739.693 | |
| }, | |
| { | |
| "epoch": 17.666666666666668, | |
| "grad_norm": 0.01092529296875, | |
| "learning_rate": 0.0012383057815055082, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 498592, | |
| "step": 1590, | |
| "train_runtime": 286.4968, | |
| "train_tokens_per_second": 1740.306 | |
| }, | |
| { | |
| "epoch": 17.72222222222222, | |
| "grad_norm": 0.01214599609375, | |
| "learning_rate": 0.001181087260656487, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 500128, | |
| "step": 1595, | |
| "train_runtime": 287.2884, | |
| "train_tokens_per_second": 1740.857 | |
| }, | |
| { | |
| "epoch": 17.77777777777778, | |
| "grad_norm": 0.0223388671875, | |
| "learning_rate": 0.0011251679523267587, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 501696, | |
| "step": 1600, | |
| "train_runtime": 288.0865, | |
| "train_tokens_per_second": 1741.477 | |
| }, | |
| { | |
| "epoch": 17.833333333333332, | |
| "grad_norm": 0.003814697265625, | |
| "learning_rate": 0.0010705531138811369, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 503232, | |
| "step": 1605, | |
| "train_runtime": 288.877, | |
| "train_tokens_per_second": 1742.029 | |
| }, | |
| { | |
| "epoch": 17.88888888888889, | |
| "grad_norm": 0.01214599609375, | |
| "learning_rate": 0.0010172478800420954, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 504736, | |
| "step": 1610, | |
| "train_runtime": 289.6642, | |
| "train_tokens_per_second": 1742.487 | |
| }, | |
| { | |
| "epoch": 17.944444444444443, | |
| "grad_norm": 0.021484375, | |
| "learning_rate": 0.0009652572624070293, | |
| "loss": 0.2256, | |
| "num_input_tokens_seen": 506304, | |
| "step": 1615, | |
| "train_runtime": 290.4568, | |
| "train_tokens_per_second": 1743.13 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.00579833984375, | |
| "learning_rate": 0.0009145861489770912, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 507872, | |
| "step": 1620, | |
| "train_runtime": 291.2951, | |
| "train_tokens_per_second": 1743.497 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.23335394263267517, | |
| "eval_runtime": 0.8176, | |
| "eval_samples_per_second": 48.921, | |
| "eval_steps_per_second": 12.23, | |
| "num_input_tokens_seen": 507872, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 18.055555555555557, | |
| "grad_norm": 0.00408935546875, | |
| "learning_rate": 0.0008652393036976157, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 509408, | |
| "step": 1625, | |
| "train_runtime": 293.8353, | |
| "train_tokens_per_second": 1733.651 | |
| }, | |
| { | |
| "epoch": 18.11111111111111, | |
| "grad_norm": 0.020751953125, | |
| "learning_rate": 0.0008172213660102473, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 510912, | |
| "step": 1630, | |
| "train_runtime": 294.6455, | |
| "train_tokens_per_second": 1733.989 | |
| }, | |
| { | |
| "epoch": 18.166666666666668, | |
| "grad_norm": 0.0113525390625, | |
| "learning_rate": 0.0007705368504167398, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 512384, | |
| "step": 1635, | |
| "train_runtime": 295.4336, | |
| "train_tokens_per_second": 1734.346 | |
| }, | |
| { | |
| "epoch": 18.22222222222222, | |
| "grad_norm": 0.01153564453125, | |
| "learning_rate": 0.0007251901460545118, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 513952, | |
| "step": 1640, | |
| "train_runtime": 296.2304, | |
| "train_tokens_per_second": 1734.974 | |
| }, | |
| { | |
| "epoch": 18.27777777777778, | |
| "grad_norm": 0.003753662109375, | |
| "learning_rate": 0.0006811855162840213, | |
| "loss": 0.238, | |
| "num_input_tokens_seen": 515520, | |
| "step": 1645, | |
| "train_runtime": 297.0246, | |
| "train_tokens_per_second": 1735.614 | |
| }, | |
| { | |
| "epoch": 18.333333333333332, | |
| "grad_norm": 0.003143310546875, | |
| "learning_rate": 0.0006385270982879065, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 517120, | |
| "step": 1650, | |
| "train_runtime": 297.8218, | |
| "train_tokens_per_second": 1736.34 | |
| }, | |
| { | |
| "epoch": 18.38888888888889, | |
| "grad_norm": 0.0034332275390625, | |
| "learning_rate": 0.0005972189026820351, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 518688, | |
| "step": 1655, | |
| "train_runtime": 298.6166, | |
| "train_tokens_per_second": 1736.97 | |
| }, | |
| { | |
| "epoch": 18.444444444444443, | |
| "grad_norm": 0.01287841796875, | |
| "learning_rate": 0.0005572648131384361, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 520224, | |
| "step": 1660, | |
| "train_runtime": 299.4059, | |
| "train_tokens_per_second": 1737.521 | |
| }, | |
| { | |
| "epoch": 18.5, | |
| "grad_norm": 0.01214599609375, | |
| "learning_rate": 0.0005186685860201717, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 521824, | |
| "step": 1665, | |
| "train_runtime": 300.2061, | |
| "train_tokens_per_second": 1738.219 | |
| }, | |
| { | |
| "epoch": 18.555555555555557, | |
| "grad_norm": 0.00579833984375, | |
| "learning_rate": 0.0004814338500281634, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 523424, | |
| "step": 1670, | |
| "train_runtime": 301.0084, | |
| "train_tokens_per_second": 1738.902 | |
| }, | |
| { | |
| "epoch": 18.61111111111111, | |
| "grad_norm": 0.004425048828125, | |
| "learning_rate": 0.0004455641058600529, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 524960, | |
| "step": 1675, | |
| "train_runtime": 301.8068, | |
| "train_tokens_per_second": 1739.391 | |
| }, | |
| { | |
| "epoch": 18.666666666666668, | |
| "grad_norm": 0.020751953125, | |
| "learning_rate": 0.00041106272588105564, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 526496, | |
| "step": 1680, | |
| "train_runtime": 302.6028, | |
| "train_tokens_per_second": 1739.891 | |
| }, | |
| { | |
| "epoch": 18.72222222222222, | |
| "grad_norm": 0.01141357421875, | |
| "learning_rate": 0.0003779329538069159, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 528064, | |
| "step": 1685, | |
| "train_runtime": 303.3978, | |
| "train_tokens_per_second": 1740.5 | |
| }, | |
| { | |
| "epoch": 18.77777777777778, | |
| "grad_norm": 0.00982666015625, | |
| "learning_rate": 0.00034617790439893603, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 529632, | |
| "step": 1690, | |
| "train_runtime": 304.1914, | |
| "train_tokens_per_second": 1741.114 | |
| }, | |
| { | |
| "epoch": 18.833333333333332, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.00031580056317113525, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 531232, | |
| "step": 1695, | |
| "train_runtime": 304.9886, | |
| "train_tokens_per_second": 1741.809 | |
| }, | |
| { | |
| "epoch": 18.88888888888889, | |
| "grad_norm": 0.0033721923828125, | |
| "learning_rate": 0.00028680378610956793, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 532800, | |
| "step": 1700, | |
| "train_runtime": 305.7795, | |
| "train_tokens_per_second": 1742.432 | |
| }, | |
| { | |
| "epoch": 18.944444444444443, | |
| "grad_norm": 0.00628662109375, | |
| "learning_rate": 0.00025919029940380146, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 534400, | |
| "step": 1705, | |
| "train_runtime": 306.5763, | |
| "train_tokens_per_second": 1743.123 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.005096435546875, | |
| "learning_rate": 0.0002329626991906164, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 535968, | |
| "step": 1710, | |
| "train_runtime": 307.4089, | |
| "train_tokens_per_second": 1743.502 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.23493099212646484, | |
| "eval_runtime": 0.8178, | |
| "eval_samples_per_second": 48.914, | |
| "eval_steps_per_second": 12.229, | |
| "num_input_tokens_seen": 535968, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 19.055555555555557, | |
| "grad_norm": 0.0224609375, | |
| "learning_rate": 0.00020812345130992503, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 537536, | |
| "step": 1715, | |
| "train_runtime": 309.9741, | |
| "train_tokens_per_second": 1734.132 | |
| }, | |
| { | |
| "epoch": 19.11111111111111, | |
| "grad_norm": 0.006378173828125, | |
| "learning_rate": 0.0001846748910729351, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 539072, | |
| "step": 1720, | |
| "train_runtime": 310.7692, | |
| "train_tokens_per_second": 1734.638 | |
| }, | |
| { | |
| "epoch": 19.166666666666668, | |
| "grad_norm": 0.0224609375, | |
| "learning_rate": 0.0001626192230425938, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 540608, | |
| "step": 1725, | |
| "train_runtime": 311.5604, | |
| "train_tokens_per_second": 1735.163 | |
| }, | |
| { | |
| "epoch": 19.22222222222222, | |
| "grad_norm": 0.00372314453125, | |
| "learning_rate": 0.00014195852082632686, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 542208, | |
| "step": 1730, | |
| "train_runtime": 312.3593, | |
| "train_tokens_per_second": 1735.847 | |
| }, | |
| { | |
| "epoch": 19.27777777777778, | |
| "grad_norm": 0.013671875, | |
| "learning_rate": 0.00012269472688107463, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 543776, | |
| "step": 1735, | |
| "train_runtime": 313.1529, | |
| "train_tokens_per_second": 1736.455 | |
| }, | |
| { | |
| "epoch": 19.333333333333332, | |
| "grad_norm": 0.0050048828125, | |
| "learning_rate": 0.00010482965233067298, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 545280, | |
| "step": 1740, | |
| "train_runtime": 313.9486, | |
| "train_tokens_per_second": 1736.845 | |
| }, | |
| { | |
| "epoch": 19.38888888888889, | |
| "grad_norm": 0.01202392578125, | |
| "learning_rate": 8.836497679557964e-05, | |
| "loss": 0.2422, | |
| "num_input_tokens_seen": 546848, | |
| "step": 1745, | |
| "train_runtime": 314.7443, | |
| "train_tokens_per_second": 1737.436 | |
| }, | |
| { | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 0.00311279296875, | |
| "learning_rate": 7.330224823495379e-05, | |
| "loss": 0.2369, | |
| "num_input_tokens_seen": 548416, | |
| "step": 1750, | |
| "train_runtime": 315.5381, | |
| "train_tokens_per_second": 1738.034 | |
| }, | |
| { | |
| "epoch": 19.5, | |
| "grad_norm": 0.01019287109375, | |
| "learning_rate": 5.96428828011325e-05, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 550016, | |
| "step": 1755, | |
| "train_runtime": 316.3398, | |
| "train_tokens_per_second": 1738.687 | |
| }, | |
| { | |
| "epoch": 19.555555555555557, | |
| "grad_norm": 0.0026092529296875, | |
| "learning_rate": 4.738816470647389e-05, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 551584, | |
| "step": 1760, | |
| "train_runtime": 317.1375, | |
| "train_tokens_per_second": 1739.258 | |
| }, | |
| { | |
| "epoch": 19.61111111111111, | |
| "grad_norm": 0.01263427734375, | |
| "learning_rate": 3.653924610263703e-05, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 553152, | |
| "step": 1765, | |
| "train_runtime": 317.9345, | |
| "train_tokens_per_second": 1739.83 | |
| }, | |
| { | |
| "epoch": 19.666666666666668, | |
| "grad_norm": 0.0211181640625, | |
| "learning_rate": 2.7097146972240305e-05, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 554752, | |
| "step": 1770, | |
| "train_runtime": 318.7296, | |
| "train_tokens_per_second": 1740.51 | |
| }, | |
| { | |
| "epoch": 19.72222222222222, | |
| "grad_norm": 0.004180908203125, | |
| "learning_rate": 1.9062755032984713e-05, | |
| "loss": 0.2235, | |
| "num_input_tokens_seen": 556288, | |
| "step": 1775, | |
| "train_runtime": 319.5234, | |
| "train_tokens_per_second": 1740.993 | |
| }, | |
| { | |
| "epoch": 19.77777777777778, | |
| "grad_norm": 0.00506591796875, | |
| "learning_rate": 1.2436825654180693e-05, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 557888, | |
| "step": 1780, | |
| "train_runtime": 320.3196, | |
| "train_tokens_per_second": 1741.661 | |
| }, | |
| { | |
| "epoch": 19.833333333333332, | |
| "grad_norm": 0.0048828125, | |
| "learning_rate": 7.219981785733242e-06, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 559424, | |
| "step": 1785, | |
| "train_runtime": 321.1115, | |
| "train_tokens_per_second": 1742.149 | |
| }, | |
| { | |
| "epoch": 19.88888888888889, | |
| "grad_norm": 0.007659912109375, | |
| "learning_rate": 3.4127138995787565e-06, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 560960, | |
| "step": 1790, | |
| "train_runtime": 321.902, | |
| "train_tokens_per_second": 1742.642 | |
| }, | |
| { | |
| "epoch": 19.944444444444443, | |
| "grad_norm": 0.01220703125, | |
| "learning_rate": 1.0153799435669298e-06, | |
| "loss": 0.2234, | |
| "num_input_tokens_seen": 562592, | |
| "step": 1795, | |
| "train_runtime": 322.6996, | |
| "train_tokens_per_second": 1743.393 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 2.820530780767161e-08, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 564096, | |
| "step": 1800, | |
| "train_runtime": 323.5286, | |
| "train_tokens_per_second": 1743.574 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.2323770523071289, | |
| "eval_runtime": 0.8134, | |
| "eval_samples_per_second": 49.178, | |
| "eval_steps_per_second": 12.295, | |
| "num_input_tokens_seen": 564096, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "num_input_tokens_seen": 564096, | |
| "step": 1800, | |
| "total_flos": 2.540098792665907e+16, | |
| "train_loss": 0.2664620706770155, | |
| "train_runtime": 325.1935, | |
| "train_samples_per_second": 22.141, | |
| "train_steps_per_second": 5.535 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1800, | |
| "num_input_tokens_seen": 564096, | |
| "num_train_epochs": 20, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.540098792665907e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |