| { |
| "best_global_step": 1106, |
| "best_metric": 0.04775509238243103, |
| "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_svamp_42_1760623621/checkpoint-1106", |
| "epoch": 20.0, |
| "eval_steps": 158, |
| "global_step": 3160, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03164556962025317, |
| "grad_norm": 9.375, |
| "learning_rate": 0.00037974683544303797, |
| "loss": 2.2551, |
| "num_input_tokens_seen": 2336, |
| "step": 5, |
| "train_runtime": 2.9586, |
| "train_tokens_per_second": 789.551 |
| }, |
| { |
| "epoch": 0.06329113924050633, |
| "grad_norm": 11.75, |
| "learning_rate": 0.0008544303797468354, |
| "loss": 1.4629, |
| "num_input_tokens_seen": 4672, |
| "step": 10, |
| "train_runtime": 3.9467, |
| "train_tokens_per_second": 1183.768 |
| }, |
| { |
| "epoch": 0.0949367088607595, |
| "grad_norm": 35.5, |
| "learning_rate": 0.001329113924050633, |
| "loss": 0.621, |
| "num_input_tokens_seen": 6912, |
| "step": 15, |
| "train_runtime": 4.9324, |
| "train_tokens_per_second": 1401.334 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "grad_norm": 3.71875, |
| "learning_rate": 0.0018037974683544303, |
| "loss": 0.1941, |
| "num_input_tokens_seen": 9152, |
| "step": 20, |
| "train_runtime": 5.9039, |
| "train_tokens_per_second": 1550.15 |
| }, |
| { |
| "epoch": 0.15822784810126583, |
| "grad_norm": 6.90625, |
| "learning_rate": 0.002278481012658228, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 11456, |
| "step": 25, |
| "train_runtime": 6.8902, |
| "train_tokens_per_second": 1662.643 |
| }, |
| { |
| "epoch": 0.189873417721519, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.0027531645569620253, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 13792, |
| "step": 30, |
| "train_runtime": 7.8811, |
| "train_tokens_per_second": 1750.02 |
| }, |
| { |
| "epoch": 0.22151898734177214, |
| "grad_norm": 14.5625, |
| "learning_rate": 0.0032278481012658227, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 16064, |
| "step": 35, |
| "train_runtime": 8.8602, |
| "train_tokens_per_second": 1813.052 |
| }, |
| { |
| "epoch": 0.25316455696202533, |
| "grad_norm": 30.625, |
| "learning_rate": 0.00370253164556962, |
| "loss": 0.3135, |
| "num_input_tokens_seen": 18336, |
| "step": 40, |
| "train_runtime": 9.8367, |
| "train_tokens_per_second": 1864.043 |
| }, |
| { |
| "epoch": 0.2848101265822785, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.004177215189873417, |
| "loss": 1.6481, |
| "num_input_tokens_seen": 20640, |
| "step": 45, |
| "train_runtime": 10.817, |
| "train_tokens_per_second": 1908.101 |
| }, |
| { |
| "epoch": 0.31645569620253167, |
| "grad_norm": 35.25, |
| "learning_rate": 0.0046518987341772145, |
| "loss": 1.5985, |
| "num_input_tokens_seen": 22880, |
| "step": 50, |
| "train_runtime": 11.7934, |
| "train_tokens_per_second": 1940.068 |
| }, |
| { |
| "epoch": 0.34810126582278483, |
| "grad_norm": 7.59375, |
| "learning_rate": 0.005126582278481013, |
| "loss": 0.3414, |
| "num_input_tokens_seen": 25088, |
| "step": 55, |
| "train_runtime": 12.7651, |
| "train_tokens_per_second": 1965.361 |
| }, |
| { |
| "epoch": 0.379746835443038, |
| "grad_norm": 8.8125, |
| "learning_rate": 0.00560126582278481, |
| "loss": 0.4014, |
| "num_input_tokens_seen": 27392, |
| "step": 60, |
| "train_runtime": 13.7552, |
| "train_tokens_per_second": 1991.388 |
| }, |
| { |
| "epoch": 0.41139240506329117, |
| "grad_norm": 5.03125, |
| "learning_rate": 0.0060759493670886075, |
| "loss": 0.7651, |
| "num_input_tokens_seen": 29728, |
| "step": 65, |
| "train_runtime": 14.7552, |
| "train_tokens_per_second": 2014.75 |
| }, |
| { |
| "epoch": 0.4430379746835443, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.006550632911392405, |
| "loss": 1.8753, |
| "num_input_tokens_seen": 32064, |
| "step": 70, |
| "train_runtime": 15.7473, |
| "train_tokens_per_second": 2036.159 |
| }, |
| { |
| "epoch": 0.47468354430379744, |
| "grad_norm": 3.640625, |
| "learning_rate": 0.007025316455696202, |
| "loss": 0.1454, |
| "num_input_tokens_seen": 34400, |
| "step": 75, |
| "train_runtime": 16.739, |
| "train_tokens_per_second": 2055.084 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.0075, |
| "loss": 0.2766, |
| "num_input_tokens_seen": 36704, |
| "step": 80, |
| "train_runtime": 17.7274, |
| "train_tokens_per_second": 2070.462 |
| }, |
| { |
| "epoch": 0.5379746835443038, |
| "grad_norm": 22.75, |
| "learning_rate": 0.007974683544303796, |
| "loss": 0.5439, |
| "num_input_tokens_seen": 38880, |
| "step": 85, |
| "train_runtime": 18.6936, |
| "train_tokens_per_second": 2079.855 |
| }, |
| { |
| "epoch": 0.569620253164557, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.008449367088607595, |
| "loss": 1.2058, |
| "num_input_tokens_seen": 41120, |
| "step": 90, |
| "train_runtime": 19.6663, |
| "train_tokens_per_second": 2090.882 |
| }, |
| { |
| "epoch": 0.6012658227848101, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.008924050632911391, |
| "loss": 0.4753, |
| "num_input_tokens_seen": 43232, |
| "step": 95, |
| "train_runtime": 20.6294, |
| "train_tokens_per_second": 2095.655 |
| }, |
| { |
| "epoch": 0.6329113924050633, |
| "grad_norm": 11.8125, |
| "learning_rate": 0.00939873417721519, |
| "loss": 5.5594, |
| "num_input_tokens_seen": 45504, |
| "step": 100, |
| "train_runtime": 21.618, |
| "train_tokens_per_second": 2104.912 |
| }, |
| { |
| "epoch": 0.6645569620253164, |
| "grad_norm": 7.0, |
| "learning_rate": 0.009873417721518986, |
| "loss": 3.1616, |
| "num_input_tokens_seen": 47872, |
| "step": 105, |
| "train_runtime": 22.608, |
| "train_tokens_per_second": 2117.482 |
| }, |
| { |
| "epoch": 0.6962025316455697, |
| "grad_norm": 18.25, |
| "learning_rate": 0.010348101265822784, |
| "loss": 2.1289, |
| "num_input_tokens_seen": 49952, |
| "step": 110, |
| "train_runtime": 23.5679, |
| "train_tokens_per_second": 2119.497 |
| }, |
| { |
| "epoch": 0.7278481012658228, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.01082278481012658, |
| "loss": 1.2601, |
| "num_input_tokens_seen": 52288, |
| "step": 115, |
| "train_runtime": 24.5454, |
| "train_tokens_per_second": 2130.257 |
| }, |
| { |
| "epoch": 0.759493670886076, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.011297468354430379, |
| "loss": 0.433, |
| "num_input_tokens_seen": 54592, |
| "step": 120, |
| "train_runtime": 25.5344, |
| "train_tokens_per_second": 2137.983 |
| }, |
| { |
| "epoch": 0.7911392405063291, |
| "grad_norm": 0.7578125, |
| "learning_rate": 0.011772151898734175, |
| "loss": 0.1428, |
| "num_input_tokens_seen": 56864, |
| "step": 125, |
| "train_runtime": 26.5128, |
| "train_tokens_per_second": 2144.779 |
| }, |
| { |
| "epoch": 0.8227848101265823, |
| "grad_norm": 1.875, |
| "learning_rate": 0.012246835443037974, |
| "loss": 0.263, |
| "num_input_tokens_seen": 59104, |
| "step": 130, |
| "train_runtime": 27.488, |
| "train_tokens_per_second": 2150.174 |
| }, |
| { |
| "epoch": 0.8544303797468354, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.012721518987341772, |
| "loss": 0.6412, |
| "num_input_tokens_seen": 61408, |
| "step": 135, |
| "train_runtime": 28.4783, |
| "train_tokens_per_second": 2156.311 |
| }, |
| { |
| "epoch": 0.8860759493670886, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.01319620253164557, |
| "loss": 0.3362, |
| "num_input_tokens_seen": 63616, |
| "step": 140, |
| "train_runtime": 29.4521, |
| "train_tokens_per_second": 2159.982 |
| }, |
| { |
| "epoch": 0.9177215189873418, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.013670886075949367, |
| "loss": 0.3041, |
| "num_input_tokens_seen": 65824, |
| "step": 145, |
| "train_runtime": 30.4243, |
| "train_tokens_per_second": 2163.531 |
| }, |
| { |
| "epoch": 0.9493670886075949, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.014145569620253165, |
| "loss": 0.1369, |
| "num_input_tokens_seen": 68064, |
| "step": 150, |
| "train_runtime": 31.3981, |
| "train_tokens_per_second": 2167.774 |
| }, |
| { |
| "epoch": 0.9810126582278481, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.014620253164556962, |
| "loss": 0.1167, |
| "num_input_tokens_seen": 70432, |
| "step": 155, |
| "train_runtime": 32.3917, |
| "train_tokens_per_second": 2174.381 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.09709509462118149, |
| "eval_runtime": 1.6655, |
| "eval_samples_per_second": 42.028, |
| "eval_steps_per_second": 10.807, |
| "num_input_tokens_seen": 71568, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.0126582278481013, |
| "grad_norm": 0.185546875, |
| "learning_rate": 0.01509493670886076, |
| "loss": 0.168, |
| "num_input_tokens_seen": 72400, |
| "step": 160, |
| "train_runtime": 35.9024, |
| "train_tokens_per_second": 2016.576 |
| }, |
| { |
| "epoch": 1.0443037974683544, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.015569620253164556, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 74576, |
| "step": 165, |
| "train_runtime": 36.8752, |
| "train_tokens_per_second": 2022.387 |
| }, |
| { |
| "epoch": 1.0759493670886076, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.01604430379746835, |
| "loss": 0.1079, |
| "num_input_tokens_seen": 76944, |
| "step": 170, |
| "train_runtime": 37.8783, |
| "train_tokens_per_second": 2031.348 |
| }, |
| { |
| "epoch": 1.1075949367088607, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.01651898734177215, |
| "loss": 0.1488, |
| "num_input_tokens_seen": 79184, |
| "step": 175, |
| "train_runtime": 38.8505, |
| "train_tokens_per_second": 2038.175 |
| }, |
| { |
| "epoch": 1.139240506329114, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.016993670886075948, |
| "loss": 0.0581, |
| "num_input_tokens_seen": 81424, |
| "step": 180, |
| "train_runtime": 39.8224, |
| "train_tokens_per_second": 2044.679 |
| }, |
| { |
| "epoch": 1.1708860759493671, |
| "grad_norm": 0.404296875, |
| "learning_rate": 0.017468354430379748, |
| "loss": 0.1631, |
| "num_input_tokens_seen": 83760, |
| "step": 185, |
| "train_runtime": 40.8036, |
| "train_tokens_per_second": 2052.762 |
| }, |
| { |
| "epoch": 1.2025316455696202, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.017943037974683544, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 86032, |
| "step": 190, |
| "train_runtime": 41.7895, |
| "train_tokens_per_second": 2058.699 |
| }, |
| { |
| "epoch": 1.2341772151898733, |
| "grad_norm": 0.169921875, |
| "learning_rate": 0.018417721518987344, |
| "loss": 0.0969, |
| "num_input_tokens_seen": 88368, |
| "step": 195, |
| "train_runtime": 42.785, |
| "train_tokens_per_second": 2065.397 |
| }, |
| { |
| "epoch": 1.2658227848101267, |
| "grad_norm": 0.142578125, |
| "learning_rate": 0.01889240506329114, |
| "loss": 0.0895, |
| "num_input_tokens_seen": 90608, |
| "step": 200, |
| "train_runtime": 43.7627, |
| "train_tokens_per_second": 2070.439 |
| }, |
| { |
| "epoch": 1.2974683544303798, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.019367088607594937, |
| "loss": 0.1284, |
| "num_input_tokens_seen": 93040, |
| "step": 205, |
| "train_runtime": 44.7708, |
| "train_tokens_per_second": 2078.138 |
| }, |
| { |
| "epoch": 1.3291139240506329, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.019841772151898734, |
| "loss": 0.1272, |
| "num_input_tokens_seen": 95376, |
| "step": 210, |
| "train_runtime": 45.7696, |
| "train_tokens_per_second": 2083.83 |
| }, |
| { |
| "epoch": 1.360759493670886, |
| "grad_norm": 0.107421875, |
| "learning_rate": 0.02031645569620253, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 97616, |
| "step": 215, |
| "train_runtime": 46.7523, |
| "train_tokens_per_second": 2087.942 |
| }, |
| { |
| "epoch": 1.3924050632911391, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 0.02079113924050633, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 99760, |
| "step": 220, |
| "train_runtime": 47.7222, |
| "train_tokens_per_second": 2090.431 |
| }, |
| { |
| "epoch": 1.4240506329113924, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.021265822784810127, |
| "loss": 0.164, |
| "num_input_tokens_seen": 102064, |
| "step": 225, |
| "train_runtime": 48.705, |
| "train_tokens_per_second": 2095.553 |
| }, |
| { |
| "epoch": 1.4556962025316456, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.021740506329113923, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 104304, |
| "step": 230, |
| "train_runtime": 49.686, |
| "train_tokens_per_second": 2099.264 |
| }, |
| { |
| "epoch": 1.4873417721518987, |
| "grad_norm": 0.099609375, |
| "learning_rate": 0.02221518987341772, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 106608, |
| "step": 235, |
| "train_runtime": 50.6626, |
| "train_tokens_per_second": 2104.274 |
| }, |
| { |
| "epoch": 1.518987341772152, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.02268987341772152, |
| "loss": 0.0738, |
| "num_input_tokens_seen": 108752, |
| "step": 240, |
| "train_runtime": 51.6278, |
| "train_tokens_per_second": 2106.46 |
| }, |
| { |
| "epoch": 1.5506329113924051, |
| "grad_norm": 0.1630859375, |
| "learning_rate": 0.023164556962025316, |
| "loss": 0.1385, |
| "num_input_tokens_seen": 111088, |
| "step": 245, |
| "train_runtime": 52.6192, |
| "train_tokens_per_second": 2111.17 |
| }, |
| { |
| "epoch": 1.5822784810126582, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 0.023639240506329113, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 113328, |
| "step": 250, |
| "train_runtime": 53.5922, |
| "train_tokens_per_second": 2114.635 |
| }, |
| { |
| "epoch": 1.6139240506329116, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.02411392405063291, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 115600, |
| "step": 255, |
| "train_runtime": 54.578, |
| "train_tokens_per_second": 2118.069 |
| }, |
| { |
| "epoch": 1.6455696202531644, |
| "grad_norm": 0.1640625, |
| "learning_rate": 0.02458860759493671, |
| "loss": 0.1101, |
| "num_input_tokens_seen": 117936, |
| "step": 260, |
| "train_runtime": 55.5669, |
| "train_tokens_per_second": 2122.413 |
| }, |
| { |
| "epoch": 1.6772151898734178, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.025063291139240506, |
| "loss": 0.1231, |
| "num_input_tokens_seen": 120176, |
| "step": 265, |
| "train_runtime": 56.5444, |
| "train_tokens_per_second": 2125.338 |
| }, |
| { |
| "epoch": 1.7088607594936709, |
| "grad_norm": 0.054931640625, |
| "learning_rate": 0.025537974683544303, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 122320, |
| "step": 270, |
| "train_runtime": 57.5142, |
| "train_tokens_per_second": 2126.778 |
| }, |
| { |
| "epoch": 1.740506329113924, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.0260126582278481, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 124752, |
| "step": 275, |
| "train_runtime": 58.5126, |
| "train_tokens_per_second": 2132.054 |
| }, |
| { |
| "epoch": 1.7721518987341773, |
| "grad_norm": 0.162109375, |
| "learning_rate": 0.0264873417721519, |
| "loss": 0.1192, |
| "num_input_tokens_seen": 127216, |
| "step": 280, |
| "train_runtime": 59.5352, |
| "train_tokens_per_second": 2136.82 |
| }, |
| { |
| "epoch": 1.8037974683544302, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 0.026962025316455696, |
| "loss": 0.1226, |
| "num_input_tokens_seen": 129424, |
| "step": 285, |
| "train_runtime": 60.5092, |
| "train_tokens_per_second": 2138.915 |
| }, |
| { |
| "epoch": 1.8354430379746836, |
| "grad_norm": 0.15234375, |
| "learning_rate": 0.027436708860759492, |
| "loss": 0.0977, |
| "num_input_tokens_seen": 131632, |
| "step": 290, |
| "train_runtime": 61.4848, |
| "train_tokens_per_second": 2140.888 |
| }, |
| { |
| "epoch": 1.8670886075949367, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.02791139240506329, |
| "loss": 0.1471, |
| "num_input_tokens_seen": 133808, |
| "step": 295, |
| "train_runtime": 62.4572, |
| "train_tokens_per_second": 2142.395 |
| }, |
| { |
| "epoch": 1.8987341772151898, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 0.02838607594936709, |
| "loss": 0.1052, |
| "num_input_tokens_seen": 136144, |
| "step": 300, |
| "train_runtime": 63.4497, |
| "train_tokens_per_second": 2145.7 |
| }, |
| { |
| "epoch": 1.9303797468354431, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 0.028860759493670885, |
| "loss": 0.0865, |
| "num_input_tokens_seen": 138384, |
| "step": 305, |
| "train_runtime": 64.4245, |
| "train_tokens_per_second": 2148.004 |
| }, |
| { |
| "epoch": 1.9620253164556962, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.02933544303797468, |
| "loss": 0.1386, |
| "num_input_tokens_seen": 140656, |
| "step": 310, |
| "train_runtime": 65.4004, |
| "train_tokens_per_second": 2150.689 |
| }, |
| { |
| "epoch": 1.9936708860759493, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.029810126582278478, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 142992, |
| "step": 315, |
| "train_runtime": 66.4373, |
| "train_tokens_per_second": 2152.285 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.07142843306064606, |
| "eval_runtime": 1.6714, |
| "eval_samples_per_second": 41.882, |
| "eval_steps_per_second": 10.77, |
| "num_input_tokens_seen": 143232, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.0253164556962027, |
| "grad_norm": 0.036376953125, |
| "learning_rate": 0.02999991763476599, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 145056, |
| "step": 320, |
| "train_runtime": 70.098, |
| "train_tokens_per_second": 2069.33 |
| }, |
| { |
| "epoch": 2.0569620253164556, |
| "grad_norm": 0.0439453125, |
| "learning_rate": 0.02999941429494495, |
| "loss": 0.0793, |
| "num_input_tokens_seen": 147360, |
| "step": 325, |
| "train_runtime": 71.0787, |
| "train_tokens_per_second": 2073.194 |
| }, |
| { |
| "epoch": 2.088607594936709, |
| "grad_norm": 0.04736328125, |
| "learning_rate": 0.02999845338910228, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 149568, |
| "step": 330, |
| "train_runtime": 72.054, |
| "train_tokens_per_second": 2075.777 |
| }, |
| { |
| "epoch": 2.1202531645569622, |
| "grad_norm": 0.0634765625, |
| "learning_rate": 0.029997034946550982, |
| "loss": 0.1134, |
| "num_input_tokens_seen": 151872, |
| "step": 335, |
| "train_runtime": 73.033, |
| "train_tokens_per_second": 2079.498 |
| }, |
| { |
| "epoch": 2.151898734177215, |
| "grad_norm": 0.0732421875, |
| "learning_rate": 0.029995159010561483, |
| "loss": 0.044, |
| "num_input_tokens_seen": 154080, |
| "step": 340, |
| "train_runtime": 74.0082, |
| "train_tokens_per_second": 2081.933 |
| }, |
| { |
| "epoch": 2.1835443037974684, |
| "grad_norm": 0.04296875, |
| "learning_rate": 0.029992825638360327, |
| "loss": 0.047, |
| "num_input_tokens_seen": 156288, |
| "step": 345, |
| "train_runtime": 74.9789, |
| "train_tokens_per_second": 2084.427 |
| }, |
| { |
| "epoch": 2.2151898734177213, |
| "grad_norm": 0.032470703125, |
| "learning_rate": 0.02999003490112841, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 158624, |
| "step": 350, |
| "train_runtime": 75.9723, |
| "train_tokens_per_second": 2087.92 |
| }, |
| { |
| "epoch": 2.2468354430379747, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.029986786883998827, |
| "loss": 0.019, |
| "num_input_tokens_seen": 160864, |
| "step": 355, |
| "train_runtime": 76.9879, |
| "train_tokens_per_second": 2089.471 |
| }, |
| { |
| "epoch": 2.278481012658228, |
| "grad_norm": 0.1416015625, |
| "learning_rate": 0.029983081686054267, |
| "loss": 0.1553, |
| "num_input_tokens_seen": 163136, |
| "step": 360, |
| "train_runtime": 78.0606, |
| "train_tokens_per_second": 2089.864 |
| }, |
| { |
| "epoch": 2.310126582278481, |
| "grad_norm": 0.031005859375, |
| "learning_rate": 0.02997891942032399, |
| "loss": 0.0626, |
| "num_input_tokens_seen": 165408, |
| "step": 365, |
| "train_runtime": 79.1252, |
| "train_tokens_per_second": 2090.46 |
| }, |
| { |
| "epoch": 2.3417721518987342, |
| "grad_norm": 0.16796875, |
| "learning_rate": 0.029974300213780378, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 167712, |
| "step": 370, |
| "train_runtime": 80.115, |
| "train_tokens_per_second": 2093.39 |
| }, |
| { |
| "epoch": 2.3734177215189876, |
| "grad_norm": 0.1474609375, |
| "learning_rate": 0.02996922420733506, |
| "loss": 0.1164, |
| "num_input_tokens_seen": 169824, |
| "step": 375, |
| "train_runtime": 81.0779, |
| "train_tokens_per_second": 2094.579 |
| }, |
| { |
| "epoch": 2.4050632911392404, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.029963691555834625, |
| "loss": 0.153, |
| "num_input_tokens_seen": 171968, |
| "step": 380, |
| "train_runtime": 82.0451, |
| "train_tokens_per_second": 2096.018 |
| }, |
| { |
| "epoch": 2.4367088607594938, |
| "grad_norm": 0.1220703125, |
| "learning_rate": 0.02995770242805588, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 174272, |
| "step": 385, |
| "train_runtime": 83.0327, |
| "train_tokens_per_second": 2098.836 |
| }, |
| { |
| "epoch": 2.4683544303797467, |
| "grad_norm": 0.173828125, |
| "learning_rate": 0.029951257006700725, |
| "loss": 0.1133, |
| "num_input_tokens_seen": 176576, |
| "step": 390, |
| "train_runtime": 84.014, |
| "train_tokens_per_second": 2101.745 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 10.125, |
| "learning_rate": 0.029944355488390553, |
| "loss": 1.7205, |
| "num_input_tokens_seen": 178848, |
| "step": 395, |
| "train_runtime": 85.0012, |
| "train_tokens_per_second": 2104.066 |
| }, |
| { |
| "epoch": 2.5316455696202533, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.029936998083660273, |
| "loss": 1.1129, |
| "num_input_tokens_seen": 181120, |
| "step": 400, |
| "train_runtime": 85.989, |
| "train_tokens_per_second": 2106.315 |
| }, |
| { |
| "epoch": 2.5632911392405062, |
| "grad_norm": 0.1962890625, |
| "learning_rate": 0.029929185016951868, |
| "loss": 0.174, |
| "num_input_tokens_seen": 183232, |
| "step": 405, |
| "train_runtime": 86.955, |
| "train_tokens_per_second": 2107.205 |
| }, |
| { |
| "epoch": 2.5949367088607596, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 0.02992091652660758, |
| "loss": 0.1194, |
| "num_input_tokens_seen": 185504, |
| "step": 410, |
| "train_runtime": 87.9423, |
| "train_tokens_per_second": 2109.383 |
| }, |
| { |
| "epoch": 2.6265822784810124, |
| "grad_norm": 0.1435546875, |
| "learning_rate": 0.029912192864862595, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 187808, |
| "step": 415, |
| "train_runtime": 88.9212, |
| "train_tokens_per_second": 2112.072 |
| }, |
| { |
| "epoch": 2.6582278481012658, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.029903014297837396, |
| "loss": 0.1169, |
| "num_input_tokens_seen": 190016, |
| "step": 420, |
| "train_runtime": 89.8976, |
| "train_tokens_per_second": 2113.693 |
| }, |
| { |
| "epoch": 2.689873417721519, |
| "grad_norm": 0.1357421875, |
| "learning_rate": 0.0298933811055296, |
| "loss": 0.1, |
| "num_input_tokens_seen": 192480, |
| "step": 425, |
| "train_runtime": 90.9192, |
| "train_tokens_per_second": 2117.044 |
| }, |
| { |
| "epoch": 2.721518987341772, |
| "grad_norm": 0.37890625, |
| "learning_rate": 0.029883293581805453, |
| "loss": 0.1351, |
| "num_input_tokens_seen": 194880, |
| "step": 430, |
| "train_runtime": 91.9268, |
| "train_tokens_per_second": 2119.947 |
| }, |
| { |
| "epoch": 2.7531645569620253, |
| "grad_norm": 0.123046875, |
| "learning_rate": 0.029872752034390833, |
| "loss": 0.128, |
| "num_input_tokens_seen": 197152, |
| "step": 435, |
| "train_runtime": 92.9052, |
| "train_tokens_per_second": 2122.076 |
| }, |
| { |
| "epoch": 2.7848101265822782, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 0.029861756784861908, |
| "loss": 0.0751, |
| "num_input_tokens_seen": 199328, |
| "step": 440, |
| "train_runtime": 93.878, |
| "train_tokens_per_second": 2123.267 |
| }, |
| { |
| "epoch": 2.8164556962025316, |
| "grad_norm": 0.033203125, |
| "learning_rate": 0.029850308168635264, |
| "loss": 0.1457, |
| "num_input_tokens_seen": 201664, |
| "step": 445, |
| "train_runtime": 94.8626, |
| "train_tokens_per_second": 2125.855 |
| }, |
| { |
| "epoch": 2.848101265822785, |
| "grad_norm": 0.08984375, |
| "learning_rate": 0.02983840653495774, |
| "loss": 0.0665, |
| "num_input_tokens_seen": 204000, |
| "step": 450, |
| "train_runtime": 95.8542, |
| "train_tokens_per_second": 2128.231 |
| }, |
| { |
| "epoch": 2.879746835443038, |
| "grad_norm": 0.068359375, |
| "learning_rate": 0.029826052246895707, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 206432, |
| "step": 455, |
| "train_runtime": 96.8728, |
| "train_tokens_per_second": 2130.959 |
| }, |
| { |
| "epoch": 2.911392405063291, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.029813245681324055, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 208672, |
| "step": 460, |
| "train_runtime": 97.8462, |
| "train_tokens_per_second": 2132.653 |
| }, |
| { |
| "epoch": 2.9430379746835444, |
| "grad_norm": 0.1826171875, |
| "learning_rate": 0.02979998722891465, |
| "loss": 0.0907, |
| "num_input_tokens_seen": 210976, |
| "step": 465, |
| "train_runtime": 98.8269, |
| "train_tokens_per_second": 2134.804 |
| }, |
| { |
| "epoch": 2.9746835443037973, |
| "grad_norm": 0.01556396484375, |
| "learning_rate": 0.029786277294124443, |
| "loss": 0.0498, |
| "num_input_tokens_seen": 213248, |
| "step": 470, |
| "train_runtime": 99.8061, |
| "train_tokens_per_second": 2136.623 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.07707643508911133, |
| "eval_runtime": 1.6778, |
| "eval_samples_per_second": 41.721, |
| "eval_steps_per_second": 10.728, |
| "num_input_tokens_seen": 214912, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.0063291139240507, |
| "grad_norm": 0.02978515625, |
| "learning_rate": 0.029772116295183122, |
| "loss": 0.029, |
| "num_input_tokens_seen": 215360, |
| "step": 475, |
| "train_runtime": 103.3347, |
| "train_tokens_per_second": 2084.101 |
| }, |
| { |
| "epoch": 3.037974683544304, |
| "grad_norm": 0.019775390625, |
| "learning_rate": 0.02975750466408034, |
| "loss": 0.0487, |
| "num_input_tokens_seen": 217632, |
| "step": 480, |
| "train_runtime": 104.3119, |
| "train_tokens_per_second": 2086.359 |
| }, |
| { |
| "epoch": 3.069620253164557, |
| "grad_norm": 0.028076171875, |
| "learning_rate": 0.029742442846552575, |
| "loss": 0.0696, |
| "num_input_tokens_seen": 219936, |
| "step": 485, |
| "train_runtime": 105.3099, |
| "train_tokens_per_second": 2088.464 |
| }, |
| { |
| "epoch": 3.1012658227848102, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.029726931302069493, |
| "loss": 0.0665, |
| "num_input_tokens_seen": 222304, |
| "step": 490, |
| "train_runtime": 106.3036, |
| "train_tokens_per_second": 2091.217 |
| }, |
| { |
| "epoch": 3.132911392405063, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.029710970503819947, |
| "loss": 0.0542, |
| "num_input_tokens_seen": 224448, |
| "step": 495, |
| "train_runtime": 107.2731, |
| "train_tokens_per_second": 2092.304 |
| }, |
| { |
| "epoch": 3.1645569620253164, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 0.029694560938697545, |
| "loss": 0.0892, |
| "num_input_tokens_seen": 226752, |
| "step": 500, |
| "train_runtime": 108.2551, |
| "train_tokens_per_second": 2094.609 |
| }, |
| { |
| "epoch": 3.1962025316455698, |
| "grad_norm": 0.080078125, |
| "learning_rate": 0.029677703107285798, |
| "loss": 0.1341, |
| "num_input_tokens_seen": 229088, |
| "step": 505, |
| "train_runtime": 109.2387, |
| "train_tokens_per_second": 2097.132 |
| }, |
| { |
| "epoch": 3.2278481012658227, |
| "grad_norm": 0.02783203125, |
| "learning_rate": 0.029660397523842846, |
| "loss": 0.026, |
| "num_input_tokens_seen": 231232, |
| "step": 510, |
| "train_runtime": 110.218, |
| "train_tokens_per_second": 2097.951 |
| }, |
| { |
| "epoch": 3.259493670886076, |
| "grad_norm": 0.08642578125, |
| "learning_rate": 0.029642644716285763, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 233568, |
| "step": 515, |
| "train_runtime": 111.2009, |
| "train_tokens_per_second": 2100.415 |
| }, |
| { |
| "epoch": 3.291139240506329, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 0.02962444522617446, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 235808, |
| "step": 520, |
| "train_runtime": 112.1832, |
| "train_tokens_per_second": 2101.991 |
| }, |
| { |
| "epoch": 3.3227848101265822, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 0.02960579960869518, |
| "loss": 0.1296, |
| "num_input_tokens_seen": 238080, |
| "step": 525, |
| "train_runtime": 113.1698, |
| "train_tokens_per_second": 2103.741 |
| }, |
| { |
| "epoch": 3.3544303797468356, |
| "grad_norm": 0.05859375, |
| "learning_rate": 0.029586708432643525, |
| "loss": 0.0445, |
| "num_input_tokens_seen": 240288, |
| "step": 530, |
| "train_runtime": 114.1467, |
| "train_tokens_per_second": 2105.081 |
| }, |
| { |
| "epoch": 3.3860759493670884, |
| "grad_norm": 0.068359375, |
| "learning_rate": 0.029567172280407134, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 242464, |
| "step": 535, |
| "train_runtime": 115.1213, |
| "train_tokens_per_second": 2106.16 |
| }, |
| { |
| "epoch": 3.4177215189873418, |
| "grad_norm": 0.05078125, |
| "learning_rate": 0.02954719174794791, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 244736, |
| "step": 540, |
| "train_runtime": 116.108, |
| "train_tokens_per_second": 2107.83 |
| }, |
| { |
| "epoch": 3.449367088607595, |
| "grad_norm": 0.05517578125, |
| "learning_rate": 0.02952676744478383, |
| "loss": 0.0769, |
| "num_input_tokens_seen": 247072, |
| "step": 545, |
| "train_runtime": 117.0993, |
| "train_tokens_per_second": 2109.936 |
| }, |
| { |
| "epoch": 3.481012658227848, |
| "grad_norm": 0.046142578125, |
| "learning_rate": 0.029505899993970373, |
| "loss": 0.037, |
| "num_input_tokens_seen": 249376, |
| "step": 550, |
| "train_runtime": 118.0787, |
| "train_tokens_per_second": 2111.947 |
| }, |
| { |
| "epoch": 3.5126582278481013, |
| "grad_norm": 0.032470703125, |
| "learning_rate": 0.029484590032081488, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 251680, |
| "step": 555, |
| "train_runtime": 119.0678, |
| "train_tokens_per_second": 2113.753 |
| }, |
| { |
| "epoch": 3.5443037974683547, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.029462838209190195, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 254048, |
| "step": 560, |
| "train_runtime": 120.0624, |
| "train_tokens_per_second": 2115.966 |
| }, |
| { |
| "epoch": 3.5759493670886076, |
| "grad_norm": 0.125, |
| "learning_rate": 0.029440645188848733, |
| "loss": 0.0936, |
| "num_input_tokens_seen": 256256, |
| "step": 565, |
| "train_runtime": 121.034, |
| "train_tokens_per_second": 2117.223 |
| }, |
| { |
| "epoch": 3.607594936708861, |
| "grad_norm": 0.03466796875, |
| "learning_rate": 0.029418011648068353, |
| "loss": 0.0574, |
| "num_input_tokens_seen": 258592, |
| "step": 570, |
| "train_runtime": 122.0267, |
| "train_tokens_per_second": 2119.144 |
| }, |
| { |
| "epoch": 3.6392405063291138, |
| "grad_norm": 0.05322265625, |
| "learning_rate": 0.029394938277298614, |
| "loss": 0.0827, |
| "num_input_tokens_seen": 260832, |
| "step": 575, |
| "train_runtime": 123.0027, |
| "train_tokens_per_second": 2120.539 |
| }, |
| { |
| "epoch": 3.670886075949367, |
| "grad_norm": 0.05810546875, |
| "learning_rate": 0.029371425780406368, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 263168, |
| "step": 580, |
| "train_runtime": 123.9966, |
| "train_tokens_per_second": 2122.38 |
| }, |
| { |
| "epoch": 3.7025316455696204, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 0.029347474874654274, |
| "loss": 0.0399, |
| "num_input_tokens_seen": 265344, |
| "step": 585, |
| "train_runtime": 124.9725, |
| "train_tokens_per_second": 2123.219 |
| }, |
| { |
| "epoch": 3.7341772151898733, |
| "grad_norm": 0.0849609375, |
| "learning_rate": 0.029323086290678896, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 267552, |
| "step": 590, |
| "train_runtime": 125.9463, |
| "train_tokens_per_second": 2124.334 |
| }, |
| { |
| "epoch": 3.7658227848101267, |
| "grad_norm": 0.054443359375, |
| "learning_rate": 0.02929826077246845, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 269760, |
| "step": 595, |
| "train_runtime": 126.921, |
| "train_tokens_per_second": 2125.416 |
| }, |
| { |
| "epoch": 3.7974683544303796, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 0.029272999077340066, |
| "loss": 0.0445, |
| "num_input_tokens_seen": 272000, |
| "step": 600, |
| "train_runtime": 127.8981, |
| "train_tokens_per_second": 2126.693 |
| }, |
| { |
| "epoch": 3.829113924050633, |
| "grad_norm": 0.0546875, |
| "learning_rate": 0.02924730197591674, |
| "loss": 0.05, |
| "num_input_tokens_seen": 274304, |
| "step": 605, |
| "train_runtime": 128.8903, |
| "train_tokens_per_second": 2128.197 |
| }, |
| { |
| "epoch": 3.8607594936708862, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 0.029221170252103766, |
| "loss": 0.0443, |
| "num_input_tokens_seen": 276608, |
| "step": 610, |
| "train_runtime": 129.8816, |
| "train_tokens_per_second": 2129.693 |
| }, |
| { |
| "epoch": 3.892405063291139, |
| "grad_norm": 0.01470947265625, |
| "learning_rate": 0.029194604703064876, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 278912, |
| "step": 615, |
| "train_runtime": 130.8625, |
| "train_tokens_per_second": 2131.337 |
| }, |
| { |
| "epoch": 3.9240506329113924, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.029167606139197878, |
| "loss": 0.078, |
| "num_input_tokens_seen": 281152, |
| "step": 620, |
| "train_runtime": 131.8362, |
| "train_tokens_per_second": 2132.586 |
| }, |
| { |
| "epoch": 3.9556962025316453, |
| "grad_norm": 0.05908203125, |
| "learning_rate": 0.029140175384109963, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 283424, |
| "step": 625, |
| "train_runtime": 132.8127, |
| "train_tokens_per_second": 2134.013 |
| }, |
| { |
| "epoch": 3.9873417721518987, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 0.02911231327459257, |
| "loss": 0.0981, |
| "num_input_tokens_seen": 285792, |
| "step": 630, |
| "train_runtime": 133.8165, |
| "train_tokens_per_second": 2135.7 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.057963330298662186, |
| "eval_runtime": 1.671, |
| "eval_samples_per_second": 41.892, |
| "eval_steps_per_second": 10.772, |
| "num_input_tokens_seen": 286448, |
| "step": 632 |
| }, |
| { |
| "epoch": 4.018987341772152, |
| "grad_norm": 0.0301513671875, |
| "learning_rate": 0.029084020660595865, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 287824, |
| "step": 635, |
| "train_runtime": 137.3316, |
| "train_tokens_per_second": 2095.832 |
| }, |
| { |
| "epoch": 4.050632911392405, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 0.0290552984052028, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 290128, |
| "step": 640, |
| "train_runtime": 138.3441, |
| "train_tokens_per_second": 2097.148 |
| }, |
| { |
| "epoch": 4.082278481012658, |
| "grad_norm": 0.0001811981201171875, |
| "learning_rate": 0.029026147384602796, |
| "loss": 0.018, |
| "num_input_tokens_seen": 292336, |
| "step": 645, |
| "train_runtime": 139.3178, |
| "train_tokens_per_second": 2098.339 |
| }, |
| { |
| "epoch": 4.113924050632911, |
| "grad_norm": 0.00201416015625, |
| "learning_rate": 0.028996568488065012, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 294544, |
| "step": 650, |
| "train_runtime": 140.2961, |
| "train_tokens_per_second": 2099.446 |
| }, |
| { |
| "epoch": 4.1455696202531644, |
| "grad_norm": 0.03515625, |
| "learning_rate": 0.02896656261791122, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 296784, |
| "step": 655, |
| "train_runtime": 141.2725, |
| "train_tokens_per_second": 2100.79 |
| }, |
| { |
| "epoch": 4.177215189873418, |
| "grad_norm": 0.072265625, |
| "learning_rate": 0.028936130689488263, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 299056, |
| "step": 660, |
| "train_runtime": 142.253, |
| "train_tokens_per_second": 2102.283 |
| }, |
| { |
| "epoch": 4.208860759493671, |
| "grad_norm": 0.06005859375, |
| "learning_rate": 0.028905273631140153, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 301264, |
| "step": 665, |
| "train_runtime": 143.231, |
| "train_tokens_per_second": 2103.344 |
| }, |
| { |
| "epoch": 4.2405063291139244, |
| "grad_norm": 0.013671875, |
| "learning_rate": 0.02887399238417974, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 303472, |
| "step": 670, |
| "train_runtime": 144.2059, |
| "train_tokens_per_second": 2104.435 |
| }, |
| { |
| "epoch": 4.272151898734177, |
| "grad_norm": 0.0274658203125, |
| "learning_rate": 0.02884228790286001, |
| "loss": 0.0509, |
| "num_input_tokens_seen": 305744, |
| "step": 675, |
| "train_runtime": 145.1926, |
| "train_tokens_per_second": 2105.782 |
| }, |
| { |
| "epoch": 4.30379746835443, |
| "grad_norm": 0.030029296875, |
| "learning_rate": 0.02881016115434494, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 308016, |
| "step": 680, |
| "train_runtime": 146.1698, |
| "train_tokens_per_second": 2107.248 |
| }, |
| { |
| "epoch": 4.3354430379746836, |
| "grad_norm": 0.0020904541015625, |
| "learning_rate": 0.028777613118680035, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 310320, |
| "step": 685, |
| "train_runtime": 147.1607, |
| "train_tokens_per_second": 2108.715 |
| }, |
| { |
| "epoch": 4.367088607594937, |
| "grad_norm": 0.059814453125, |
| "learning_rate": 0.028744644788762413, |
| "loss": 0.0559, |
| "num_input_tokens_seen": 312624, |
| "step": 690, |
| "train_runtime": 148.1407, |
| "train_tokens_per_second": 2110.318 |
| }, |
| { |
| "epoch": 4.39873417721519, |
| "grad_norm": 0.00469970703125, |
| "learning_rate": 0.02871125717031052, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 314960, |
| "step": 695, |
| "train_runtime": 149.1324, |
| "train_tokens_per_second": 2111.949 |
| }, |
| { |
| "epoch": 4.430379746835443, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 0.028677451281833435, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 317232, |
| "step": 700, |
| "train_runtime": 150.1094, |
| "train_tokens_per_second": 2113.339 |
| }, |
| { |
| "epoch": 4.462025316455696, |
| "grad_norm": 0.041259765625, |
| "learning_rate": 0.028643228154599815, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 319536, |
| "step": 705, |
| "train_runtime": 151.0915, |
| "train_tokens_per_second": 2114.851 |
| }, |
| { |
| "epoch": 4.493670886075949, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 0.028608588832606446, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 321936, |
| "step": 710, |
| "train_runtime": 152.0979, |
| "train_tokens_per_second": 2116.637 |
| }, |
| { |
| "epoch": 4.525316455696203, |
| "grad_norm": 0.0869140625, |
| "learning_rate": 0.02857353437254637, |
| "loss": 0.036, |
| "num_input_tokens_seen": 324208, |
| "step": 715, |
| "train_runtime": 153.0861, |
| "train_tokens_per_second": 2117.815 |
| }, |
| { |
| "epoch": 4.556962025316456, |
| "grad_norm": 0.05810546875, |
| "learning_rate": 0.028538065843776658, |
| "loss": 0.0478, |
| "num_input_tokens_seen": 326384, |
| "step": 720, |
| "train_runtime": 154.0569, |
| "train_tokens_per_second": 2118.594 |
| }, |
| { |
| "epoch": 4.588607594936709, |
| "grad_norm": 0.00151824951171875, |
| "learning_rate": 0.028502184328285808, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 328592, |
| "step": 725, |
| "train_runtime": 155.0297, |
| "train_tokens_per_second": 2119.543 |
| }, |
| { |
| "epoch": 4.620253164556962, |
| "grad_norm": 0.046875, |
| "learning_rate": 0.02846589092066071, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 330896, |
| "step": 730, |
| "train_runtime": 156.0139, |
| "train_tokens_per_second": 2120.939 |
| }, |
| { |
| "epoch": 4.651898734177215, |
| "grad_norm": 0.0047607421875, |
| "learning_rate": 0.02842918672805327, |
| "loss": 0.0399, |
| "num_input_tokens_seen": 333360, |
| "step": 735, |
| "train_runtime": 157.0419, |
| "train_tokens_per_second": 2122.745 |
| }, |
| { |
| "epoch": 4.6835443037974684, |
| "grad_norm": 0.0908203125, |
| "learning_rate": 0.028392072870146633, |
| "loss": 0.0917, |
| "num_input_tokens_seen": 335568, |
| "step": 740, |
| "train_runtime": 158.0163, |
| "train_tokens_per_second": 2123.629 |
| }, |
| { |
| "epoch": 4.715189873417722, |
| "grad_norm": 0.029296875, |
| "learning_rate": 0.028354550479121027, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 337808, |
| "step": 745, |
| "train_runtime": 158.9933, |
| "train_tokens_per_second": 2124.668 |
| }, |
| { |
| "epoch": 4.746835443037975, |
| "grad_norm": 0.072265625, |
| "learning_rate": 0.028316620699619228, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 340016, |
| "step": 750, |
| "train_runtime": 159.9715, |
| "train_tokens_per_second": 2125.479 |
| }, |
| { |
| "epoch": 4.7784810126582276, |
| "grad_norm": 0.049072265625, |
| "learning_rate": 0.028278284688711637, |
| "loss": 0.0641, |
| "num_input_tokens_seen": 342320, |
| "step": 755, |
| "train_runtime": 160.9537, |
| "train_tokens_per_second": 2126.823 |
| }, |
| { |
| "epoch": 4.810126582278481, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.028239543615860983, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 344656, |
| "step": 760, |
| "train_runtime": 161.9536, |
| "train_tokens_per_second": 2128.115 |
| }, |
| { |
| "epoch": 4.841772151898734, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 0.028200398662886653, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 346928, |
| "step": 765, |
| "train_runtime": 162.9314, |
| "train_tokens_per_second": 2129.288 |
| }, |
| { |
| "epoch": 4.8734177215189876, |
| "grad_norm": 0.040283203125, |
| "learning_rate": 0.028160851023928634, |
| "loss": 0.1044, |
| "num_input_tokens_seen": 349168, |
| "step": 770, |
| "train_runtime": 163.9064, |
| "train_tokens_per_second": 2130.289 |
| }, |
| { |
| "epoch": 4.905063291139241, |
| "grad_norm": 0.01116943359375, |
| "learning_rate": 0.02812090190541108, |
| "loss": 0.0527, |
| "num_input_tokens_seen": 351536, |
| "step": 775, |
| "train_runtime": 164.9105, |
| "train_tokens_per_second": 2131.678 |
| }, |
| { |
| "epoch": 4.936708860759493, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 0.028080552526005543, |
| "loss": 0.03, |
| "num_input_tokens_seen": 353840, |
| "step": 780, |
| "train_runtime": 165.8904, |
| "train_tokens_per_second": 2132.974 |
| }, |
| { |
| "epoch": 4.968354430379747, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 0.02803980411659374, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 356112, |
| "step": 785, |
| "train_runtime": 166.8767, |
| "train_tokens_per_second": 2133.983 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 0.02799865792023004, |
| "loss": 0.021, |
| "num_input_tokens_seen": 358176, |
| "step": 790, |
| "train_runtime": 167.8315, |
| "train_tokens_per_second": 2134.14 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.057497963309288025, |
| "eval_runtime": 1.6779, |
| "eval_samples_per_second": 41.718, |
| "eval_steps_per_second": 10.728, |
| "num_input_tokens_seen": 358176, |
| "step": 790 |
| }, |
| { |
| "epoch": 5.031645569620253, |
| "grad_norm": 0.00144195556640625, |
| "learning_rate": 0.027957115192103567, |
| "loss": 0.021, |
| "num_input_tokens_seen": 360512, |
| "step": 795, |
| "train_runtime": 171.5628, |
| "train_tokens_per_second": 2101.341 |
| }, |
| { |
| "epoch": 5.063291139240507, |
| "grad_norm": 0.00262451171875, |
| "learning_rate": 0.027915177199499843, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 362752, |
| "step": 800, |
| "train_runtime": 172.5391, |
| "train_tokens_per_second": 2102.433 |
| }, |
| { |
| "epoch": 5.094936708860759, |
| "grad_norm": 0.03662109375, |
| "learning_rate": 0.027872845221762192, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 365056, |
| "step": 805, |
| "train_runtime": 173.5332, |
| "train_tokens_per_second": 2103.666 |
| }, |
| { |
| "epoch": 5.1265822784810124, |
| "grad_norm": 0.1015625, |
| "learning_rate": 0.02783012055025268, |
| "loss": 0.0592, |
| "num_input_tokens_seen": 367264, |
| "step": 810, |
| "train_runtime": 174.5081, |
| "train_tokens_per_second": 2104.567 |
| }, |
| { |
| "epoch": 5.158227848101266, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 0.027787004488312724, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 369536, |
| "step": 815, |
| "train_runtime": 175.4906, |
| "train_tokens_per_second": 2105.731 |
| }, |
| { |
| "epoch": 5.189873417721519, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.027743498351223354, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 371744, |
| "step": 820, |
| "train_runtime": 176.468, |
| "train_tokens_per_second": 2106.58 |
| }, |
| { |
| "epoch": 5.2215189873417724, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 0.027699603466165058, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 373984, |
| "step": 825, |
| "train_runtime": 177.4488, |
| "train_tokens_per_second": 2107.56 |
| }, |
| { |
| "epoch": 5.253164556962025, |
| "grad_norm": 0.0001239776611328125, |
| "learning_rate": 0.027655321172177314, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 376288, |
| "step": 830, |
| "train_runtime": 178.4286, |
| "train_tokens_per_second": 2108.9 |
| }, |
| { |
| "epoch": 5.284810126582278, |
| "grad_norm": 0.0010986328125, |
| "learning_rate": 0.027610652820117747, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 378528, |
| "step": 835, |
| "train_runtime": 179.4119, |
| "train_tokens_per_second": 2109.827 |
| }, |
| { |
| "epoch": 5.3164556962025316, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 0.0275655997726209, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 380896, |
| "step": 840, |
| "train_runtime": 180.4061, |
| "train_tokens_per_second": 2111.325 |
| }, |
| { |
| "epoch": 5.348101265822785, |
| "grad_norm": 0.046875, |
| "learning_rate": 0.02752016340405669, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 383168, |
| "step": 845, |
| "train_runtime": 181.3859, |
| "train_tokens_per_second": 2112.447 |
| }, |
| { |
| "epoch": 5.379746835443038, |
| "grad_norm": 0.056640625, |
| "learning_rate": 0.027474345100488465, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 385568, |
| "step": 850, |
| "train_runtime": 182.3921, |
| "train_tokens_per_second": 2113.952 |
| }, |
| { |
| "epoch": 5.4113924050632916, |
| "grad_norm": 0.060546875, |
| "learning_rate": 0.027428146259630727, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 387872, |
| "step": 855, |
| "train_runtime": 183.3925, |
| "train_tokens_per_second": 2114.983 |
| }, |
| { |
| "epoch": 5.443037974683544, |
| "grad_norm": 0.00335693359375, |
| "learning_rate": 0.027381568290806495, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 390112, |
| "step": 860, |
| "train_runtime": 184.3713, |
| "train_tokens_per_second": 2115.905 |
| }, |
| { |
| "epoch": 5.474683544303797, |
| "grad_norm": 0.07666015625, |
| "learning_rate": 0.027334612614904306, |
| "loss": 0.032, |
| "num_input_tokens_seen": 392512, |
| "step": 865, |
| "train_runtime": 185.3821, |
| "train_tokens_per_second": 2117.314 |
| }, |
| { |
| "epoch": 5.506329113924051, |
| "grad_norm": 0.054443359375, |
| "learning_rate": 0.02728728066433488, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 394752, |
| "step": 870, |
| "train_runtime": 186.3599, |
| "train_tokens_per_second": 2118.224 |
| }, |
| { |
| "epoch": 5.537974683544304, |
| "grad_norm": 0.0054931640625, |
| "learning_rate": 0.027239573882987415, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 396992, |
| "step": 875, |
| "train_runtime": 187.3367, |
| "train_tokens_per_second": 2119.137 |
| }, |
| { |
| "epoch": 5.569620253164557, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 0.02719149372618555, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 399232, |
| "step": 880, |
| "train_runtime": 188.3252, |
| "train_tokens_per_second": 2119.908 |
| }, |
| { |
| "epoch": 5.60126582278481, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 0.027143041660642967, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 401440, |
| "step": 885, |
| "train_runtime": 189.3005, |
| "train_tokens_per_second": 2120.649 |
| }, |
| { |
| "epoch": 5.632911392405063, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.027094219164418627, |
| "loss": 0.016, |
| "num_input_tokens_seen": 403616, |
| "step": 890, |
| "train_runtime": 190.2719, |
| "train_tokens_per_second": 2121.259 |
| }, |
| { |
| "epoch": 5.6645569620253164, |
| "grad_norm": 0.03662109375, |
| "learning_rate": 0.02704502772687172, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 405760, |
| "step": 895, |
| "train_runtime": 191.2416, |
| "train_tokens_per_second": 2121.714 |
| }, |
| { |
| "epoch": 5.69620253164557, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 0.026995468848616182, |
| "loss": 0.0419, |
| "num_input_tokens_seen": 408064, |
| "step": 900, |
| "train_runtime": 192.2375, |
| "train_tokens_per_second": 2122.707 |
| }, |
| { |
| "epoch": 5.727848101265823, |
| "grad_norm": 0.06103515625, |
| "learning_rate": 0.026945544041474978, |
| "loss": 0.0553, |
| "num_input_tokens_seen": 410400, |
| "step": 905, |
| "train_runtime": 193.2315, |
| "train_tokens_per_second": 2123.878 |
| }, |
| { |
| "epoch": 5.759493670886076, |
| "grad_norm": 0.0986328125, |
| "learning_rate": 0.02689525482843393, |
| "loss": 0.032, |
| "num_input_tokens_seen": 412736, |
| "step": 910, |
| "train_runtime": 194.2258, |
| "train_tokens_per_second": 2125.032 |
| }, |
| { |
| "epoch": 5.791139240506329, |
| "grad_norm": 0.026123046875, |
| "learning_rate": 0.02684460274359528, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 415168, |
| "step": 915, |
| "train_runtime": 195.2258, |
| "train_tokens_per_second": 2126.604 |
| }, |
| { |
| "epoch": 5.822784810126582, |
| "grad_norm": 0.041259765625, |
| "learning_rate": 0.026793589332130902, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 417344, |
| "step": 920, |
| "train_runtime": 196.1957, |
| "train_tokens_per_second": 2127.183 |
| }, |
| { |
| "epoch": 5.8544303797468356, |
| "grad_norm": 0.028564453125, |
| "learning_rate": 0.02674221615023513, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 419488, |
| "step": 925, |
| "train_runtime": 197.1647, |
| "train_tokens_per_second": 2127.602 |
| }, |
| { |
| "epoch": 5.886075949367089, |
| "grad_norm": 0.034912109375, |
| "learning_rate": 0.026690484765077332, |
| "loss": 0.0553, |
| "num_input_tokens_seen": 421600, |
| "step": 930, |
| "train_runtime": 198.1377, |
| "train_tokens_per_second": 2127.814 |
| }, |
| { |
| "epoch": 5.917721518987342, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 0.026638396754754056, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 423904, |
| "step": 935, |
| "train_runtime": 199.1211, |
| "train_tokens_per_second": 2128.875 |
| }, |
| { |
| "epoch": 5.949367088607595, |
| "grad_norm": 0.04296875, |
| "learning_rate": 0.026585953708240937, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 426240, |
| "step": 940, |
| "train_runtime": 200.116, |
| "train_tokens_per_second": 2129.965 |
| }, |
| { |
| "epoch": 5.981012658227848, |
| "grad_norm": 0.00439453125, |
| "learning_rate": 0.02653315722534418, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 428608, |
| "step": 945, |
| "train_runtime": 201.1018, |
| "train_tokens_per_second": 2131.299 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.057629313319921494, |
| "eval_runtime": 1.682, |
| "eval_samples_per_second": 41.616, |
| "eval_steps_per_second": 10.701, |
| "num_input_tokens_seen": 429728, |
| "step": 948 |
| }, |
| { |
| "epoch": 6.012658227848101, |
| "grad_norm": 0.0556640625, |
| "learning_rate": 0.026480008916651778, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 430624, |
| "step": 950, |
| "train_runtime": 204.5872, |
| "train_tokens_per_second": 2104.843 |
| }, |
| { |
| "epoch": 6.044303797468355, |
| "grad_norm": 0.004547119140625, |
| "learning_rate": 0.02642651040348439, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 432960, |
| "step": 955, |
| "train_runtime": 205.6102, |
| "train_tokens_per_second": 2105.732 |
| }, |
| { |
| "epoch": 6.075949367088608, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 0.026372663317845862, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 435104, |
| "step": 960, |
| "train_runtime": 206.5794, |
| "train_tokens_per_second": 2106.231 |
| }, |
| { |
| "epoch": 6.1075949367088604, |
| "grad_norm": 0.0008544921875, |
| "learning_rate": 0.02631846930237345, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 437312, |
| "step": 965, |
| "train_runtime": 207.5581, |
| "train_tokens_per_second": 2106.938 |
| }, |
| { |
| "epoch": 6.139240506329114, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 0.026263930010287713, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 439584, |
| "step": 970, |
| "train_runtime": 208.5403, |
| "train_tokens_per_second": 2107.909 |
| }, |
| { |
| "epoch": 6.170886075949367, |
| "grad_norm": 0.017578125, |
| "learning_rate": 0.02620904710534207, |
| "loss": 0.0454, |
| "num_input_tokens_seen": 441856, |
| "step": 975, |
| "train_runtime": 209.5207, |
| "train_tokens_per_second": 2108.889 |
| }, |
| { |
| "epoch": 6.2025316455696204, |
| "grad_norm": 0.0004482269287109375, |
| "learning_rate": 0.026153822261772066, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 444128, |
| "step": 980, |
| "train_runtime": 210.4989, |
| "train_tokens_per_second": 2109.883 |
| }, |
| { |
| "epoch": 6.234177215189874, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 0.026098257164244274, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 446368, |
| "step": 985, |
| "train_runtime": 211.4822, |
| "train_tokens_per_second": 2110.665 |
| }, |
| { |
| "epoch": 6.265822784810126, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 0.02604235350780493, |
| "loss": 0.004, |
| "num_input_tokens_seen": 448640, |
| "step": 990, |
| "train_runtime": 212.4666, |
| "train_tokens_per_second": 2111.579 |
| }, |
| { |
| "epoch": 6.2974683544303796, |
| "grad_norm": 0.046142578125, |
| "learning_rate": 0.025986112997828197, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 450848, |
| "step": 995, |
| "train_runtime": 213.4442, |
| "train_tokens_per_second": 2112.252 |
| }, |
| { |
| "epoch": 6.329113924050633, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 0.025929537349964157, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 453056, |
| "step": 1000, |
| "train_runtime": 214.4214, |
| "train_tokens_per_second": 2112.923 |
| }, |
| { |
| "epoch": 6.360759493670886, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 0.025872628290086477, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 455424, |
| "step": 1005, |
| "train_runtime": 215.4176, |
| "train_tokens_per_second": 2114.145 |
| }, |
| { |
| "epoch": 6.3924050632911396, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 0.025815387554239753, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 457632, |
| "step": 1010, |
| "train_runtime": 216.3934, |
| "train_tokens_per_second": 2114.815 |
| }, |
| { |
| "epoch": 6.424050632911392, |
| "grad_norm": 0.0296630859375, |
| "learning_rate": 0.025757816888586547, |
| "loss": 0.0425, |
| "num_input_tokens_seen": 459968, |
| "step": 1015, |
| "train_runtime": 217.3863, |
| "train_tokens_per_second": 2115.902 |
| }, |
| { |
| "epoch": 6.455696202531645, |
| "grad_norm": 0.004058837890625, |
| "learning_rate": 0.025699918049354144, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 462240, |
| "step": 1020, |
| "train_runtime": 218.3659, |
| "train_tokens_per_second": 2116.814 |
| }, |
| { |
| "epoch": 6.487341772151899, |
| "grad_norm": 0.000591278076171875, |
| "learning_rate": 0.025641692802780933, |
| "loss": 0.0021, |
| "num_input_tokens_seen": 464416, |
| "step": 1025, |
| "train_runtime": 219.3388, |
| "train_tokens_per_second": 2117.346 |
| }, |
| { |
| "epoch": 6.518987341772152, |
| "grad_norm": 0.005340576171875, |
| "learning_rate": 0.02558314292506257, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 466752, |
| "step": 1030, |
| "train_runtime": 220.3457, |
| "train_tokens_per_second": 2118.272 |
| }, |
| { |
| "epoch": 6.550632911392405, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 0.025524270202297767, |
| "loss": 0.027, |
| "num_input_tokens_seen": 469088, |
| "step": 1035, |
| "train_runtime": 221.3497, |
| "train_tokens_per_second": 2119.217 |
| }, |
| { |
| "epoch": 6.582278481012658, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 0.025465076430433827, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 471328, |
| "step": 1040, |
| "train_runtime": 222.3263, |
| "train_tokens_per_second": 2119.983 |
| }, |
| { |
| "epoch": 6.613924050632911, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 0.025405563415211833, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 473728, |
| "step": 1045, |
| "train_runtime": 223.3355, |
| "train_tokens_per_second": 2121.15 |
| }, |
| { |
| "epoch": 6.6455696202531644, |
| "grad_norm": 0.023681640625, |
| "learning_rate": 0.025345732972111585, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 475840, |
| "step": 1050, |
| "train_runtime": 224.3028, |
| "train_tokens_per_second": 2121.418 |
| }, |
| { |
| "epoch": 6.677215189873418, |
| "grad_norm": 0.025390625, |
| "learning_rate": 0.025285586926296195, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 478048, |
| "step": 1055, |
| "train_runtime": 225.2808, |
| "train_tokens_per_second": 2122.01 |
| }, |
| { |
| "epoch": 6.708860759493671, |
| "grad_norm": 0.002716064453125, |
| "learning_rate": 0.025225127112556447, |
| "loss": 0.038, |
| "num_input_tokens_seen": 480352, |
| "step": 1060, |
| "train_runtime": 226.2736, |
| "train_tokens_per_second": 2122.881 |
| }, |
| { |
| "epoch": 6.740506329113924, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 0.025164355375254775, |
| "loss": 0.0626, |
| "num_input_tokens_seen": 482752, |
| "step": 1065, |
| "train_runtime": 227.2847, |
| "train_tokens_per_second": 2123.997 |
| }, |
| { |
| "epoch": 6.772151898734177, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 0.02510327356826905, |
| "loss": 0.0365, |
| "num_input_tokens_seen": 485056, |
| "step": 1070, |
| "train_runtime": 228.2764, |
| "train_tokens_per_second": 2124.863 |
| }, |
| { |
| "epoch": 6.80379746835443, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 0.02504188355493598, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 487360, |
| "step": 1075, |
| "train_runtime": 229.2704, |
| "train_tokens_per_second": 2125.699 |
| }, |
| { |
| "epoch": 6.8354430379746836, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 0.024980187207994307, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 489728, |
| "step": 1080, |
| "train_runtime": 230.2664, |
| "train_tokens_per_second": 2126.789 |
| }, |
| { |
| "epoch": 6.867088607594937, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.024918186409527657, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 492064, |
| "step": 1085, |
| "train_runtime": 231.2505, |
| "train_tokens_per_second": 2127.84 |
| }, |
| { |
| "epoch": 6.89873417721519, |
| "grad_norm": 0.048828125, |
| "learning_rate": 0.024855883050907124, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 494304, |
| "step": 1090, |
| "train_runtime": 232.2288, |
| "train_tokens_per_second": 2128.521 |
| }, |
| { |
| "epoch": 6.930379746835443, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 0.024793279032733578, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 496800, |
| "step": 1095, |
| "train_runtime": 233.2529, |
| "train_tokens_per_second": 2129.877 |
| }, |
| { |
| "epoch": 6.962025316455696, |
| "grad_norm": 0.059814453125, |
| "learning_rate": 0.024730376264779707, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 499040, |
| "step": 1100, |
| "train_runtime": 234.2299, |
| "train_tokens_per_second": 2130.556 |
| }, |
| { |
| "epoch": 6.993670886075949, |
| "grad_norm": 0.00176239013671875, |
| "learning_rate": 0.02466717666593172, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 501280, |
| "step": 1105, |
| "train_runtime": 235.2092, |
| "train_tokens_per_second": 2131.21 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.04775509238243103, |
| "eval_runtime": 1.6963, |
| "eval_samples_per_second": 41.266, |
| "eval_steps_per_second": 10.611, |
| "num_input_tokens_seen": 501504, |
| "step": 1106 |
| }, |
| { |
| "epoch": 7.025316455696203, |
| "grad_norm": 0.00537109375, |
| "learning_rate": 0.02460368216413082, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 503328, |
| "step": 1110, |
| "train_runtime": 238.7545, |
| "train_tokens_per_second": 2108.141 |
| }, |
| { |
| "epoch": 7.056962025316456, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 0.024539894696314412, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 505568, |
| "step": 1115, |
| "train_runtime": 239.7409, |
| "train_tokens_per_second": 2108.81 |
| }, |
| { |
| "epoch": 7.0886075949367084, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 0.024475816208357017, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 507904, |
| "step": 1120, |
| "train_runtime": 240.7268, |
| "train_tokens_per_second": 2109.878 |
| }, |
| { |
| "epoch": 7.120253164556962, |
| "grad_norm": 0.0030975341796875, |
| "learning_rate": 0.024411448655010867, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 510048, |
| "step": 1125, |
| "train_runtime": 241.698, |
| "train_tokens_per_second": 2110.27 |
| }, |
| { |
| "epoch": 7.151898734177215, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.024346793999846333, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 512256, |
| "step": 1130, |
| "train_runtime": 242.6735, |
| "train_tokens_per_second": 2110.886 |
| }, |
| { |
| "epoch": 7.1835443037974684, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 0.02428185421519197, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 514592, |
| "step": 1135, |
| "train_runtime": 243.675, |
| "train_tokens_per_second": 2111.797 |
| }, |
| { |
| "epoch": 7.215189873417722, |
| "grad_norm": 0.00543212890625, |
| "learning_rate": 0.02421663128207441, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 517024, |
| "step": 1140, |
| "train_runtime": 244.6865, |
| "train_tokens_per_second": 2113.005 |
| }, |
| { |
| "epoch": 7.246835443037975, |
| "grad_norm": 0.0069580078125, |
| "learning_rate": 0.024151127190157863, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 519360, |
| "step": 1145, |
| "train_runtime": 245.6905, |
| "train_tokens_per_second": 2113.879 |
| }, |
| { |
| "epoch": 7.2784810126582276, |
| "grad_norm": 0.028564453125, |
| "learning_rate": 0.02408534393768348, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 521664, |
| "step": 1150, |
| "train_runtime": 246.6697, |
| "train_tokens_per_second": 2114.828 |
| }, |
| { |
| "epoch": 7.310126582278481, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 0.024019283531408357, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 523872, |
| "step": 1155, |
| "train_runtime": 247.6391, |
| "train_tokens_per_second": 2115.466 |
| }, |
| { |
| "epoch": 7.341772151898734, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 0.02395294798654433, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 526144, |
| "step": 1160, |
| "train_runtime": 248.6207, |
| "train_tokens_per_second": 2116.252 |
| }, |
| { |
| "epoch": 7.3734177215189876, |
| "grad_norm": 0.006256103515625, |
| "learning_rate": 0.023886339326696513, |
| "loss": 0.0101, |
| "num_input_tokens_seen": 528416, |
| "step": 1165, |
| "train_runtime": 249.6153, |
| "train_tokens_per_second": 2116.922 |
| }, |
| { |
| "epoch": 7.405063291139241, |
| "grad_norm": 0.0279541015625, |
| "learning_rate": 0.023819459583801543, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 530720, |
| "step": 1170, |
| "train_runtime": 250.6008, |
| "train_tokens_per_second": 2117.79 |
| }, |
| { |
| "epoch": 7.436708860759493, |
| "grad_norm": 0.005859375, |
| "learning_rate": 0.023752310798065612, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 532896, |
| "step": 1175, |
| "train_runtime": 251.5746, |
| "train_tokens_per_second": 2118.243 |
| }, |
| { |
| "epoch": 7.468354430379747, |
| "grad_norm": 0.0032806396484375, |
| "learning_rate": 0.023684895017902212, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 535040, |
| "step": 1180, |
| "train_runtime": 252.5512, |
| "train_tokens_per_second": 2118.541 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.00013065338134765625, |
| "learning_rate": 0.02361721429986967, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 537248, |
| "step": 1185, |
| "train_runtime": 253.527, |
| "train_tokens_per_second": 2119.095 |
| }, |
| { |
| "epoch": 7.531645569620253, |
| "grad_norm": 0.0016326904296875, |
| "learning_rate": 0.02354927070860841, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 539616, |
| "step": 1190, |
| "train_runtime": 254.5237, |
| "train_tokens_per_second": 2120.102 |
| }, |
| { |
| "epoch": 7.563291139240507, |
| "grad_norm": 0.0203857421875, |
| "learning_rate": 0.023481066316777932, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 541824, |
| "step": 1195, |
| "train_runtime": 255.5036, |
| "train_tokens_per_second": 2120.612 |
| }, |
| { |
| "epoch": 7.594936708860759, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 0.023412603204993634, |
| "loss": 0.007, |
| "num_input_tokens_seen": 544064, |
| "step": 1200, |
| "train_runtime": 256.4988, |
| "train_tokens_per_second": 2121.117 |
| }, |
| { |
| "epoch": 7.6265822784810124, |
| "grad_norm": 0.02685546875, |
| "learning_rate": 0.023343883461763304, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 546336, |
| "step": 1205, |
| "train_runtime": 257.4801, |
| "train_tokens_per_second": 2121.857 |
| }, |
| { |
| "epoch": 7.658227848101266, |
| "grad_norm": 0.003265380859375, |
| "learning_rate": 0.023274909183423443, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 548608, |
| "step": 1210, |
| "train_runtime": 258.4622, |
| "train_tokens_per_second": 2122.585 |
| }, |
| { |
| "epoch": 7.689873417721519, |
| "grad_norm": 0.055908203125, |
| "learning_rate": 0.023205682474075274, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 550944, |
| "step": 1215, |
| "train_runtime": 259.4627, |
| "train_tokens_per_second": 2123.404 |
| }, |
| { |
| "epoch": 7.7215189873417724, |
| "grad_norm": 0.10791015625, |
| "learning_rate": 0.023136205445520596, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 553344, |
| "step": 1220, |
| "train_runtime": 260.4697, |
| "train_tokens_per_second": 2124.409 |
| }, |
| { |
| "epoch": 7.753164556962025, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.02306648021719733, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 555552, |
| "step": 1225, |
| "train_runtime": 261.4463, |
| "train_tokens_per_second": 2124.918 |
| }, |
| { |
| "epoch": 7.784810126582278, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 0.022996508916114898, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 557792, |
| "step": 1230, |
| "train_runtime": 262.4269, |
| "train_tokens_per_second": 2125.514 |
| }, |
| { |
| "epoch": 7.8164556962025316, |
| "grad_norm": 0.041015625, |
| "learning_rate": 0.02292629367678929, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 559968, |
| "step": 1235, |
| "train_runtime": 263.401, |
| "train_tokens_per_second": 2125.915 |
| }, |
| { |
| "epoch": 7.848101265822785, |
| "grad_norm": 0.0198974609375, |
| "learning_rate": 0.022855836641178, |
| "loss": 0.0098, |
| "num_input_tokens_seen": 562272, |
| "step": 1240, |
| "train_runtime": 264.3946, |
| "train_tokens_per_second": 2126.639 |
| }, |
| { |
| "epoch": 7.879746835443038, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 0.022785139958614652, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 564608, |
| "step": 1245, |
| "train_runtime": 265.3951, |
| "train_tokens_per_second": 2127.424 |
| }, |
| { |
| "epoch": 7.911392405063291, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 0.02271420578574343, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 566976, |
| "step": 1250, |
| "train_runtime": 266.4068, |
| "train_tokens_per_second": 2128.234 |
| }, |
| { |
| "epoch": 7.943037974683544, |
| "grad_norm": 0.05615234375, |
| "learning_rate": 0.022643036286453325, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 569248, |
| "step": 1255, |
| "train_runtime": 267.3887, |
| "train_tokens_per_second": 2128.916 |
| }, |
| { |
| "epoch": 7.974683544303797, |
| "grad_norm": 0.083984375, |
| "learning_rate": 0.022571633631812082, |
| "loss": 0.0271, |
| "num_input_tokens_seen": 571488, |
| "step": 1260, |
| "train_runtime": 268.3681, |
| "train_tokens_per_second": 2129.493 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.06543910503387451, |
| "eval_runtime": 1.6923, |
| "eval_samples_per_second": 41.364, |
| "eval_steps_per_second": 10.636, |
| "num_input_tokens_seen": 573120, |
| "step": 1264 |
| }, |
| { |
| "epoch": 8.00632911392405, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 0.0225, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 573568, |
| "step": 1265, |
| "train_runtime": 271.9159, |
| "train_tokens_per_second": 2109.358 |
| }, |
| { |
| "epoch": 8.037974683544304, |
| "grad_norm": 0.030517578125, |
| "learning_rate": 0.022428137576243456, |
| "loss": 0.0053, |
| "num_input_tokens_seen": 575808, |
| "step": 1270, |
| "train_runtime": 272.9056, |
| "train_tokens_per_second": 2109.917 |
| }, |
| { |
| "epoch": 8.069620253164556, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 0.022356048552748285, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 578208, |
| "step": 1275, |
| "train_runtime": 273.9241, |
| "train_tokens_per_second": 2110.833 |
| }, |
| { |
| "epoch": 8.10126582278481, |
| "grad_norm": 0.04541015625, |
| "learning_rate": 0.02228373512863286, |
| "loss": 0.002, |
| "num_input_tokens_seen": 580416, |
| "step": 1280, |
| "train_runtime": 274.9034, |
| "train_tokens_per_second": 2111.345 |
| }, |
| { |
| "epoch": 8.132911392405063, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 0.022211199509861033, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 582720, |
| "step": 1285, |
| "train_runtime": 275.8966, |
| "train_tokens_per_second": 2112.096 |
| }, |
| { |
| "epoch": 8.164556962025316, |
| "grad_norm": 0.00089263916015625, |
| "learning_rate": 0.022138443909174844, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 584928, |
| "step": 1290, |
| "train_runtime": 276.8743, |
| "train_tokens_per_second": 2112.612 |
| }, |
| { |
| "epoch": 8.19620253164557, |
| "grad_norm": 4.9591064453125e-05, |
| "learning_rate": 0.02206547054602701, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 587232, |
| "step": 1295, |
| "train_runtime": 277.8742, |
| "train_tokens_per_second": 2113.302 |
| }, |
| { |
| "epoch": 8.227848101265822, |
| "grad_norm": 0.0023193359375, |
| "learning_rate": 0.021992281646513213, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 589504, |
| "step": 1300, |
| "train_runtime": 278.8576, |
| "train_tokens_per_second": 2113.996 |
| }, |
| { |
| "epoch": 8.259493670886076, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 0.0219188794433042, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 591680, |
| "step": 1305, |
| "train_runtime": 279.8324, |
| "train_tokens_per_second": 2114.408 |
| }, |
| { |
| "epoch": 8.291139240506329, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 0.021845266175577683, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 594016, |
| "step": 1310, |
| "train_runtime": 280.8306, |
| "train_tokens_per_second": 2115.211 |
| }, |
| { |
| "epoch": 8.322784810126583, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.02177144408895002, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 596416, |
| "step": 1315, |
| "train_runtime": 281.8514, |
| "train_tokens_per_second": 2116.066 |
| }, |
| { |
| "epoch": 8.354430379746836, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.0216974154354077, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 598752, |
| "step": 1320, |
| "train_runtime": 282.8349, |
| "train_tokens_per_second": 2116.966 |
| }, |
| { |
| "epoch": 8.386075949367088, |
| "grad_norm": 0.00101470947265625, |
| "learning_rate": 0.02162318247323868, |
| "loss": 0.0353, |
| "num_input_tokens_seen": 600960, |
| "step": 1325, |
| "train_runtime": 283.8103, |
| "train_tokens_per_second": 2117.471 |
| }, |
| { |
| "epoch": 8.417721518987342, |
| "grad_norm": 0.0106201171875, |
| "learning_rate": 0.021548747466963447, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 603264, |
| "step": 1330, |
| "train_runtime": 284.8014, |
| "train_tokens_per_second": 2118.192 |
| }, |
| { |
| "epoch": 8.449367088607595, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.02147411268726599, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 605536, |
| "step": 1335, |
| "train_runtime": 285.7825, |
| "train_tokens_per_second": 2118.87 |
| }, |
| { |
| "epoch": 8.481012658227849, |
| "grad_norm": 0.0137939453125, |
| "learning_rate": 0.021399280410924492, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 607840, |
| "step": 1340, |
| "train_runtime": 286.7715, |
| "train_tokens_per_second": 2119.597 |
| }, |
| { |
| "epoch": 8.512658227848101, |
| "grad_norm": 0.006011962890625, |
| "learning_rate": 0.021324252920741877, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 610144, |
| "step": 1345, |
| "train_runtime": 287.7496, |
| "train_tokens_per_second": 2120.399 |
| }, |
| { |
| "epoch": 8.544303797468354, |
| "grad_norm": 0.044921875, |
| "learning_rate": 0.021249032505476193, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 612448, |
| "step": 1350, |
| "train_runtime": 288.7422, |
| "train_tokens_per_second": 2121.089 |
| }, |
| { |
| "epoch": 8.575949367088608, |
| "grad_norm": 0.039306640625, |
| "learning_rate": 0.0211736214597708, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 614816, |
| "step": 1355, |
| "train_runtime": 289.7373, |
| "train_tokens_per_second": 2121.977 |
| }, |
| { |
| "epoch": 8.60759493670886, |
| "grad_norm": 0.0595703125, |
| "learning_rate": 0.021098022084084324, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 617088, |
| "step": 1360, |
| "train_runtime": 290.7256, |
| "train_tokens_per_second": 2122.579 |
| }, |
| { |
| "epoch": 8.639240506329115, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 0.02102223668462052, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 619328, |
| "step": 1365, |
| "train_runtime": 291.7026, |
| "train_tokens_per_second": 2123.149 |
| }, |
| { |
| "epoch": 8.670886075949367, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 0.02094626757325791, |
| "loss": 0.024, |
| "num_input_tokens_seen": 621600, |
| "step": 1370, |
| "train_runtime": 292.6815, |
| "train_tokens_per_second": 2123.81 |
| }, |
| { |
| "epoch": 8.70253164556962, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 0.020870117067479252, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 623840, |
| "step": 1375, |
| "train_runtime": 293.6647, |
| "train_tokens_per_second": 2124.328 |
| }, |
| { |
| "epoch": 8.734177215189874, |
| "grad_norm": 0.044921875, |
| "learning_rate": 0.02079378749030086, |
| "loss": 0.0102, |
| "num_input_tokens_seen": 626112, |
| "step": 1380, |
| "train_runtime": 294.6439, |
| "train_tokens_per_second": 2124.979 |
| }, |
| { |
| "epoch": 8.765822784810126, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 0.020717281170201704, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 628448, |
| "step": 1385, |
| "train_runtime": 295.6383, |
| "train_tokens_per_second": 2125.733 |
| }, |
| { |
| "epoch": 8.79746835443038, |
| "grad_norm": 0.0220947265625, |
| "learning_rate": 0.02064060044105243, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 630752, |
| "step": 1390, |
| "train_runtime": 296.6244, |
| "train_tokens_per_second": 2126.433 |
| }, |
| { |
| "epoch": 8.829113924050633, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 0.02056374764204411, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 633024, |
| "step": 1395, |
| "train_runtime": 297.601, |
| "train_tokens_per_second": 2127.09 |
| }, |
| { |
| "epoch": 8.860759493670885, |
| "grad_norm": 0.0634765625, |
| "learning_rate": 0.02048672511761693, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 635264, |
| "step": 1400, |
| "train_runtime": 298.5788, |
| "train_tokens_per_second": 2127.626 |
| }, |
| { |
| "epoch": 8.89240506329114, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 0.020409535217388638, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 637472, |
| "step": 1405, |
| "train_runtime": 299.553, |
| "train_tokens_per_second": 2128.077 |
| }, |
| { |
| "epoch": 8.924050632911392, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 0.020332180296082875, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 639744, |
| "step": 1410, |
| "train_runtime": 300.5318, |
| "train_tokens_per_second": 2128.706 |
| }, |
| { |
| "epoch": 8.955696202531646, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.020254662713457366, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 642016, |
| "step": 1415, |
| "train_runtime": 301.5109, |
| "train_tokens_per_second": 2129.329 |
| }, |
| { |
| "epoch": 8.987341772151899, |
| "grad_norm": 0.031982421875, |
| "learning_rate": 0.020176984834231897, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 644288, |
| "step": 1420, |
| "train_runtime": 302.4933, |
| "train_tokens_per_second": 2129.925 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.06644842028617859, |
| "eval_runtime": 1.6869, |
| "eval_samples_per_second": 41.497, |
| "eval_steps_per_second": 10.671, |
| "num_input_tokens_seen": 644944, |
| "step": 1422 |
| }, |
| { |
| "epoch": 9.018987341772151, |
| "grad_norm": 0.0303955078125, |
| "learning_rate": 0.02009914902801621, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 646320, |
| "step": 1425, |
| "train_runtime": 306.0813, |
| "train_tokens_per_second": 2111.596 |
| }, |
| { |
| "epoch": 9.050632911392405, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 0.020021157669237698, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 648560, |
| "step": 1430, |
| "train_runtime": 307.0899, |
| "train_tokens_per_second": 2111.955 |
| }, |
| { |
| "epoch": 9.082278481012658, |
| "grad_norm": 0.003936767578125, |
| "learning_rate": 0.01994301313706898, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 650832, |
| "step": 1435, |
| "train_runtime": 308.0817, |
| "train_tokens_per_second": 2112.531 |
| }, |
| { |
| "epoch": 9.113924050632912, |
| "grad_norm": 0.00010967254638671875, |
| "learning_rate": 0.01986471781535531, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 653072, |
| "step": 1440, |
| "train_runtime": 309.06, |
| "train_tokens_per_second": 2113.091 |
| }, |
| { |
| "epoch": 9.145569620253164, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 0.019786274092541887, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 655344, |
| "step": 1445, |
| "train_runtime": 310.0523, |
| "train_tokens_per_second": 2113.656 |
| }, |
| { |
| "epoch": 9.177215189873417, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 0.01970768436160095, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 657520, |
| "step": 1450, |
| "train_runtime": 311.0288, |
| "train_tokens_per_second": 2114.017 |
| }, |
| { |
| "epoch": 9.208860759493671, |
| "grad_norm": 0.0147705078125, |
| "learning_rate": 0.019628951019958815, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 659856, |
| "step": 1455, |
| "train_runtime": 312.0235, |
| "train_tokens_per_second": 2114.764 |
| }, |
| { |
| "epoch": 9.240506329113924, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 0.01955007646942273, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 662096, |
| "step": 1460, |
| "train_runtime": 313.0002, |
| "train_tokens_per_second": 2115.321 |
| }, |
| { |
| "epoch": 9.272151898734178, |
| "grad_norm": 0.000591278076171875, |
| "learning_rate": 0.019471063116107593, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 664368, |
| "step": 1465, |
| "train_runtime": 313.9816, |
| "train_tokens_per_second": 2115.946 |
| }, |
| { |
| "epoch": 9.30379746835443, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 0.01939191337036257, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 666608, |
| "step": 1470, |
| "train_runtime": 314.9615, |
| "train_tokens_per_second": 2116.475 |
| }, |
| { |
| "epoch": 9.335443037974684, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 0.019312629646697572, |
| "loss": 0.0119, |
| "num_input_tokens_seen": 668976, |
| "step": 1475, |
| "train_runtime": 315.9602, |
| "train_tokens_per_second": 2117.279 |
| }, |
| { |
| "epoch": 9.367088607594937, |
| "grad_norm": 6.771087646484375e-05, |
| "learning_rate": 0.019233214363709557, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 671312, |
| "step": 1480, |
| "train_runtime": 316.9511, |
| "train_tokens_per_second": 2118.03 |
| }, |
| { |
| "epoch": 9.39873417721519, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.0191536699440088, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 673648, |
| "step": 1485, |
| "train_runtime": 317.9431, |
| "train_tokens_per_second": 2118.769 |
| }, |
| { |
| "epoch": 9.430379746835444, |
| "grad_norm": 0.0057373046875, |
| "learning_rate": 0.019073998814144958, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 675952, |
| "step": 1490, |
| "train_runtime": 318.9217, |
| "train_tokens_per_second": 2119.492 |
| }, |
| { |
| "epoch": 9.462025316455696, |
| "grad_norm": 0.0002536773681640625, |
| "learning_rate": 0.018994203404533068, |
| "loss": 0.0122, |
| "num_input_tokens_seen": 678128, |
| "step": 1495, |
| "train_runtime": 319.8919, |
| "train_tokens_per_second": 2119.866 |
| }, |
| { |
| "epoch": 9.49367088607595, |
| "grad_norm": 0.0004024505615234375, |
| "learning_rate": 0.01891428614937938, |
| "loss": 0.008, |
| "num_input_tokens_seen": 680432, |
| "step": 1500, |
| "train_runtime": 320.8826, |
| "train_tokens_per_second": 2120.501 |
| }, |
| { |
| "epoch": 9.525316455696203, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.01883424948660712, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 682608, |
| "step": 1505, |
| "train_runtime": 321.8483, |
| "train_tokens_per_second": 2120.9 |
| }, |
| { |
| "epoch": 9.556962025316455, |
| "grad_norm": 0.042236328125, |
| "learning_rate": 0.018754095857782118, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 684912, |
| "step": 1510, |
| "train_runtime": 322.8354, |
| "train_tokens_per_second": 2121.552 |
| }, |
| { |
| "epoch": 9.58860759493671, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.01867382770803832, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 687216, |
| "step": 1515, |
| "train_runtime": 323.8209, |
| "train_tokens_per_second": 2122.21 |
| }, |
| { |
| "epoch": 9.620253164556962, |
| "grad_norm": 0.000713348388671875, |
| "learning_rate": 0.018593447486003202, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 689488, |
| "step": 1520, |
| "train_runtime": 324.803, |
| "train_tokens_per_second": 2122.788 |
| }, |
| { |
| "epoch": 9.651898734177216, |
| "grad_norm": 0.0004444122314453125, |
| "learning_rate": 0.018512957643723064, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 691760, |
| "step": 1525, |
| "train_runtime": 325.7751, |
| "train_tokens_per_second": 2123.428 |
| }, |
| { |
| "epoch": 9.683544303797468, |
| "grad_norm": 0.0284423828125, |
| "learning_rate": 0.01843236063658825, |
| "loss": 0.0106, |
| "num_input_tokens_seen": 694000, |
| "step": 1530, |
| "train_runtime": 326.7548, |
| "train_tokens_per_second": 2123.917 |
| }, |
| { |
| "epoch": 9.715189873417721, |
| "grad_norm": 0.10498046875, |
| "learning_rate": 0.018351658923258213, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 696400, |
| "step": 1535, |
| "train_runtime": 327.7566, |
| "train_tokens_per_second": 2124.747 |
| }, |
| { |
| "epoch": 9.746835443037975, |
| "grad_norm": 0.00018310546875, |
| "learning_rate": 0.018270854965586555, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 698640, |
| "step": 1540, |
| "train_runtime": 328.7367, |
| "train_tokens_per_second": 2125.226 |
| }, |
| { |
| "epoch": 9.778481012658228, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 0.018189951228545883, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 700848, |
| "step": 1545, |
| "train_runtime": 329.7056, |
| "train_tokens_per_second": 2125.678 |
| }, |
| { |
| "epoch": 9.810126582278482, |
| "grad_norm": 0.02392578125, |
| "learning_rate": 0.018108950180152635, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 703248, |
| "step": 1550, |
| "train_runtime": 330.7032, |
| "train_tokens_per_second": 2126.523 |
| }, |
| { |
| "epoch": 9.841772151898734, |
| "grad_norm": 0.048095703125, |
| "learning_rate": 0.018027854291391796, |
| "loss": 0.0099, |
| "num_input_tokens_seen": 705488, |
| "step": 1555, |
| "train_runtime": 331.675, |
| "train_tokens_per_second": 2127.046 |
| }, |
| { |
| "epoch": 9.873417721518987, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 0.017946666036141513, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 707728, |
| "step": 1560, |
| "train_runtime": 332.6455, |
| "train_tokens_per_second": 2127.575 |
| }, |
| { |
| "epoch": 9.905063291139241, |
| "grad_norm": 9.107589721679688e-05, |
| "learning_rate": 0.017865387891097616, |
| "loss": 0.002, |
| "num_input_tokens_seen": 709904, |
| "step": 1565, |
| "train_runtime": 333.6131, |
| "train_tokens_per_second": 2127.926 |
| }, |
| { |
| "epoch": 9.936708860759493, |
| "grad_norm": 0.0038604736328125, |
| "learning_rate": 0.017784022335698094, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 712208, |
| "step": 1570, |
| "train_runtime": 334.5996, |
| "train_tokens_per_second": 2128.538 |
| }, |
| { |
| "epoch": 9.968354430379748, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 0.01770257185204742, |
| "loss": 0.0091, |
| "num_input_tokens_seen": 714448, |
| "step": 1575, |
| "train_runtime": 335.5716, |
| "train_tokens_per_second": 2129.048 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.1015625, |
| "learning_rate": 0.017621038924840873, |
| "loss": 0.013, |
| "num_input_tokens_seen": 716448, |
| "step": 1580, |
| "train_runtime": 336.5105, |
| "train_tokens_per_second": 2129.051 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.06166619062423706, |
| "eval_runtime": 1.6665, |
| "eval_samples_per_second": 42.005, |
| "eval_steps_per_second": 10.801, |
| "num_input_tokens_seen": 716448, |
| "step": 1580 |
| }, |
| { |
| "epoch": 10.031645569620252, |
| "grad_norm": 0.0035858154296875, |
| "learning_rate": 0.017539426041288716, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 718880, |
| "step": 1585, |
| "train_runtime": 340.3212, |
| "train_tokens_per_second": 2112.358 |
| }, |
| { |
| "epoch": 10.063291139240507, |
| "grad_norm": 0.0025177001953125, |
| "learning_rate": 0.017457735691040317, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 721248, |
| "step": 1590, |
| "train_runtime": 341.3156, |
| "train_tokens_per_second": 2113.141 |
| }, |
| { |
| "epoch": 10.094936708860759, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 0.017375970366108225, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 723520, |
| "step": 1595, |
| "train_runtime": 342.3044, |
| "train_tokens_per_second": 2113.674 |
| }, |
| { |
| "epoch": 10.126582278481013, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 0.017294132560792125, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 725696, |
| "step": 1600, |
| "train_runtime": 343.2739, |
| "train_tokens_per_second": 2114.044 |
| }, |
| { |
| "epoch": 10.158227848101266, |
| "grad_norm": 0.00146484375, |
| "learning_rate": 0.017212224771602776, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 727968, |
| "step": 1605, |
| "train_runtime": 344.2492, |
| "train_tokens_per_second": 2114.654 |
| }, |
| { |
| "epoch": 10.189873417721518, |
| "grad_norm": 0.003021240234375, |
| "learning_rate": 0.01713024949718581, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 730304, |
| "step": 1610, |
| "train_runtime": 345.2405, |
| "train_tokens_per_second": 2115.349 |
| }, |
| { |
| "epoch": 10.221518987341772, |
| "grad_norm": 0.00019359588623046875, |
| "learning_rate": 0.01704820923824556, |
| "loss": 0.002, |
| "num_input_tokens_seen": 732672, |
| "step": 1615, |
| "train_runtime": 346.2421, |
| "train_tokens_per_second": 2116.068 |
| }, |
| { |
| "epoch": 10.253164556962025, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 0.01696610649746875, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 734880, |
| "step": 1620, |
| "train_runtime": 347.2145, |
| "train_tokens_per_second": 2116.501 |
| }, |
| { |
| "epoch": 10.284810126582279, |
| "grad_norm": 0.000904083251953125, |
| "learning_rate": 0.016883943779448123, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 737120, |
| "step": 1625, |
| "train_runtime": 348.1864, |
| "train_tokens_per_second": 2117.027 |
| }, |
| { |
| "epoch": 10.316455696202532, |
| "grad_norm": 0.00494384765625, |
| "learning_rate": 0.016801723590606086, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 739392, |
| "step": 1630, |
| "train_runtime": 349.176, |
| "train_tokens_per_second": 2117.534 |
| }, |
| { |
| "epoch": 10.348101265822784, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 0.016719448439118236, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 741632, |
| "step": 1635, |
| "train_runtime": 350.1489, |
| "train_tokens_per_second": 2118.048 |
| }, |
| { |
| "epoch": 10.379746835443038, |
| "grad_norm": 0.006439208984375, |
| "learning_rate": 0.016637120834836816, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 744000, |
| "step": 1640, |
| "train_runtime": 351.1394, |
| "train_tokens_per_second": 2118.816 |
| }, |
| { |
| "epoch": 10.41139240506329, |
| "grad_norm": 0.01116943359375, |
| "learning_rate": 0.016554743289214174, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 746272, |
| "step": 1645, |
| "train_runtime": 352.1151, |
| "train_tokens_per_second": 2119.398 |
| }, |
| { |
| "epoch": 10.443037974683545, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 0.016472318315226164, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 748576, |
| "step": 1650, |
| "train_runtime": 353.1023, |
| "train_tokens_per_second": 2119.998 |
| }, |
| { |
| "epoch": 10.474683544303797, |
| "grad_norm": 0.0002841949462890625, |
| "learning_rate": 0.016389848427295465, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 750944, |
| "step": 1655, |
| "train_runtime": 354.0847, |
| "train_tokens_per_second": 2120.803 |
| }, |
| { |
| "epoch": 10.50632911392405, |
| "grad_norm": 0.00634765625, |
| "learning_rate": 0.016307336141214875, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 753184, |
| "step": 1660, |
| "train_runtime": 355.1293, |
| "train_tokens_per_second": 2120.872 |
| }, |
| { |
| "epoch": 10.537974683544304, |
| "grad_norm": 4.4345855712890625e-05, |
| "learning_rate": 0.016224783974070574, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 755360, |
| "step": 1665, |
| "train_runtime": 356.0959, |
| "train_tokens_per_second": 2121.226 |
| }, |
| { |
| "epoch": 10.569620253164556, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 0.016142194444165342, |
| "loss": 0.001, |
| "num_input_tokens_seen": 757536, |
| "step": 1670, |
| "train_runtime": 357.0656, |
| "train_tokens_per_second": 2121.56 |
| }, |
| { |
| "epoch": 10.60126582278481, |
| "grad_norm": 6.818771362304688e-05, |
| "learning_rate": 0.01605957007094174, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 759808, |
| "step": 1675, |
| "train_runtime": 358.041, |
| "train_tokens_per_second": 2122.126 |
| }, |
| { |
| "epoch": 10.632911392405063, |
| "grad_norm": 8.678436279296875e-05, |
| "learning_rate": 0.015976913374905227, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 762208, |
| "step": 1680, |
| "train_runtime": 359.0375, |
| "train_tokens_per_second": 2122.92 |
| }, |
| { |
| "epoch": 10.664556962025316, |
| "grad_norm": 5.245208740234375e-05, |
| "learning_rate": 0.015894226877547296, |
| "loss": 0.001, |
| "num_input_tokens_seen": 764480, |
| "step": 1685, |
| "train_runtime": 360.0214, |
| "train_tokens_per_second": 2123.43 |
| }, |
| { |
| "epoch": 10.69620253164557, |
| "grad_norm": 0.0003814697265625, |
| "learning_rate": 0.015811513101268555, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 766720, |
| "step": 1690, |
| "train_runtime": 360.9952, |
| "train_tokens_per_second": 2123.906 |
| }, |
| { |
| "epoch": 10.727848101265822, |
| "grad_norm": 9.34600830078125e-05, |
| "learning_rate": 0.015728774569301763, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 768896, |
| "step": 1695, |
| "train_runtime": 361.9638, |
| "train_tokens_per_second": 2124.234 |
| }, |
| { |
| "epoch": 10.759493670886076, |
| "grad_norm": 0.10400390625, |
| "learning_rate": 0.015646013805634868, |
| "loss": 0.002, |
| "num_input_tokens_seen": 771200, |
| "step": 1700, |
| "train_runtime": 362.9508, |
| "train_tokens_per_second": 2124.806 |
| }, |
| { |
| "epoch": 10.791139240506329, |
| "grad_norm": 0.033203125, |
| "learning_rate": 0.015563233334934002, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 773440, |
| "step": 1705, |
| "train_runtime": 363.9338, |
| "train_tokens_per_second": 2125.222 |
| }, |
| { |
| "epoch": 10.822784810126583, |
| "grad_norm": 0.001800537109375, |
| "learning_rate": 0.01548043568246649, |
| "loss": 0.002, |
| "num_input_tokens_seen": 775584, |
| "step": 1710, |
| "train_runtime": 364.8998, |
| "train_tokens_per_second": 2125.471 |
| }, |
| { |
| "epoch": 10.854430379746836, |
| "grad_norm": 0.000614166259765625, |
| "learning_rate": 0.01539762337402378, |
| "loss": 0.0053, |
| "num_input_tokens_seen": 777824, |
| "step": 1715, |
| "train_runtime": 365.8738, |
| "train_tokens_per_second": 2125.935 |
| }, |
| { |
| "epoch": 10.886075949367088, |
| "grad_norm": 0.00077056884765625, |
| "learning_rate": 0.015314798935844417, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 780160, |
| "step": 1720, |
| "train_runtime": 366.8549, |
| "train_tokens_per_second": 2126.617 |
| }, |
| { |
| "epoch": 10.917721518987342, |
| "grad_norm": 0.0498046875, |
| "learning_rate": 0.015231964894536964, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 782528, |
| "step": 1725, |
| "train_runtime": 367.8465, |
| "train_tokens_per_second": 2127.322 |
| }, |
| { |
| "epoch": 10.949367088607595, |
| "grad_norm": 0.01153564453125, |
| "learning_rate": 0.015149123777002947, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 784832, |
| "step": 1730, |
| "train_runtime": 368.8341, |
| "train_tokens_per_second": 2127.873 |
| }, |
| { |
| "epoch": 10.981012658227849, |
| "grad_norm": 0.0004596710205078125, |
| "learning_rate": 0.015066278110359738, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 787136, |
| "step": 1735, |
| "train_runtime": 369.8218, |
| "train_tokens_per_second": 2128.42 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.08956073224544525, |
| "eval_runtime": 1.7158, |
| "eval_samples_per_second": 40.797, |
| "eval_steps_per_second": 10.491, |
| "num_input_tokens_seen": 788256, |
| "step": 1738 |
| }, |
| { |
| "epoch": 11.012658227848101, |
| "grad_norm": 0.0030975341796875, |
| "learning_rate": 0.014983430421863501, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 789184, |
| "step": 1740, |
| "train_runtime": 373.3617, |
| "train_tokens_per_second": 2113.725 |
| }, |
| { |
| "epoch": 11.044303797468354, |
| "grad_norm": 0.0029449462890625, |
| "learning_rate": 0.014900583238832062, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 791328, |
| "step": 1745, |
| "train_runtime": 374.327, |
| "train_tokens_per_second": 2114.002 |
| }, |
| { |
| "epoch": 11.075949367088608, |
| "grad_norm": 6.246566772460938e-05, |
| "learning_rate": 0.014817739088567832, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 793536, |
| "step": 1750, |
| "train_runtime": 375.2996, |
| "train_tokens_per_second": 2114.406 |
| }, |
| { |
| "epoch": 11.10759493670886, |
| "grad_norm": 0.0014801025390625, |
| "learning_rate": 0.014734900498280717, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 795744, |
| "step": 1755, |
| "train_runtime": 376.274, |
| "train_tokens_per_second": 2114.799 |
| }, |
| { |
| "epoch": 11.139240506329115, |
| "grad_norm": 0.00112152099609375, |
| "learning_rate": 0.014652069995011003, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 797984, |
| "step": 1760, |
| "train_runtime": 377.247, |
| "train_tokens_per_second": 2115.283 |
| }, |
| { |
| "epoch": 11.170886075949367, |
| "grad_norm": 0.005828857421875, |
| "learning_rate": 0.014569250105552262, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 800352, |
| "step": 1765, |
| "train_runtime": 378.2415, |
| "train_tokens_per_second": 2115.982 |
| }, |
| { |
| "epoch": 11.20253164556962, |
| "grad_norm": 0.0012054443359375, |
| "learning_rate": 0.014486443356374317, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 802592, |
| "step": 1770, |
| "train_runtime": 379.2202, |
| "train_tokens_per_second": 2116.427 |
| }, |
| { |
| "epoch": 11.234177215189874, |
| "grad_norm": 0.0218505859375, |
| "learning_rate": 0.014403652273546117, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 804896, |
| "step": 1775, |
| "train_runtime": 380.1991, |
| "train_tokens_per_second": 2117.038 |
| }, |
| { |
| "epoch": 11.265822784810126, |
| "grad_norm": 0.0027313232421875, |
| "learning_rate": 0.014320879382658702, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 807168, |
| "step": 1780, |
| "train_runtime": 381.1834, |
| "train_tokens_per_second": 2117.532 |
| }, |
| { |
| "epoch": 11.29746835443038, |
| "grad_norm": 0.00010204315185546875, |
| "learning_rate": 0.014238127208748164, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 809408, |
| "step": 1785, |
| "train_runtime": 382.157, |
| "train_tokens_per_second": 2117.999 |
| }, |
| { |
| "epoch": 11.329113924050633, |
| "grad_norm": 0.00174713134765625, |
| "learning_rate": 0.014155398276218605, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 811680, |
| "step": 1790, |
| "train_runtime": 383.1444, |
| "train_tokens_per_second": 2118.47 |
| }, |
| { |
| "epoch": 11.360759493670885, |
| "grad_norm": 0.00019073486328125, |
| "learning_rate": 0.014072695108765128, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 813824, |
| "step": 1795, |
| "train_runtime": 384.1109, |
| "train_tokens_per_second": 2118.722 |
| }, |
| { |
| "epoch": 11.39240506329114, |
| "grad_norm": 0.001312255859375, |
| "learning_rate": 0.013990020229296886, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 816096, |
| "step": 1800, |
| "train_runtime": 385.0871, |
| "train_tokens_per_second": 2119.25 |
| }, |
| { |
| "epoch": 11.424050632911392, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 0.013907376159860046, |
| "loss": 0.003, |
| "num_input_tokens_seen": 818496, |
| "step": 1805, |
| "train_runtime": 386.0926, |
| "train_tokens_per_second": 2119.947 |
| }, |
| { |
| "epoch": 11.455696202531646, |
| "grad_norm": 4.673004150390625e-05, |
| "learning_rate": 0.013824765421560938, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 820704, |
| "step": 1810, |
| "train_runtime": 387.0636, |
| "train_tokens_per_second": 2120.334 |
| }, |
| { |
| "epoch": 11.487341772151899, |
| "grad_norm": 0.0029754638671875, |
| "learning_rate": 0.013742190534489085, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 823040, |
| "step": 1815, |
| "train_runtime": 388.0546, |
| "train_tokens_per_second": 2120.938 |
| }, |
| { |
| "epoch": 11.518987341772151, |
| "grad_norm": 0.00070953369140625, |
| "learning_rate": 0.013659654017640343, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 825248, |
| "step": 1820, |
| "train_runtime": 389.0274, |
| "train_tokens_per_second": 2121.311 |
| }, |
| { |
| "epoch": 11.550632911392405, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 0.013577158388840075, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 827552, |
| "step": 1825, |
| "train_runtime": 390.0152, |
| "train_tokens_per_second": 2121.846 |
| }, |
| { |
| "epoch": 11.582278481012658, |
| "grad_norm": 0.00115203857421875, |
| "learning_rate": 0.013494706164666324, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 829856, |
| "step": 1830, |
| "train_runtime": 391.0116, |
| "train_tokens_per_second": 2122.331 |
| }, |
| { |
| "epoch": 11.613924050632912, |
| "grad_norm": 0.00189971923828125, |
| "learning_rate": 0.013412299860373046, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 832192, |
| "step": 1835, |
| "train_runtime": 392.014, |
| "train_tokens_per_second": 2122.863 |
| }, |
| { |
| "epoch": 11.645569620253164, |
| "grad_norm": 0.0007171630859375, |
| "learning_rate": 0.013329941989813392, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 834368, |
| "step": 1840, |
| "train_runtime": 392.9826, |
| "train_tokens_per_second": 2123.168 |
| }, |
| { |
| "epoch": 11.677215189873417, |
| "grad_norm": 0.00390625, |
| "learning_rate": 0.013247635065363007, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 836672, |
| "step": 1845, |
| "train_runtime": 393.9706, |
| "train_tokens_per_second": 2123.692 |
| }, |
| { |
| "epoch": 11.708860759493671, |
| "grad_norm": 5.316734313964844e-05, |
| "learning_rate": 0.013165381597843384, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 838912, |
| "step": 1850, |
| "train_runtime": 394.9452, |
| "train_tokens_per_second": 2124.122 |
| }, |
| { |
| "epoch": 11.740506329113924, |
| "grad_norm": 0.001220703125, |
| "learning_rate": 0.013083184096445313, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 841216, |
| "step": 1855, |
| "train_runtime": 395.9332, |
| "train_tokens_per_second": 2124.641 |
| }, |
| { |
| "epoch": 11.772151898734178, |
| "grad_norm": 0.0047607421875, |
| "learning_rate": 0.013001045068652269, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 843520, |
| "step": 1860, |
| "train_runtime": 396.9205, |
| "train_tokens_per_second": 2125.161 |
| }, |
| { |
| "epoch": 11.80379746835443, |
| "grad_norm": 6.580352783203125e-05, |
| "learning_rate": 0.012918967020163976, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 845824, |
| "step": 1865, |
| "train_runtime": 397.9019, |
| "train_tokens_per_second": 2125.71 |
| }, |
| { |
| "epoch": 11.835443037974684, |
| "grad_norm": 3.886222839355469e-05, |
| "learning_rate": 0.012836952454819943, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 848160, |
| "step": 1870, |
| "train_runtime": 398.8925, |
| "train_tokens_per_second": 2126.287 |
| }, |
| { |
| "epoch": 11.867088607594937, |
| "grad_norm": 0.00124359130859375, |
| "learning_rate": 0.012755003874523082, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 850400, |
| "step": 1875, |
| "train_runtime": 399.8677, |
| "train_tokens_per_second": 2126.704 |
| }, |
| { |
| "epoch": 11.89873417721519, |
| "grad_norm": 0.00102996826171875, |
| "learning_rate": 0.012673123779163402, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 852736, |
| "step": 1880, |
| "train_runtime": 400.8519, |
| "train_tokens_per_second": 2127.309 |
| }, |
| { |
| "epoch": 11.930379746835444, |
| "grad_norm": 0.0012359619140625, |
| "learning_rate": 0.01259131466654173, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 855072, |
| "step": 1885, |
| "train_runtime": 401.845, |
| "train_tokens_per_second": 2127.865 |
| }, |
| { |
| "epoch": 11.962025316455696, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 0.012509579032293525, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 857312, |
| "step": 1890, |
| "train_runtime": 402.8194, |
| "train_tokens_per_second": 2128.279 |
| }, |
| { |
| "epoch": 11.99367088607595, |
| "grad_norm": 5.459785461425781e-05, |
| "learning_rate": 0.012427919369812754, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 859584, |
| "step": 1895, |
| "train_runtime": 403.7997, |
| "train_tokens_per_second": 2128.739 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.09028957784175873, |
| "eval_runtime": 1.6669, |
| "eval_samples_per_second": 41.995, |
| "eval_steps_per_second": 10.799, |
| "num_input_tokens_seen": 859808, |
| "step": 1896 |
| }, |
| { |
| "epoch": 12.025316455696203, |
| "grad_norm": 0.00299072265625, |
| "learning_rate": 0.012346338170175808, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 861632, |
| "step": 1900, |
| "train_runtime": 407.3224, |
| "train_tokens_per_second": 2115.356 |
| }, |
| { |
| "epoch": 12.056962025316455, |
| "grad_norm": 0.001861572265625, |
| "learning_rate": 0.012264837922065518, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 863808, |
| "step": 1905, |
| "train_runtime": 408.2983, |
| "train_tokens_per_second": 2115.63 |
| }, |
| { |
| "epoch": 12.08860759493671, |
| "grad_norm": 0.00023746490478515625, |
| "learning_rate": 0.012183421111695262, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 866048, |
| "step": 1910, |
| "train_runtime": 409.2823, |
| "train_tokens_per_second": 2116.016 |
| }, |
| { |
| "epoch": 12.120253164556962, |
| "grad_norm": 0.000736236572265625, |
| "learning_rate": 0.012102090222733081, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 868352, |
| "step": 1915, |
| "train_runtime": 410.2801, |
| "train_tokens_per_second": 2116.486 |
| }, |
| { |
| "epoch": 12.151898734177216, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 0.012020847736225939, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 870560, |
| "step": 1920, |
| "train_runtime": 411.2539, |
| "train_tokens_per_second": 2116.843 |
| }, |
| { |
| "epoch": 12.183544303797468, |
| "grad_norm": 0.00604248046875, |
| "learning_rate": 0.011939696130524032, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 872768, |
| "step": 1925, |
| "train_runtime": 412.226, |
| "train_tokens_per_second": 2117.208 |
| }, |
| { |
| "epoch": 12.215189873417721, |
| "grad_norm": 5.1021575927734375e-05, |
| "learning_rate": 0.011858637881205177, |
| "loss": 0.0, |
| "num_input_tokens_seen": 874976, |
| "step": 1930, |
| "train_runtime": 413.1976, |
| "train_tokens_per_second": 2117.573 |
| }, |
| { |
| "epoch": 12.246835443037975, |
| "grad_norm": 0.0023193359375, |
| "learning_rate": 0.011777675460999311, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 877344, |
| "step": 1935, |
| "train_runtime": 414.1922, |
| "train_tokens_per_second": 2118.205 |
| }, |
| { |
| "epoch": 12.278481012658228, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 0.01169681133971304, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 879584, |
| "step": 1940, |
| "train_runtime": 415.1661, |
| "train_tokens_per_second": 2118.632 |
| }, |
| { |
| "epoch": 12.310126582278482, |
| "grad_norm": 0.00015926361083984375, |
| "learning_rate": 0.011616047984154299, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 881760, |
| "step": 1945, |
| "train_runtime": 416.1354, |
| "train_tokens_per_second": 2118.926 |
| }, |
| { |
| "epoch": 12.341772151898734, |
| "grad_norm": 0.0009613037109375, |
| "learning_rate": 0.011535387858057114, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 883968, |
| "step": 1950, |
| "train_runtime": 417.1069, |
| "train_tokens_per_second": 2119.284 |
| }, |
| { |
| "epoch": 12.373417721518987, |
| "grad_norm": 0.000431060791015625, |
| "learning_rate": 0.011454833422006427, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 886144, |
| "step": 1955, |
| "train_runtime": 418.0757, |
| "train_tokens_per_second": 2119.578 |
| }, |
| { |
| "epoch": 12.405063291139241, |
| "grad_norm": 5.8650970458984375e-05, |
| "learning_rate": 0.011374387133363046, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 888448, |
| "step": 1960, |
| "train_runtime": 419.0588, |
| "train_tokens_per_second": 2120.103 |
| }, |
| { |
| "epoch": 12.436708860759493, |
| "grad_norm": 4.1484832763671875e-05, |
| "learning_rate": 0.01129405144618868, |
| "loss": 0.0, |
| "num_input_tokens_seen": 890720, |
| "step": 1965, |
| "train_runtime": 420.0388, |
| "train_tokens_per_second": 2120.566 |
| }, |
| { |
| "epoch": 12.468354430379748, |
| "grad_norm": 0.0004100799560546875, |
| "learning_rate": 0.01121382881117107, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 892992, |
| "step": 1970, |
| "train_runtime": 421.0157, |
| "train_tokens_per_second": 2121.042 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.000186920166015625, |
| "learning_rate": 0.011133721675549232, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 895360, |
| "step": 1975, |
| "train_runtime": 422.0134, |
| "train_tokens_per_second": 2121.639 |
| }, |
| { |
| "epoch": 12.531645569620252, |
| "grad_norm": 6.079673767089844e-05, |
| "learning_rate": 0.011053732483038824, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 897696, |
| "step": 1980, |
| "train_runtime": 423.0131, |
| "train_tokens_per_second": 2122.147 |
| }, |
| { |
| "epoch": 12.563291139240507, |
| "grad_norm": 0.000579833984375, |
| "learning_rate": 0.010973863673757548, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 900032, |
| "step": 1985, |
| "train_runtime": 424.0043, |
| "train_tokens_per_second": 2122.695 |
| }, |
| { |
| "epoch": 12.594936708860759, |
| "grad_norm": 3.62396240234375e-05, |
| "learning_rate": 0.010894117684150773, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 902336, |
| "step": 1990, |
| "train_runtime": 424.9831, |
| "train_tokens_per_second": 2123.228 |
| }, |
| { |
| "epoch": 12.626582278481013, |
| "grad_norm": 0.00020122528076171875, |
| "learning_rate": 0.010814496946917168, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 904512, |
| "step": 1995, |
| "train_runtime": 425.953, |
| "train_tokens_per_second": 2123.502 |
| }, |
| { |
| "epoch": 12.658227848101266, |
| "grad_norm": 0.0019073486328125, |
| "learning_rate": 0.010735003890934494, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 906944, |
| "step": 2000, |
| "train_runtime": 426.96, |
| "train_tokens_per_second": 2124.189 |
| }, |
| { |
| "epoch": 12.689873417721518, |
| "grad_norm": 0.00109100341796875, |
| "learning_rate": 0.010655640941185544, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 909280, |
| "step": 2005, |
| "train_runtime": 427.9421, |
| "train_tokens_per_second": 2124.774 |
| }, |
| { |
| "epoch": 12.721518987341772, |
| "grad_norm": 0.00040435791015625, |
| "learning_rate": 0.010576410518684127, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 911424, |
| "step": 2010, |
| "train_runtime": 428.9095, |
| "train_tokens_per_second": 2124.98 |
| }, |
| { |
| "epoch": 12.753164556962025, |
| "grad_norm": 0.0059814453125, |
| "learning_rate": 0.01049731504040122, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 913760, |
| "step": 2015, |
| "train_runtime": 429.8934, |
| "train_tokens_per_second": 2125.55 |
| }, |
| { |
| "epoch": 12.784810126582279, |
| "grad_norm": 0.00019359588623046875, |
| "learning_rate": 0.010418356919191284, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 916096, |
| "step": 2020, |
| "train_runtime": 430.8842, |
| "train_tokens_per_second": 2126.084 |
| }, |
| { |
| "epoch": 12.816455696202532, |
| "grad_norm": 0.0004558563232421875, |
| "learning_rate": 0.010339538563718576, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 918336, |
| "step": 2025, |
| "train_runtime": 431.8594, |
| "train_tokens_per_second": 2126.47 |
| }, |
| { |
| "epoch": 12.848101265822784, |
| "grad_norm": 0.00095367431640625, |
| "learning_rate": 0.010260862378383738, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 920672, |
| "step": 2030, |
| "train_runtime": 432.8498, |
| "train_tokens_per_second": 2127.001 |
| }, |
| { |
| "epoch": 12.879746835443038, |
| "grad_norm": 7.200241088867188e-05, |
| "learning_rate": 0.01018233076325044, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 923072, |
| "step": 2035, |
| "train_runtime": 433.845, |
| "train_tokens_per_second": 2127.654 |
| }, |
| { |
| "epoch": 12.91139240506329, |
| "grad_norm": 0.00518798828125, |
| "learning_rate": 0.01010394611397213, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 925440, |
| "step": 2040, |
| "train_runtime": 434.8489, |
| "train_tokens_per_second": 2128.188 |
| }, |
| { |
| "epoch": 12.943037974683545, |
| "grad_norm": 0.0045166015625, |
| "learning_rate": 0.010025710821718983, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 927744, |
| "step": 2045, |
| "train_runtime": 435.8295, |
| "train_tokens_per_second": 2128.685 |
| }, |
| { |
| "epoch": 12.974683544303797, |
| "grad_norm": 5.7220458984375e-05, |
| "learning_rate": 0.009947627273104958, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 929952, |
| "step": 2050, |
| "train_runtime": 436.8026, |
| "train_tokens_per_second": 2128.998 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.09263655543327332, |
| "eval_runtime": 1.678, |
| "eval_samples_per_second": 41.718, |
| "eval_steps_per_second": 10.727, |
| "num_input_tokens_seen": 931472, |
| "step": 2054 |
| }, |
| { |
| "epoch": 13.00632911392405, |
| "grad_norm": 0.005462646484375, |
| "learning_rate": 0.00986969785011497, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 931952, |
| "step": 2055, |
| "train_runtime": 440.2919, |
| "train_tokens_per_second": 2116.669 |
| }, |
| { |
| "epoch": 13.037974683544304, |
| "grad_norm": 0.002655029296875, |
| "learning_rate": 0.009791924930032251, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 934288, |
| "step": 2060, |
| "train_runtime": 441.2826, |
| "train_tokens_per_second": 2117.21 |
| }, |
| { |
| "epoch": 13.069620253164556, |
| "grad_norm": 0.0004215240478515625, |
| "learning_rate": 0.00971431088536582, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 936624, |
| "step": 2065, |
| "train_runtime": 442.2743, |
| "train_tokens_per_second": 2117.745 |
| }, |
| { |
| "epoch": 13.10126582278481, |
| "grad_norm": 0.0003299713134765625, |
| "learning_rate": 0.009636858083778092, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 938928, |
| "step": 2070, |
| "train_runtime": 443.2555, |
| "train_tokens_per_second": 2118.254 |
| }, |
| { |
| "epoch": 13.132911392405063, |
| "grad_norm": 0.0010528564453125, |
| "learning_rate": 0.00955956888801269, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 941232, |
| "step": 2075, |
| "train_runtime": 444.2469, |
| "train_tokens_per_second": 2118.714 |
| }, |
| { |
| "epoch": 13.164556962025316, |
| "grad_norm": 0.00115203857421875, |
| "learning_rate": 0.009482445655822326, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 943440, |
| "step": 2080, |
| "train_runtime": 445.2222, |
| "train_tokens_per_second": 2119.032 |
| }, |
| { |
| "epoch": 13.19620253164557, |
| "grad_norm": 0.0001506805419921875, |
| "learning_rate": 0.009405490739896898, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 945648, |
| "step": 2085, |
| "train_runtime": 446.1949, |
| "train_tokens_per_second": 2119.361 |
| }, |
| { |
| "epoch": 13.227848101265822, |
| "grad_norm": 3.814697265625e-05, |
| "learning_rate": 0.009328706487791726, |
| "loss": 0.0, |
| "num_input_tokens_seen": 947920, |
| "step": 2090, |
| "train_runtime": 447.1719, |
| "train_tokens_per_second": 2119.811 |
| }, |
| { |
| "epoch": 13.259493670886076, |
| "grad_norm": 4.601478576660156e-05, |
| "learning_rate": 0.009252095241855923, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 950288, |
| "step": 2095, |
| "train_runtime": 448.1741, |
| "train_tokens_per_second": 2120.354 |
| }, |
| { |
| "epoch": 13.291139240506329, |
| "grad_norm": 0.00341796875, |
| "learning_rate": 0.009175659339160935, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 952560, |
| "step": 2100, |
| "train_runtime": 449.1515, |
| "train_tokens_per_second": 2120.799 |
| }, |
| { |
| "epoch": 13.322784810126583, |
| "grad_norm": 3.504753112792969e-05, |
| "learning_rate": 0.009099401111429277, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 954864, |
| "step": 2105, |
| "train_runtime": 450.1402, |
| "train_tokens_per_second": 2121.259 |
| }, |
| { |
| "epoch": 13.354430379746836, |
| "grad_norm": 0.004058837890625, |
| "learning_rate": 0.009023322884963372, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 957168, |
| "step": 2110, |
| "train_runtime": 451.1194, |
| "train_tokens_per_second": 2121.762 |
| }, |
| { |
| "epoch": 13.386075949367088, |
| "grad_norm": 0.00213623046875, |
| "learning_rate": 0.008947426980574607, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 959408, |
| "step": 2115, |
| "train_runtime": 452.094, |
| "train_tokens_per_second": 2122.143 |
| }, |
| { |
| "epoch": 13.417721518987342, |
| "grad_norm": 0.00049591064453125, |
| "learning_rate": 0.008871715713512522, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 961648, |
| "step": 2120, |
| "train_runtime": 453.0774, |
| "train_tokens_per_second": 2122.481 |
| }, |
| { |
| "epoch": 13.449367088607595, |
| "grad_norm": 0.001617431640625, |
| "learning_rate": 0.008796191393394177, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 964016, |
| "step": 2125, |
| "train_runtime": 454.0727, |
| "train_tokens_per_second": 2123.043 |
| }, |
| { |
| "epoch": 13.481012658227849, |
| "grad_norm": 0.002288818359375, |
| "learning_rate": 0.00872085632413372, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 966256, |
| "step": 2130, |
| "train_runtime": 455.0563, |
| "train_tokens_per_second": 2123.377 |
| }, |
| { |
| "epoch": 13.512658227848101, |
| "grad_norm": 4.649162292480469e-05, |
| "learning_rate": 0.008645712803872083, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 968432, |
| "step": 2135, |
| "train_runtime": 456.0259, |
| "train_tokens_per_second": 2123.634 |
| }, |
| { |
| "epoch": 13.544303797468354, |
| "grad_norm": 0.00433349609375, |
| "learning_rate": 0.008570763124906865, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 970672, |
| "step": 2140, |
| "train_runtime": 457.0, |
| "train_tokens_per_second": 2124.009 |
| }, |
| { |
| "epoch": 13.575949367088608, |
| "grad_norm": 4.124641418457031e-05, |
| "learning_rate": 0.00849600957362246, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 972912, |
| "step": 2145, |
| "train_runtime": 457.9749, |
| "train_tokens_per_second": 2124.378 |
| }, |
| { |
| "epoch": 13.60759493670886, |
| "grad_norm": 0.000522613525390625, |
| "learning_rate": 0.008421454430420234, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 975152, |
| "step": 2150, |
| "train_runtime": 458.9584, |
| "train_tokens_per_second": 2124.707 |
| }, |
| { |
| "epoch": 13.639240506329115, |
| "grad_norm": 0.0002288818359375, |
| "learning_rate": 0.008347099969649014, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 977584, |
| "step": 2155, |
| "train_runtime": 459.9591, |
| "train_tokens_per_second": 2125.372 |
| }, |
| { |
| "epoch": 13.670886075949367, |
| "grad_norm": 9.584426879882812e-05, |
| "learning_rate": 0.008272948459535695, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 979856, |
| "step": 2160, |
| "train_runtime": 460.9457, |
| "train_tokens_per_second": 2125.751 |
| }, |
| { |
| "epoch": 13.70253164556962, |
| "grad_norm": 6.151199340820312e-05, |
| "learning_rate": 0.008199002162116022, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 982256, |
| "step": 2165, |
| "train_runtime": 461.9527, |
| "train_tokens_per_second": 2126.313 |
| }, |
| { |
| "epoch": 13.734177215189874, |
| "grad_norm": 4.220008850097656e-05, |
| "learning_rate": 0.008125263333165628, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 984592, |
| "step": 2170, |
| "train_runtime": 462.9443, |
| "train_tokens_per_second": 2126.805 |
| }, |
| { |
| "epoch": 13.765822784810126, |
| "grad_norm": 0.0002765655517578125, |
| "learning_rate": 0.008051734222131186, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 986864, |
| "step": 2175, |
| "train_runtime": 463.9248, |
| "train_tokens_per_second": 2127.207 |
| }, |
| { |
| "epoch": 13.79746835443038, |
| "grad_norm": 0.006378173828125, |
| "learning_rate": 0.00797841707206179, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 989136, |
| "step": 2180, |
| "train_runtime": 464.9127, |
| "train_tokens_per_second": 2127.574 |
| }, |
| { |
| "epoch": 13.829113924050633, |
| "grad_norm": 0.000225067138671875, |
| "learning_rate": 0.00790531411954057, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 991440, |
| "step": 2185, |
| "train_runtime": 465.9052, |
| "train_tokens_per_second": 2127.986 |
| }, |
| { |
| "epoch": 13.860759493670885, |
| "grad_norm": 0.002593994140625, |
| "learning_rate": 0.007832427594616397, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 993712, |
| "step": 2190, |
| "train_runtime": 466.8825, |
| "train_tokens_per_second": 2128.398 |
| }, |
| { |
| "epoch": 13.89240506329114, |
| "grad_norm": 0.0012664794921875, |
| "learning_rate": 0.0077597597207359125, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 995920, |
| "step": 2195, |
| "train_runtime": 467.8545, |
| "train_tokens_per_second": 2128.696 |
| }, |
| { |
| "epoch": 13.924050632911392, |
| "grad_norm": 0.0002346038818359375, |
| "learning_rate": 0.007687312714675674, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 998224, |
| "step": 2200, |
| "train_runtime": 468.8346, |
| "train_tokens_per_second": 2129.16 |
| }, |
| { |
| "epoch": 13.955696202531646, |
| "grad_norm": 0.0023193359375, |
| "learning_rate": 0.007615088786474526, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1000400, |
| "step": 2205, |
| "train_runtime": 469.8044, |
| "train_tokens_per_second": 2129.397 |
| }, |
| { |
| "epoch": 13.987341772151899, |
| "grad_norm": 0.000194549560546875, |
| "learning_rate": 0.0075430901393662, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1002640, |
| "step": 2210, |
| "train_runtime": 470.7794, |
| "train_tokens_per_second": 2129.745 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.09213147312402725, |
| "eval_runtime": 1.6795, |
| "eval_samples_per_second": 41.679, |
| "eval_steps_per_second": 10.718, |
| "num_input_tokens_seen": 1003376, |
| "step": 2212 |
| }, |
| { |
| "epoch": 14.018987341772151, |
| "grad_norm": 0.0019989013671875, |
| "learning_rate": 0.007471318969712099, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1004752, |
| "step": 2215, |
| "train_runtime": 474.323, |
| "train_tokens_per_second": 2118.286 |
| }, |
| { |
| "epoch": 14.050632911392405, |
| "grad_norm": 0.0001735687255859375, |
| "learning_rate": 0.007399777466934275, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1007120, |
| "step": 2220, |
| "train_runtime": 475.3503, |
| "train_tokens_per_second": 2118.69 |
| }, |
| { |
| "epoch": 14.082278481012658, |
| "grad_norm": 0.0034332275390625, |
| "learning_rate": 0.007328467813448668, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1009520, |
| "step": 2225, |
| "train_runtime": 476.3596, |
| "train_tokens_per_second": 2119.24 |
| }, |
| { |
| "epoch": 14.113924050632912, |
| "grad_norm": 0.003326416015625, |
| "learning_rate": 0.007257392184598517, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1011792, |
| "step": 2230, |
| "train_runtime": 477.3371, |
| "train_tokens_per_second": 2119.659 |
| }, |
| { |
| "epoch": 14.145569620253164, |
| "grad_norm": 0.00012111663818359375, |
| "learning_rate": 0.007186552748587997, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1014032, |
| "step": 2235, |
| "train_runtime": 478.3216, |
| "train_tokens_per_second": 2119.98 |
| }, |
| { |
| "epoch": 14.177215189873417, |
| "grad_norm": 0.0028839111328125, |
| "learning_rate": 0.00711595166641609, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1016336, |
| "step": 2240, |
| "train_runtime": 479.3034, |
| "train_tokens_per_second": 2120.444 |
| }, |
| { |
| "epoch": 14.208860759493671, |
| "grad_norm": 0.001068115234375, |
| "learning_rate": 0.007045591091810634, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1018672, |
| "step": 2245, |
| "train_runtime": 480.2951, |
| "train_tokens_per_second": 2120.929 |
| }, |
| { |
| "epoch": 14.240506329113924, |
| "grad_norm": 0.0002155303955078125, |
| "learning_rate": 0.006975473171162659, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1020976, |
| "step": 2250, |
| "train_runtime": 481.2844, |
| "train_tokens_per_second": 2121.357 |
| }, |
| { |
| "epoch": 14.272151898734178, |
| "grad_norm": 0.006011962890625, |
| "learning_rate": 0.006905600043460891, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1023120, |
| "step": 2255, |
| "train_runtime": 482.2561, |
| "train_tokens_per_second": 2121.529 |
| }, |
| { |
| "epoch": 14.30379746835443, |
| "grad_norm": 4.935264587402344e-05, |
| "learning_rate": 0.006835973840226484, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1025360, |
| "step": 2260, |
| "train_runtime": 483.2319, |
| "train_tokens_per_second": 2121.88 |
| }, |
| { |
| "epoch": 14.335443037974684, |
| "grad_norm": 0.002716064453125, |
| "learning_rate": 0.006766596685448035, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1027664, |
| "step": 2265, |
| "train_runtime": 484.2201, |
| "train_tokens_per_second": 2122.308 |
| }, |
| { |
| "epoch": 14.367088607594937, |
| "grad_norm": 6.437301635742188e-05, |
| "learning_rate": 0.006697470695516768, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1029936, |
| "step": 2270, |
| "train_runtime": 485.2006, |
| "train_tokens_per_second": 2122.701 |
| }, |
| { |
| "epoch": 14.39873417721519, |
| "grad_norm": 4.5299530029296875e-05, |
| "learning_rate": 0.006628597979161958, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1032208, |
| "step": 2275, |
| "train_runtime": 486.187, |
| "train_tokens_per_second": 2123.068 |
| }, |
| { |
| "epoch": 14.430379746835444, |
| "grad_norm": 4.3392181396484375e-05, |
| "learning_rate": 0.006559980637386639, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1034416, |
| "step": 2280, |
| "train_runtime": 487.1593, |
| "train_tokens_per_second": 2123.363 |
| }, |
| { |
| "epoch": 14.462025316455696, |
| "grad_norm": 4.124641418457031e-05, |
| "learning_rate": 0.00649162076340348, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1036688, |
| "step": 2285, |
| "train_runtime": 488.1363, |
| "train_tokens_per_second": 2123.767 |
| }, |
| { |
| "epoch": 14.49367088607595, |
| "grad_norm": 4.029273986816406e-05, |
| "learning_rate": 0.006423520442570956, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1038960, |
| "step": 2290, |
| "train_runtime": 489.1261, |
| "train_tokens_per_second": 2124.115 |
| }, |
| { |
| "epoch": 14.525316455696203, |
| "grad_norm": 0.00012111663818359375, |
| "learning_rate": 0.006355681752329696, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1041072, |
| "step": 2295, |
| "train_runtime": 490.0912, |
| "train_tokens_per_second": 2124.241 |
| }, |
| { |
| "epoch": 14.556962025316455, |
| "grad_norm": 0.00171661376953125, |
| "learning_rate": 0.006288106762139153, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1043312, |
| "step": 2300, |
| "train_runtime": 491.0663, |
| "train_tokens_per_second": 2124.585 |
| }, |
| { |
| "epoch": 14.58860759493671, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 0.006220797533414447, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1045552, |
| "step": 2305, |
| "train_runtime": 492.0409, |
| "train_tokens_per_second": 2124.929 |
| }, |
| { |
| "epoch": 14.620253164556962, |
| "grad_norm": 0.001129150390625, |
| "learning_rate": 0.0061537561194634945, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1048048, |
| "step": 2310, |
| "train_runtime": 493.0522, |
| "train_tokens_per_second": 2125.633 |
| }, |
| { |
| "epoch": 14.651898734177216, |
| "grad_norm": 0.00201416015625, |
| "learning_rate": 0.006086984565424345, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1050384, |
| "step": 2315, |
| "train_runtime": 494.0443, |
| "train_tokens_per_second": 2126.093 |
| }, |
| { |
| "epoch": 14.683544303797468, |
| "grad_norm": 4.506111145019531e-05, |
| "learning_rate": 0.006020484908202826, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1052720, |
| "step": 2320, |
| "train_runtime": 495.0358, |
| "train_tokens_per_second": 2126.553 |
| }, |
| { |
| "epoch": 14.715189873417721, |
| "grad_norm": 0.00016689300537109375, |
| "learning_rate": 0.00595425917641039, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1054864, |
| "step": 2325, |
| "train_runtime": 496.0045, |
| "train_tokens_per_second": 2126.723 |
| }, |
| { |
| "epoch": 14.746835443037975, |
| "grad_norm": 0.00445556640625, |
| "learning_rate": 0.005888309390302235, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1057168, |
| "step": 2330, |
| "train_runtime": 496.9849, |
| "train_tokens_per_second": 2127.163 |
| }, |
| { |
| "epoch": 14.778481012658228, |
| "grad_norm": 0.00087738037109375, |
| "learning_rate": 0.005822637561715658, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1059376, |
| "step": 2335, |
| "train_runtime": 497.957, |
| "train_tokens_per_second": 2127.445 |
| }, |
| { |
| "epoch": 14.810126582278482, |
| "grad_norm": 0.00086212158203125, |
| "learning_rate": 0.005757245694008714, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1061680, |
| "step": 2340, |
| "train_runtime": 498.9369, |
| "train_tokens_per_second": 2127.884 |
| }, |
| { |
| "epoch": 14.841772151898734, |
| "grad_norm": 5.173683166503906e-05, |
| "learning_rate": 0.005692135781999078, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1063984, |
| "step": 2345, |
| "train_runtime": 499.9176, |
| "train_tokens_per_second": 2128.319 |
| }, |
| { |
| "epoch": 14.873417721518987, |
| "grad_norm": 6.103515625e-05, |
| "learning_rate": 0.005627309811903193, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1066352, |
| "step": 2350, |
| "train_runtime": 500.9022, |
| "train_tokens_per_second": 2128.863 |
| }, |
| { |
| "epoch": 14.905063291139241, |
| "grad_norm": 4.76837158203125e-05, |
| "learning_rate": 0.005562769761275697, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1068528, |
| "step": 2355, |
| "train_runtime": 501.8805, |
| "train_tokens_per_second": 2129.049 |
| }, |
| { |
| "epoch": 14.936708860759493, |
| "grad_norm": 0.0012054443359375, |
| "learning_rate": 0.005498517598949082, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1070864, |
| "step": 2360, |
| "train_runtime": 502.8728, |
| "train_tokens_per_second": 2129.493 |
| }, |
| { |
| "epoch": 14.968354430379748, |
| "grad_norm": 0.004119873046875, |
| "learning_rate": 0.005434555284973631, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1073072, |
| "step": 2365, |
| "train_runtime": 503.8466, |
| "train_tokens_per_second": 2129.759 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.0003948211669921875, |
| "learning_rate": 0.005370884770557645, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 1075088, |
| "step": 2370, |
| "train_runtime": 504.7907, |
| "train_tokens_per_second": 2129.77 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.09199367463588715, |
| "eval_runtime": 1.6755, |
| "eval_samples_per_second": 41.779, |
| "eval_steps_per_second": 10.743, |
| "num_input_tokens_seen": 1075088, |
| "step": 2370 |
| }, |
| { |
| "epoch": 15.031645569620252, |
| "grad_norm": 0.0025787353515625, |
| "learning_rate": 0.0053075079980078824, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1077296, |
| "step": 2375, |
| "train_runtime": 508.5507, |
| "train_tokens_per_second": 2118.365 |
| }, |
| { |
| "epoch": 15.063291139240507, |
| "grad_norm": 0.0017547607421875, |
| "learning_rate": 0.005244426900670356, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1079440, |
| "step": 2380, |
| "train_runtime": 509.5263, |
| "train_tokens_per_second": 2118.517 |
| }, |
| { |
| "epoch": 15.094936708860759, |
| "grad_norm": 0.00128173828125, |
| "learning_rate": 0.0051816434028713245, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1081648, |
| "step": 2385, |
| "train_runtime": 510.5011, |
| "train_tokens_per_second": 2118.796 |
| }, |
| { |
| "epoch": 15.126582278481013, |
| "grad_norm": 0.0027618408203125, |
| "learning_rate": 0.005119159419858583, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1083952, |
| "step": 2390, |
| "train_runtime": 511.4794, |
| "train_tokens_per_second": 2119.248 |
| }, |
| { |
| "epoch": 15.158227848101266, |
| "grad_norm": 6.914138793945312e-05, |
| "learning_rate": 0.005056976857743068, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1086224, |
| "step": 2395, |
| "train_runtime": 512.4655, |
| "train_tokens_per_second": 2119.604 |
| }, |
| { |
| "epoch": 15.189873417721518, |
| "grad_norm": 0.006195068359375, |
| "learning_rate": 0.004995097613440688, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1088432, |
| "step": 2400, |
| "train_runtime": 513.4385, |
| "train_tokens_per_second": 2119.888 |
| }, |
| { |
| "epoch": 15.221518987341772, |
| "grad_norm": 0.00311279296875, |
| "learning_rate": 0.004933523574614447, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 1090736, |
| "step": 2405, |
| "train_runtime": 514.427, |
| "train_tokens_per_second": 2120.293 |
| }, |
| { |
| "epoch": 15.253164556962025, |
| "grad_norm": 4.00543212890625e-05, |
| "learning_rate": 0.004872256619616906, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1092912, |
| "step": 2410, |
| "train_runtime": 515.3964, |
| "train_tokens_per_second": 2120.527 |
| }, |
| { |
| "epoch": 15.284810126582279, |
| "grad_norm": 8.726119995117188e-05, |
| "learning_rate": 0.004811298617432824, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1095280, |
| "step": 2415, |
| "train_runtime": 516.39, |
| "train_tokens_per_second": 2121.033 |
| }, |
| { |
| "epoch": 15.316455696202532, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 0.004750651427622173, |
| "loss": 0.001, |
| "num_input_tokens_seen": 1097552, |
| "step": 2420, |
| "train_runtime": 517.3784, |
| "train_tokens_per_second": 2121.372 |
| }, |
| { |
| "epoch": 15.348101265822784, |
| "grad_norm": 0.00017547607421875, |
| "learning_rate": 0.004690316900263435, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1099760, |
| "step": 2425, |
| "train_runtime": 518.351, |
| "train_tokens_per_second": 2121.651 |
| }, |
| { |
| "epoch": 15.379746835443038, |
| "grad_norm": 0.000774383544921875, |
| "learning_rate": 0.0046302968758971065, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1102096, |
| "step": 2430, |
| "train_runtime": 519.3418, |
| "train_tokens_per_second": 2122.102 |
| }, |
| { |
| "epoch": 15.41139240506329, |
| "grad_norm": 0.00051116943359375, |
| "learning_rate": 0.004570593185469605, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 1104336, |
| "step": 2435, |
| "train_runtime": 520.3174, |
| "train_tokens_per_second": 2122.428 |
| }, |
| { |
| "epoch": 15.443037974683545, |
| "grad_norm": 0.0002498626708984375, |
| "learning_rate": 0.004511207650277389, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1106480, |
| "step": 2440, |
| "train_runtime": 521.284, |
| "train_tokens_per_second": 2122.605 |
| }, |
| { |
| "epoch": 15.474683544303797, |
| "grad_norm": 4.410743713378906e-05, |
| "learning_rate": 0.004452142081911388, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1108752, |
| "step": 2445, |
| "train_runtime": 522.2723, |
| "train_tokens_per_second": 2122.939 |
| }, |
| { |
| "epoch": 15.50632911392405, |
| "grad_norm": 0.0029754638671875, |
| "learning_rate": 0.004393398282201788, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 1110960, |
| "step": 2450, |
| "train_runtime": 523.2445, |
| "train_tokens_per_second": 2123.214 |
| }, |
| { |
| "epoch": 15.537974683544304, |
| "grad_norm": 6.29425048828125e-05, |
| "learning_rate": 0.004334978043162998, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1113168, |
| "step": 2455, |
| "train_runtime": 524.222, |
| "train_tokens_per_second": 2123.467 |
| }, |
| { |
| "epoch": 15.569620253164556, |
| "grad_norm": 0.00194549560546875, |
| "learning_rate": 0.004276883146939021, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1115408, |
| "step": 2460, |
| "train_runtime": 525.1968, |
| "train_tokens_per_second": 2123.791 |
| }, |
| { |
| "epoch": 15.60126582278481, |
| "grad_norm": 0.003936767578125, |
| "learning_rate": 0.004219115365749112, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1117648, |
| "step": 2465, |
| "train_runtime": 526.1716, |
| "train_tokens_per_second": 2124.113 |
| }, |
| { |
| "epoch": 15.632911392405063, |
| "grad_norm": 0.00138092041015625, |
| "learning_rate": 0.004161676461833653, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1119984, |
| "step": 2470, |
| "train_runtime": 527.1626, |
| "train_tokens_per_second": 2124.551 |
| }, |
| { |
| "epoch": 15.664556962025316, |
| "grad_norm": 4.7206878662109375e-05, |
| "learning_rate": 0.004104568187400455, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1122256, |
| "step": 2475, |
| "train_runtime": 528.1398, |
| "train_tokens_per_second": 2124.922 |
| }, |
| { |
| "epoch": 15.69620253164557, |
| "grad_norm": 0.00439453125, |
| "learning_rate": 0.004047792284571272, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1124560, |
| "step": 2480, |
| "train_runtime": 529.1252, |
| "train_tokens_per_second": 2125.319 |
| }, |
| { |
| "epoch": 15.727848101265822, |
| "grad_norm": 0.000858306884765625, |
| "learning_rate": 0.0039913504853286525, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1126960, |
| "step": 2485, |
| "train_runtime": 530.1311, |
| "train_tokens_per_second": 2125.814 |
| }, |
| { |
| "epoch": 15.759493670886076, |
| "grad_norm": 0.00023651123046875, |
| "learning_rate": 0.00393524451146315, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1129360, |
| "step": 2490, |
| "train_runtime": 531.1366, |
| "train_tokens_per_second": 2126.308 |
| }, |
| { |
| "epoch": 15.791139240506329, |
| "grad_norm": 5.435943603515625e-05, |
| "learning_rate": 0.0038794760745207314, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1131568, |
| "step": 2495, |
| "train_runtime": 532.1089, |
| "train_tokens_per_second": 2126.572 |
| }, |
| { |
| "epoch": 15.822784810126583, |
| "grad_norm": 0.000202178955078125, |
| "learning_rate": 0.0038240468757506077, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1133872, |
| "step": 2500, |
| "train_runtime": 533.0972, |
| "train_tokens_per_second": 2126.952 |
| }, |
| { |
| "epoch": 15.854430379746836, |
| "grad_norm": 0.00024318695068359375, |
| "learning_rate": 0.0037689586060533522, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1136240, |
| "step": 2505, |
| "train_runtime": 534.084, |
| "train_tokens_per_second": 2127.456 |
| }, |
| { |
| "epoch": 15.886075949367088, |
| "grad_norm": 0.0004425048828125, |
| "learning_rate": 0.003714212945929265, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1138448, |
| "step": 2510, |
| "train_runtime": 535.0569, |
| "train_tokens_per_second": 2127.714 |
| }, |
| { |
| "epoch": 15.917721518987342, |
| "grad_norm": 0.003021240234375, |
| "learning_rate": 0.003659811565427151, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1140752, |
| "step": 2515, |
| "train_runtime": 536.0486, |
| "train_tokens_per_second": 2128.076 |
| }, |
| { |
| "epoch": 15.949367088607595, |
| "grad_norm": 4.887580871582031e-05, |
| "learning_rate": 0.0036057561240933683, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1143056, |
| "step": 2520, |
| "train_runtime": 537.031, |
| "train_tokens_per_second": 2128.473 |
| }, |
| { |
| "epoch": 15.981012658227849, |
| "grad_norm": 0.0002994537353515625, |
| "learning_rate": 0.003552048270921177, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1145392, |
| "step": 2525, |
| "train_runtime": 538.0245, |
| "train_tokens_per_second": 2128.884 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.09226036071777344, |
| "eval_runtime": 1.6756, |
| "eval_samples_per_second": 41.776, |
| "eval_steps_per_second": 10.742, |
| "num_input_tokens_seen": 1146608, |
| "step": 2528 |
| }, |
| { |
| "epoch": 16.0126582278481, |
| "grad_norm": 6.103515625e-05, |
| "learning_rate": 0.0034986896443004695, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1147536, |
| "step": 2530, |
| "train_runtime": 541.5633, |
| "train_tokens_per_second": 2118.932 |
| }, |
| { |
| "epoch": 16.044303797468356, |
| "grad_norm": 0.00018405914306640625, |
| "learning_rate": 0.003445681871967776, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1149776, |
| "step": 2535, |
| "train_runtime": 542.5573, |
| "train_tokens_per_second": 2119.179 |
| }, |
| { |
| "epoch": 16.075949367088608, |
| "grad_norm": 9.489059448242188e-05, |
| "learning_rate": 0.003393026570956594, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1151952, |
| "step": 2540, |
| "train_runtime": 543.5318, |
| "train_tokens_per_second": 2119.383 |
| }, |
| { |
| "epoch": 16.10759493670886, |
| "grad_norm": 0.0028228759765625, |
| "learning_rate": 0.0033407253475480903, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1154192, |
| "step": 2545, |
| "train_runtime": 544.51, |
| "train_tokens_per_second": 2119.689 |
| }, |
| { |
| "epoch": 16.139240506329113, |
| "grad_norm": 0.003692626953125, |
| "learning_rate": 0.0032887797972220756, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1156528, |
| "step": 2550, |
| "train_runtime": 545.5009, |
| "train_tokens_per_second": 2120.121 |
| }, |
| { |
| "epoch": 16.170886075949365, |
| "grad_norm": 8.821487426757812e-05, |
| "learning_rate": 0.003237191504608346, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1158768, |
| "step": 2555, |
| "train_runtime": 546.4749, |
| "train_tokens_per_second": 2120.442 |
| }, |
| { |
| "epoch": 16.20253164556962, |
| "grad_norm": 0.004425048828125, |
| "learning_rate": 0.003185962043438345, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1160912, |
| "step": 2560, |
| "train_runtime": 547.4408, |
| "train_tokens_per_second": 2120.617 |
| }, |
| { |
| "epoch": 16.234177215189874, |
| "grad_norm": 3.504753112792969e-05, |
| "learning_rate": 0.003135092976497134, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1163120, |
| "step": 2565, |
| "train_runtime": 548.4131, |
| "train_tokens_per_second": 2120.883 |
| }, |
| { |
| "epoch": 16.265822784810126, |
| "grad_norm": 3.743171691894531e-05, |
| "learning_rate": 0.003084585855575747, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1165264, |
| "step": 2570, |
| "train_runtime": 549.3818, |
| "train_tokens_per_second": 2121.046 |
| }, |
| { |
| "epoch": 16.29746835443038, |
| "grad_norm": 0.000690460205078125, |
| "learning_rate": 0.0030344422214238454, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1167536, |
| "step": 2575, |
| "train_runtime": 550.3587, |
| "train_tokens_per_second": 2121.409 |
| }, |
| { |
| "epoch": 16.32911392405063, |
| "grad_norm": 0.000873565673828125, |
| "learning_rate": 0.002984663603702693, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1169776, |
| "step": 2580, |
| "train_runtime": 551.3355, |
| "train_tokens_per_second": 2121.714 |
| }, |
| { |
| "epoch": 16.360759493670887, |
| "grad_norm": 0.00133514404296875, |
| "learning_rate": 0.0029352515209385283, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1172176, |
| "step": 2585, |
| "train_runtime": 552.3303, |
| "train_tokens_per_second": 2122.237 |
| }, |
| { |
| "epoch": 16.39240506329114, |
| "grad_norm": 0.0012359619140625, |
| "learning_rate": 0.002886207480476215, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1174384, |
| "step": 2590, |
| "train_runtime": 553.307, |
| "train_tokens_per_second": 2122.482 |
| }, |
| { |
| "epoch": 16.424050632911392, |
| "grad_norm": 0.00176239013671875, |
| "learning_rate": 0.0028375329784332765, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1176752, |
| "step": 2595, |
| "train_runtime": 554.3042, |
| "train_tokens_per_second": 2122.935 |
| }, |
| { |
| "epoch": 16.455696202531644, |
| "grad_norm": 0.000614166259765625, |
| "learning_rate": 0.002789229499654233, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1179024, |
| "step": 2600, |
| "train_runtime": 555.2831, |
| "train_tokens_per_second": 2123.285 |
| }, |
| { |
| "epoch": 16.4873417721519, |
| "grad_norm": 0.0028839111328125, |
| "learning_rate": 0.002741298517665333, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1181328, |
| "step": 2605, |
| "train_runtime": 556.2718, |
| "train_tokens_per_second": 2123.652 |
| }, |
| { |
| "epoch": 16.518987341772153, |
| "grad_norm": 0.002044677734375, |
| "learning_rate": 0.002693741494629585, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1183696, |
| "step": 2610, |
| "train_runtime": 557.2677, |
| "train_tokens_per_second": 2124.107 |
| }, |
| { |
| "epoch": 16.550632911392405, |
| "grad_norm": 9.298324584960938e-05, |
| "learning_rate": 0.002646559881302165, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1186000, |
| "step": 2615, |
| "train_runtime": 558.2479, |
| "train_tokens_per_second": 2124.504 |
| }, |
| { |
| "epoch": 16.582278481012658, |
| "grad_norm": 0.0028228759765625, |
| "learning_rate": 0.0025997551169861365, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1188400, |
| "step": 2620, |
| "train_runtime": 559.2532, |
| "train_tokens_per_second": 2124.977 |
| }, |
| { |
| "epoch": 16.61392405063291, |
| "grad_norm": 0.001708984375, |
| "learning_rate": 0.002553328629488577, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1190640, |
| "step": 2625, |
| "train_runtime": 560.2292, |
| "train_tokens_per_second": 2125.273 |
| }, |
| { |
| "epoch": 16.645569620253166, |
| "grad_norm": 0.001251220703125, |
| "learning_rate": 0.002507281835076998, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1192784, |
| "step": 2630, |
| "train_runtime": 561.1965, |
| "train_tokens_per_second": 2125.43 |
| }, |
| { |
| "epoch": 16.67721518987342, |
| "grad_norm": 0.000553131103515625, |
| "learning_rate": 0.002461616138436155, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1195024, |
| "step": 2635, |
| "train_runtime": 562.1739, |
| "train_tokens_per_second": 2125.719 |
| }, |
| { |
| "epoch": 16.70886075949367, |
| "grad_norm": 0.001373291015625, |
| "learning_rate": 0.0024163329326251774, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1197264, |
| "step": 2640, |
| "train_runtime": 563.1522, |
| "train_tokens_per_second": 2126.004 |
| }, |
| { |
| "epoch": 16.740506329113924, |
| "grad_norm": 0.00604248046875, |
| "learning_rate": 0.002371433599035097, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1199760, |
| "step": 2645, |
| "train_runtime": 564.179, |
| "train_tokens_per_second": 2126.559 |
| }, |
| { |
| "epoch": 16.772151898734176, |
| "grad_norm": 0.0001201629638671875, |
| "learning_rate": 0.0023269195073466957, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1202096, |
| "step": 2650, |
| "train_runtime": 565.1712, |
| "train_tokens_per_second": 2126.959 |
| }, |
| { |
| "epoch": 16.803797468354432, |
| "grad_norm": 0.00213623046875, |
| "learning_rate": 0.0022827920154887132, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1204400, |
| "step": 2655, |
| "train_runtime": 566.1635, |
| "train_tokens_per_second": 2127.301 |
| }, |
| { |
| "epoch": 16.835443037974684, |
| "grad_norm": 0.000514984130859375, |
| "learning_rate": 0.002239052469596439, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1206896, |
| "step": 2660, |
| "train_runtime": 567.1759, |
| "train_tokens_per_second": 2127.904 |
| }, |
| { |
| "epoch": 16.867088607594937, |
| "grad_norm": 0.005645751953125, |
| "learning_rate": 0.0021957022039706454, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1209264, |
| "step": 2665, |
| "train_runtime": 568.1622, |
| "train_tokens_per_second": 2128.378 |
| }, |
| { |
| "epoch": 16.89873417721519, |
| "grad_norm": 4.38690185546875e-05, |
| "learning_rate": 0.002152742541036869, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1211504, |
| "step": 2670, |
| "train_runtime": 569.1386, |
| "train_tokens_per_second": 2128.663 |
| }, |
| { |
| "epoch": 16.930379746835442, |
| "grad_norm": 0.004364013671875, |
| "learning_rate": 0.0021101747913050855, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 1213808, |
| "step": 2675, |
| "train_runtime": 570.1268, |
| "train_tokens_per_second": 2129.014 |
| }, |
| { |
| "epoch": 16.962025316455698, |
| "grad_norm": 0.00066375732421875, |
| "learning_rate": 0.0020680002533297274, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1215952, |
| "step": 2680, |
| "train_runtime": 571.0938, |
| "train_tokens_per_second": 2129.163 |
| }, |
| { |
| "epoch": 16.99367088607595, |
| "grad_norm": 0.003936767578125, |
| "learning_rate": 0.002026220213670069, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1218160, |
| "step": 2685, |
| "train_runtime": 572.077, |
| "train_tokens_per_second": 2129.364 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.09145190566778183, |
| "eval_runtime": 1.6789, |
| "eval_samples_per_second": 41.694, |
| "eval_steps_per_second": 10.721, |
| "num_input_tokens_seen": 1218368, |
| "step": 2686 |
| }, |
| { |
| "epoch": 17.025316455696203, |
| "grad_norm": 0.0010223388671875, |
| "learning_rate": 0.0019848359468509825, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1220256, |
| "step": 2690, |
| "train_runtime": 575.6869, |
| "train_tokens_per_second": 2119.652 |
| }, |
| { |
| "epoch": 17.056962025316455, |
| "grad_norm": 0.00274658203125, |
| "learning_rate": 0.0019438487153240424, |
| "loss": 0.001, |
| "num_input_tokens_seen": 1222528, |
| "step": 2695, |
| "train_runtime": 576.6737, |
| "train_tokens_per_second": 2119.965 |
| }, |
| { |
| "epoch": 17.088607594936708, |
| "grad_norm": 0.0019683837890625, |
| "learning_rate": 0.0019032597694290392, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1224768, |
| "step": 2700, |
| "train_runtime": 577.6518, |
| "train_tokens_per_second": 2120.253 |
| }, |
| { |
| "epoch": 17.120253164556964, |
| "grad_norm": 4.7206878662109375e-05, |
| "learning_rate": 0.0018630703473558234, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1227008, |
| "step": 2705, |
| "train_runtime": 578.6261, |
| "train_tokens_per_second": 2120.554 |
| }, |
| { |
| "epoch": 17.151898734177216, |
| "grad_norm": 0.003631591796875, |
| "learning_rate": 0.0018232816751065249, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1229344, |
| "step": 2710, |
| "train_runtime": 579.6303, |
| "train_tokens_per_second": 2120.911 |
| }, |
| { |
| "epoch": 17.18354430379747, |
| "grad_norm": 0.0023651123046875, |
| "learning_rate": 0.0017838949664581742, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1231712, |
| "step": 2715, |
| "train_runtime": 580.6264, |
| "train_tokens_per_second": 2121.35 |
| }, |
| { |
| "epoch": 17.21518987341772, |
| "grad_norm": 0.00032806396484375, |
| "learning_rate": 0.0017449114229256607, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1233984, |
| "step": 2720, |
| "train_runtime": 581.604, |
| "train_tokens_per_second": 2121.691 |
| }, |
| { |
| "epoch": 17.246835443037973, |
| "grad_norm": 0.000274658203125, |
| "learning_rate": 0.0017063322337250713, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1236224, |
| "step": 2725, |
| "train_runtime": 582.5946, |
| "train_tokens_per_second": 2121.928 |
| }, |
| { |
| "epoch": 17.27848101265823, |
| "grad_norm": 0.0009307861328125, |
| "learning_rate": 0.0016681585757374472, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1238528, |
| "step": 2730, |
| "train_runtime": 583.5946, |
| "train_tokens_per_second": 2122.24 |
| }, |
| { |
| "epoch": 17.310126582278482, |
| "grad_norm": 4.887580871582031e-05, |
| "learning_rate": 0.001630391613472837, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1240832, |
| "step": 2735, |
| "train_runtime": 584.5843, |
| "train_tokens_per_second": 2122.589 |
| }, |
| { |
| "epoch": 17.341772151898734, |
| "grad_norm": 0.00014400482177734375, |
| "learning_rate": 0.001593032499034811, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1243104, |
| "step": 2740, |
| "train_runtime": 585.562, |
| "train_tokens_per_second": 2122.925 |
| }, |
| { |
| "epoch": 17.373417721518987, |
| "grad_norm": 4.3392181396484375e-05, |
| "learning_rate": 0.0015560823720852928, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1245376, |
| "step": 2745, |
| "train_runtime": 586.5413, |
| "train_tokens_per_second": 2123.254 |
| }, |
| { |
| "epoch": 17.40506329113924, |
| "grad_norm": 4.410743713378906e-05, |
| "learning_rate": 0.0015195423598097972, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1247648, |
| "step": 2750, |
| "train_runtime": 587.5225, |
| "train_tokens_per_second": 2123.575 |
| }, |
| { |
| "epoch": 17.436708860759495, |
| "grad_norm": 4.482269287109375e-05, |
| "learning_rate": 0.001483413576883057, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1250048, |
| "step": 2755, |
| "train_runtime": 588.521, |
| "train_tokens_per_second": 2124.05 |
| }, |
| { |
| "epoch": 17.468354430379748, |
| "grad_norm": 4.839897155761719e-05, |
| "learning_rate": 0.001447697125435004, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1252352, |
| "step": 2760, |
| "train_runtime": 589.5024, |
| "train_tokens_per_second": 2124.422 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 4.649162292480469e-05, |
| "learning_rate": 0.0014123940950171508, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1254624, |
| "step": 2765, |
| "train_runtime": 590.481, |
| "train_tokens_per_second": 2124.749 |
| }, |
| { |
| "epoch": 17.531645569620252, |
| "grad_norm": 0.0042724609375, |
| "learning_rate": 0.0013775055625693683, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1256864, |
| "step": 2770, |
| "train_runtime": 591.4594, |
| "train_tokens_per_second": 2125.022 |
| }, |
| { |
| "epoch": 17.563291139240505, |
| "grad_norm": 0.005096435546875, |
| "learning_rate": 0.0013430325923870095, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1259072, |
| "step": 2775, |
| "train_runtime": 592.4327, |
| "train_tokens_per_second": 2125.257 |
| }, |
| { |
| "epoch": 17.59493670886076, |
| "grad_norm": 0.0015411376953125, |
| "learning_rate": 0.0013089762360884538, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1261376, |
| "step": 2780, |
| "train_runtime": 593.423, |
| "train_tokens_per_second": 2125.593 |
| }, |
| { |
| "epoch": 17.626582278481013, |
| "grad_norm": 0.00115203857421875, |
| "learning_rate": 0.0012753375325830413, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1263488, |
| "step": 2785, |
| "train_runtime": 594.3873, |
| "train_tokens_per_second": 2125.698 |
| }, |
| { |
| "epoch": 17.658227848101266, |
| "grad_norm": 0.00010967254638671875, |
| "learning_rate": 0.001242117508039347, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1265824, |
| "step": 2790, |
| "train_runtime": 595.3802, |
| "train_tokens_per_second": 2126.077 |
| }, |
| { |
| "epoch": 17.689873417721518, |
| "grad_norm": 0.00023651123046875, |
| "learning_rate": 0.0012093171758539112, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1268032, |
| "step": 2795, |
| "train_runtime": 596.3537, |
| "train_tokens_per_second": 2126.309 |
| }, |
| { |
| "epoch": 17.72151898734177, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 0.0011769375366203066, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1270272, |
| "step": 2800, |
| "train_runtime": 597.3299, |
| "train_tokens_per_second": 2126.584 |
| }, |
| { |
| "epoch": 17.753164556962027, |
| "grad_norm": 0.00189208984375, |
| "learning_rate": 0.0011449795780986071, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1272544, |
| "step": 2805, |
| "train_runtime": 598.3089, |
| "train_tokens_per_second": 2126.901 |
| }, |
| { |
| "epoch": 17.78481012658228, |
| "grad_norm": 0.00019359588623046875, |
| "learning_rate": 0.0011134442751852846, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1274880, |
| "step": 2810, |
| "train_runtime": 599.2919, |
| "train_tokens_per_second": 2127.311 |
| }, |
| { |
| "epoch": 17.81645569620253, |
| "grad_norm": 0.0004482269287109375, |
| "learning_rate": 0.0010823325898834395, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1277120, |
| "step": 2815, |
| "train_runtime": 600.2682, |
| "train_tokens_per_second": 2127.582 |
| }, |
| { |
| "epoch": 17.848101265822784, |
| "grad_norm": 0.003387451171875, |
| "learning_rate": 0.0010516454712734629, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1279424, |
| "step": 2820, |
| "train_runtime": 601.2667, |
| "train_tokens_per_second": 2127.881 |
| }, |
| { |
| "epoch": 17.879746835443036, |
| "grad_norm": 0.00148773193359375, |
| "learning_rate": 0.0010213838554841027, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1281728, |
| "step": 2825, |
| "train_runtime": 602.256, |
| "train_tokens_per_second": 2128.211 |
| }, |
| { |
| "epoch": 17.911392405063292, |
| "grad_norm": 5.698204040527344e-05, |
| "learning_rate": 0.0009915486656638728, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1283968, |
| "step": 2830, |
| "train_runtime": 603.2398, |
| "train_tokens_per_second": 2128.454 |
| }, |
| { |
| "epoch": 17.943037974683545, |
| "grad_norm": 0.00543212890625, |
| "learning_rate": 0.0009621408119529234, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1286272, |
| "step": 2835, |
| "train_runtime": 604.2235, |
| "train_tokens_per_second": 2128.802 |
| }, |
| { |
| "epoch": 17.974683544303797, |
| "grad_norm": 0.000476837158203125, |
| "learning_rate": 0.0009331611914552607, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1288544, |
| "step": 2840, |
| "train_runtime": 605.202, |
| "train_tokens_per_second": 2129.114 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.0919828787446022, |
| "eval_runtime": 1.6803, |
| "eval_samples_per_second": 41.66, |
| "eval_steps_per_second": 10.713, |
| "num_input_tokens_seen": 1290144, |
| "step": 2844 |
| }, |
| { |
| "epoch": 18.00632911392405, |
| "grad_norm": 0.0001678466796875, |
| "learning_rate": 0.0009046106882113752, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1290624, |
| "step": 2845, |
| "train_runtime": 608.7518, |
| "train_tokens_per_second": 2120.115 |
| }, |
| { |
| "epoch": 18.037974683544302, |
| "grad_norm": 0.0013885498046875, |
| "learning_rate": 0.000876490173171291, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1292992, |
| "step": 2850, |
| "train_runtime": 609.7974, |
| "train_tokens_per_second": 2120.363 |
| }, |
| { |
| "epoch": 18.069620253164558, |
| "grad_norm": 0.003570556640625, |
| "learning_rate": 0.0008488005041679841, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1295328, |
| "step": 2855, |
| "train_runtime": 610.7944, |
| "train_tokens_per_second": 2120.727 |
| }, |
| { |
| "epoch": 18.10126582278481, |
| "grad_norm": 0.00093841552734375, |
| "learning_rate": 0.0008215425258912096, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1297568, |
| "step": 2860, |
| "train_runtime": 611.7703, |
| "train_tokens_per_second": 2121.005 |
| }, |
| { |
| "epoch": 18.132911392405063, |
| "grad_norm": 6.961822509765625e-05, |
| "learning_rate": 0.0007947170698617595, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1299840, |
| "step": 2865, |
| "train_runtime": 612.7479, |
| "train_tokens_per_second": 2121.329 |
| }, |
| { |
| "epoch": 18.164556962025316, |
| "grad_norm": 5.650520324707031e-05, |
| "learning_rate": 0.0007683249544060571, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1302080, |
| "step": 2870, |
| "train_runtime": 613.7222, |
| "train_tokens_per_second": 2121.611 |
| }, |
| { |
| "epoch": 18.196202531645568, |
| "grad_norm": 0.00182342529296875, |
| "learning_rate": 0.000742366984631227, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1304288, |
| "step": 2875, |
| "train_runtime": 614.6948, |
| "train_tokens_per_second": 2121.846 |
| }, |
| { |
| "epoch": 18.227848101265824, |
| "grad_norm": 0.000766754150390625, |
| "learning_rate": 0.000716843952400522, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1306784, |
| "step": 2880, |
| "train_runtime": 615.7196, |
| "train_tokens_per_second": 2122.369 |
| }, |
| { |
| "epoch": 18.259493670886076, |
| "grad_norm": 0.004425048828125, |
| "learning_rate": 0.0006917566363091609, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1309120, |
| "step": 2885, |
| "train_runtime": 616.7115, |
| "train_tokens_per_second": 2122.743 |
| }, |
| { |
| "epoch": 18.29113924050633, |
| "grad_norm": 3.170967102050781e-05, |
| "learning_rate": 0.000667105801660589, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1311296, |
| "step": 2890, |
| "train_runtime": 617.6819, |
| "train_tokens_per_second": 2122.931 |
| }, |
| { |
| "epoch": 18.32278481012658, |
| "grad_norm": 0.0004444122314453125, |
| "learning_rate": 0.0006428922004431298, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1313568, |
| "step": 2895, |
| "train_runtime": 618.6625, |
| "train_tokens_per_second": 2123.238 |
| }, |
| { |
| "epoch": 18.354430379746834, |
| "grad_norm": 0.0010528564453125, |
| "learning_rate": 0.000619116571307029, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1315776, |
| "step": 2900, |
| "train_runtime": 619.6407, |
| "train_tokens_per_second": 2123.45 |
| }, |
| { |
| "epoch": 18.38607594936709, |
| "grad_norm": 0.005950927734375, |
| "learning_rate": 0.0005957796395419484, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1318080, |
| "step": 2905, |
| "train_runtime": 620.6294, |
| "train_tokens_per_second": 2123.779 |
| }, |
| { |
| "epoch": 18.417721518987342, |
| "grad_norm": 0.00469970703125, |
| "learning_rate": 0.0005728821170548199, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1320320, |
| "step": 2910, |
| "train_runtime": 621.6135, |
| "train_tokens_per_second": 2124.021 |
| }, |
| { |
| "epoch": 18.449367088607595, |
| "grad_norm": 5.14984130859375e-05, |
| "learning_rate": 0.0005504247023481373, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1322528, |
| "step": 2915, |
| "train_runtime": 622.5944, |
| "train_tokens_per_second": 2124.221 |
| }, |
| { |
| "epoch": 18.481012658227847, |
| "grad_norm": 0.0037994384765625, |
| "learning_rate": 0.0005284080804986412, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 1324928, |
| "step": 2920, |
| "train_runtime": 623.5899, |
| "train_tokens_per_second": 2124.678 |
| }, |
| { |
| "epoch": 18.5126582278481, |
| "grad_norm": 0.00122833251953125, |
| "learning_rate": 0.0005068329231364282, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1327136, |
| "step": 2925, |
| "train_runtime": 624.5635, |
| "train_tokens_per_second": 2124.902 |
| }, |
| { |
| "epoch": 18.544303797468356, |
| "grad_norm": 0.000865936279296875, |
| "learning_rate": 0.00048569988842446065, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1329408, |
| "step": 2930, |
| "train_runtime": 625.5408, |
| "train_tokens_per_second": 2125.214 |
| }, |
| { |
| "epoch": 18.575949367088608, |
| "grad_norm": 0.002044677734375, |
| "learning_rate": 0.00046500962103848795, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1331680, |
| "step": 2935, |
| "train_runtime": 626.5185, |
| "train_tokens_per_second": 2125.524 |
| }, |
| { |
| "epoch": 18.60759493670886, |
| "grad_norm": 8.249282836914062e-05, |
| "learning_rate": 0.00044476275214737235, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1333856, |
| "step": 2940, |
| "train_runtime": 627.4887, |
| "train_tokens_per_second": 2125.705 |
| }, |
| { |
| "epoch": 18.639240506329113, |
| "grad_norm": 0.0025177001953125, |
| "learning_rate": 0.00042495989939384915, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1336160, |
| "step": 2945, |
| "train_runtime": 628.4775, |
| "train_tokens_per_second": 2126.027 |
| }, |
| { |
| "epoch": 18.67088607594937, |
| "grad_norm": 0.003509521484375, |
| "learning_rate": 0.0004056016668756801, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1338496, |
| "step": 2950, |
| "train_runtime": 629.4608, |
| "train_tokens_per_second": 2126.417 |
| }, |
| { |
| "epoch": 18.70253164556962, |
| "grad_norm": 4.7206878662109375e-05, |
| "learning_rate": 0.00038668864512721667, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1340736, |
| "step": 2955, |
| "train_runtime": 630.4347, |
| "train_tokens_per_second": 2126.685 |
| }, |
| { |
| "epoch": 18.734177215189874, |
| "grad_norm": 0.000598907470703125, |
| "learning_rate": 0.00036822141110139594, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1342976, |
| "step": 2960, |
| "train_runtime": 631.4181, |
| "train_tokens_per_second": 2126.92 |
| }, |
| { |
| "epoch": 18.765822784810126, |
| "grad_norm": 0.0005340576171875, |
| "learning_rate": 0.00035020052815213477, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1345312, |
| "step": 2965, |
| "train_runtime": 632.4043, |
| "train_tokens_per_second": 2127.297 |
| }, |
| { |
| "epoch": 18.79746835443038, |
| "grad_norm": 6.437301635742188e-05, |
| "learning_rate": 0.0003326265460171468, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1347552, |
| "step": 2970, |
| "train_runtime": 633.3905, |
| "train_tokens_per_second": 2127.522 |
| }, |
| { |
| "epoch": 18.82911392405063, |
| "grad_norm": 0.0013427734375, |
| "learning_rate": 0.0003155000008011727, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1349792, |
| "step": 2975, |
| "train_runtime": 634.3732, |
| "train_tokens_per_second": 2127.757 |
| }, |
| { |
| "epoch": 18.860759493670887, |
| "grad_norm": 0.004302978515625, |
| "learning_rate": 0.0002988214149596197, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1352096, |
| "step": 2980, |
| "train_runtime": 635.3617, |
| "train_tokens_per_second": 2128.073 |
| }, |
| { |
| "epoch": 18.89240506329114, |
| "grad_norm": 0.00010442733764648438, |
| "learning_rate": 0.00028259129728263607, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 1354400, |
| "step": 2985, |
| "train_runtime": 636.3409, |
| "train_tokens_per_second": 2128.419 |
| }, |
| { |
| "epoch": 18.924050632911392, |
| "grad_norm": 0.00167083740234375, |
| "learning_rate": 0.0002668101428795788, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1356672, |
| "step": 2990, |
| "train_runtime": 637.3271, |
| "train_tokens_per_second": 2128.69 |
| }, |
| { |
| "epoch": 18.955696202531644, |
| "grad_norm": 0.0003376007080078125, |
| "learning_rate": 0.00025147843316391524, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1358944, |
| "step": 2995, |
| "train_runtime": 638.3058, |
| "train_tokens_per_second": 2128.986 |
| }, |
| { |
| "epoch": 18.9873417721519, |
| "grad_norm": 8.296966552734375e-05, |
| "learning_rate": 0.0002365966358385335, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1361312, |
| "step": 3000, |
| "train_runtime": 639.3002, |
| "train_tokens_per_second": 2129.378 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.0921000987291336, |
| "eval_runtime": 1.6825, |
| "eval_samples_per_second": 41.604, |
| "eval_steps_per_second": 10.698, |
| "num_input_tokens_seen": 1361984, |
| "step": 3002 |
| }, |
| { |
| "epoch": 19.018987341772153, |
| "grad_norm": 0.0005035400390625, |
| "learning_rate": 0.00022216520488148206, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1363328, |
| "step": 3005, |
| "train_runtime": 642.8252, |
| "train_tokens_per_second": 2120.838 |
| }, |
| { |
| "epoch": 19.050632911392405, |
| "grad_norm": 0.004241943359375, |
| "learning_rate": 0.00020818458053211252, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1365600, |
| "step": 3010, |
| "train_runtime": 643.8422, |
| "train_tokens_per_second": 2121.017 |
| }, |
| { |
| "epoch": 19.082278481012658, |
| "grad_norm": 0.0032196044921875, |
| "learning_rate": 0.00019465518927765712, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1368032, |
| "step": 3015, |
| "train_runtime": 644.8884, |
| "train_tokens_per_second": 2121.347 |
| }, |
| { |
| "epoch": 19.11392405063291, |
| "grad_norm": 0.0007476806640625, |
| "learning_rate": 0.00018157744384021234, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1370272, |
| "step": 3020, |
| "train_runtime": 645.8738, |
| "train_tokens_per_second": 2121.579 |
| }, |
| { |
| "epoch": 19.145569620253166, |
| "grad_norm": 0.005950927734375, |
| "learning_rate": 0.00016895174316415405, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1372480, |
| "step": 3025, |
| "train_runtime": 646.8474, |
| "train_tokens_per_second": 2121.799 |
| }, |
| { |
| "epoch": 19.17721518987342, |
| "grad_norm": 5.14984130859375e-05, |
| "learning_rate": 0.0001567784724039589, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1374688, |
| "step": 3030, |
| "train_runtime": 647.8217, |
| "train_tokens_per_second": 2122.016 |
| }, |
| { |
| "epoch": 19.20886075949367, |
| "grad_norm": 0.0020294189453125, |
| "learning_rate": 0.00014505800291247207, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1376960, |
| "step": 3035, |
| "train_runtime": 648.7999, |
| "train_tokens_per_second": 2122.319 |
| }, |
| { |
| "epoch": 19.240506329113924, |
| "grad_norm": 0.000232696533203125, |
| "learning_rate": 0.00013379069222955618, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 1379232, |
| "step": 3040, |
| "train_runtime": 649.7776, |
| "train_tokens_per_second": 2122.622 |
| }, |
| { |
| "epoch": 19.272151898734176, |
| "grad_norm": 0.01190185546875, |
| "learning_rate": 0.00012297688407120032, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 1381632, |
| "step": 3045, |
| "train_runtime": 650.7744, |
| "train_tokens_per_second": 2123.058 |
| }, |
| { |
| "epoch": 19.303797468354432, |
| "grad_norm": 0.000354766845703125, |
| "learning_rate": 0.00011261690831903481, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1383808, |
| "step": 3050, |
| "train_runtime": 651.7447, |
| "train_tokens_per_second": 2123.236 |
| }, |
| { |
| "epoch": 19.335443037974684, |
| "grad_norm": 0.00099945068359375, |
| "learning_rate": 0.00010271108101025439, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1386080, |
| "step": 3055, |
| "train_runtime": 652.7319, |
| "train_tokens_per_second": 2123.506 |
| }, |
| { |
| "epoch": 19.367088607594937, |
| "grad_norm": 4.172325134277344e-05, |
| "learning_rate": 9.325970432799424e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1388288, |
| "step": 3060, |
| "train_runtime": 653.705, |
| "train_tokens_per_second": 2123.722 |
| }, |
| { |
| "epoch": 19.39873417721519, |
| "grad_norm": 0.006439208984375, |
| "learning_rate": 8.426306659209903e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 1390560, |
| "step": 3065, |
| "train_runtime": 654.6916, |
| "train_tokens_per_second": 2123.993 |
| }, |
| { |
| "epoch": 19.430379746835442, |
| "grad_norm": 0.003448486328125, |
| "learning_rate": 7.572144225033495e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1392864, |
| "step": 3070, |
| "train_runtime": 655.6816, |
| "train_tokens_per_second": 2124.299 |
| }, |
| { |
| "epoch": 19.462025316455698, |
| "grad_norm": 0.0002994537353515625, |
| "learning_rate": 6.76350918700147e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1395040, |
| "step": 3075, |
| "train_runtime": 656.6509, |
| "train_tokens_per_second": 2124.477 |
| }, |
| { |
| "epoch": 19.49367088607595, |
| "grad_norm": 0.0013580322265625, |
| "learning_rate": 6.0004262130048946e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1397344, |
| "step": 3080, |
| "train_runtime": 657.6314, |
| "train_tokens_per_second": 2124.813 |
| }, |
| { |
| "epoch": 19.525316455696203, |
| "grad_norm": 3.743171691894531e-05, |
| "learning_rate": 5.282918581341889e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1399520, |
| "step": 3085, |
| "train_runtime": 658.6027, |
| "train_tokens_per_second": 2124.984 |
| }, |
| { |
| "epoch": 19.556962025316455, |
| "grad_norm": 0.00010538101196289062, |
| "learning_rate": 4.6110081800082025e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1401728, |
| "step": 3090, |
| "train_runtime": 659.5798, |
| "train_tokens_per_second": 2125.183 |
| }, |
| { |
| "epoch": 19.588607594936708, |
| "grad_norm": 0.0032501220703125, |
| "learning_rate": 3.98471550602858e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1404032, |
| "step": 3095, |
| "train_runtime": 660.5686, |
| "train_tokens_per_second": 2125.49 |
| }, |
| { |
| "epoch": 19.620253164556964, |
| "grad_norm": 9.441375732421875e-05, |
| "learning_rate": 3.404059664832259e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1406240, |
| "step": 3100, |
| "train_runtime": 661.5418, |
| "train_tokens_per_second": 2125.701 |
| }, |
| { |
| "epoch": 19.651898734177216, |
| "grad_norm": 0.007568359375, |
| "learning_rate": 2.869058369669941e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1408640, |
| "step": 3105, |
| "train_runtime": 662.548, |
| "train_tokens_per_second": 2126.095 |
| }, |
| { |
| "epoch": 19.68354430379747, |
| "grad_norm": 0.0057373046875, |
| "learning_rate": 2.3797279410728844e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1410944, |
| "step": 3110, |
| "train_runtime": 663.5279, |
| "train_tokens_per_second": 2126.427 |
| }, |
| { |
| "epoch": 19.71518987341772, |
| "grad_norm": 0.0002593994140625, |
| "learning_rate": 1.9360833063559732e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1413056, |
| "step": 3115, |
| "train_runtime": 664.4925, |
| "train_tokens_per_second": 2126.519 |
| }, |
| { |
| "epoch": 19.746835443037973, |
| "grad_norm": 0.00299072265625, |
| "learning_rate": 1.5381379991615817e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1415360, |
| "step": 3120, |
| "train_runtime": 665.4727, |
| "train_tokens_per_second": 2126.849 |
| }, |
| { |
| "epoch": 19.77848101265823, |
| "grad_norm": 0.000598907470703125, |
| "learning_rate": 1.1859041590472351e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1417600, |
| "step": 3125, |
| "train_runtime": 666.4479, |
| "train_tokens_per_second": 2127.098 |
| }, |
| { |
| "epoch": 19.810126582278482, |
| "grad_norm": 0.00103759765625, |
| "learning_rate": 8.793925311149087e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 1419904, |
| "step": 3130, |
| "train_runtime": 667.4289, |
| "train_tokens_per_second": 2127.424 |
| }, |
| { |
| "epoch": 19.841772151898734, |
| "grad_norm": 6.866455078125e-05, |
| "learning_rate": 6.18612465683288e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1422176, |
| "step": 3135, |
| "train_runtime": 668.4162, |
| "train_tokens_per_second": 2127.68 |
| }, |
| { |
| "epoch": 19.873417721518987, |
| "grad_norm": 0.00011920928955078125, |
| "learning_rate": 4.035719180031649e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1424608, |
| "step": 3140, |
| "train_runtime": 669.4247, |
| "train_tokens_per_second": 2128.108 |
| }, |
| { |
| "epoch": 19.90506329113924, |
| "grad_norm": 0.0012969970703125, |
| "learning_rate": 2.3427744801363113e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1426912, |
| "step": 3145, |
| "train_runtime": 670.4043, |
| "train_tokens_per_second": 2128.435 |
| }, |
| { |
| "epoch": 19.936708860759495, |
| "grad_norm": 0.000553131103515625, |
| "learning_rate": 1.107342201427386e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1429248, |
| "step": 3150, |
| "train_runtime": 671.3956, |
| "train_tokens_per_second": 2128.772 |
| }, |
| { |
| "epoch": 19.968354430379748, |
| "grad_norm": 3.0517578125e-05, |
| "learning_rate": 3.294600315012497e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 1431552, |
| "step": 3155, |
| "train_runtime": 672.3857, |
| "train_tokens_per_second": 2129.064 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.00592041015625, |
| "learning_rate": 9.151700112730588e-09, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 1433520, |
| "step": 3160, |
| "train_runtime": 673.523, |
| "train_tokens_per_second": 2128.391 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.09257736802101135, |
| "eval_runtime": 1.6798, |
| "eval_samples_per_second": 41.672, |
| "eval_steps_per_second": 10.716, |
| "num_input_tokens_seen": 1433520, |
| "step": 3160 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 1433520, |
| "step": 3160, |
| "total_flos": 6.455075769483264e+16, |
| "train_loss": 0.07119393949640976, |
| "train_runtime": 676.0485, |
| "train_samples_per_second": 18.638, |
| "train_steps_per_second": 4.674 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 3160, |
| "num_input_tokens_seen": 1433520, |
| "num_train_epochs": 20, |
| "save_steps": 158, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.455075769483264e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|