{
  "best_global_step": 684,
  "best_metric": 0.08342564105987549,
  "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_cb_123_1760637638/checkpoint-684",
  "epoch": 20.0,
  "eval_steps": 57,
  "global_step": 1140,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 16.375,
      "learning_rate": 0.0010526315789473684,
      "loss": 0.69,
      "num_input_tokens_seen": 3552,
      "step": 5
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 516.0,
      "learning_rate": 0.0023684210526315787,
      "loss": 3.5875,
      "num_input_tokens_seen": 7264,
      "step": 10
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 107.0,
      "learning_rate": 0.003684210526315789,
      "loss": 7.0102,
      "num_input_tokens_seen": 10528,
      "step": 15
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 60.0,
      "learning_rate": 0.004999999999999999,
      "loss": 1.3562,
      "num_input_tokens_seen": 14784,
      "step": 20
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 29.25,
      "learning_rate": 0.0063157894736842095,
      "loss": 1.617,
      "num_input_tokens_seen": 18112,
      "step": 25
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 87.5,
      "learning_rate": 0.0076315789473684215,
      "loss": 1.8857,
      "num_input_tokens_seen": 20736,
      "step": 30
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 79.5,
      "learning_rate": 0.008947368421052631,
      "loss": 2.0246,
      "num_input_tokens_seen": 24896,
      "step": 35
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 5.53125,
      "learning_rate": 0.010263157894736842,
      "loss": 0.9179,
      "num_input_tokens_seen": 28160,
      "step": 40
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 19.75,
      "learning_rate": 0.011578947368421052,
      "loss": 2.7834,
      "num_input_tokens_seen": 31040,
      "step": 45
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 3.8125,
      "learning_rate": 0.012894736842105263,
      "loss": 0.6199,
      "num_input_tokens_seen": 33760,
      "step": 50
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 2.71875,
      "learning_rate": 0.014210526315789472,
      "loss": 0.7941,
      "num_input_tokens_seen": 36416,
      "step": 55
    },
    {
      "epoch": 1.0,
      "eval_loss": 5.37905216217041,
      "eval_runtime": 0.7379,
      "eval_samples_per_second": 33.881,
      "eval_steps_per_second": 9.487,
      "num_input_tokens_seen": 37160,
      "step": 57
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 2.6875,
      "learning_rate": 0.015526315789473685,
      "loss": 2.1448,
      "num_input_tokens_seen": 39176,
      "step": 60
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 8.5625,
      "learning_rate": 0.016842105263157894,
      "loss": 0.7608,
      "num_input_tokens_seen": 42632,
      "step": 65
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.56640625,
      "learning_rate": 0.018157894736842106,
      "loss": 1.3338,
      "num_input_tokens_seen": 45704,
      "step": 70
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.1943359375,
      "learning_rate": 0.019473684210526317,
      "loss": 0.3023,
      "num_input_tokens_seen": 49448,
      "step": 75
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.484375,
      "learning_rate": 0.020789473684210528,
      "loss": 0.4547,
      "num_input_tokens_seen": 52456,
      "step": 80
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.022105263157894735,
      "loss": 0.2902,
      "num_input_tokens_seen": 56488,
      "step": 85
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 7.75,
      "learning_rate": 0.023421052631578947,
      "loss": 0.772,
      "num_input_tokens_seen": 59208,
      "step": 90
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 2.734375,
      "learning_rate": 0.024736842105263158,
      "loss": 0.332,
      "num_input_tokens_seen": 62696,
      "step": 95
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.42578125,
      "learning_rate": 0.02605263157894737,
      "loss": 0.509,
      "num_input_tokens_seen": 66024,
      "step": 100
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.2578125,
      "learning_rate": 0.027368421052631577,
      "loss": 0.396,
      "num_input_tokens_seen": 69000,
      "step": 105
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.375,
      "learning_rate": 0.028684210526315788,
      "loss": 0.2777,
      "num_input_tokens_seen": 72072,
      "step": 110
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.45154136419296265,
      "eval_runtime": 0.7386,
      "eval_samples_per_second": 33.846,
      "eval_steps_per_second": 9.477,
      "num_input_tokens_seen": 73720,
      "step": 114
    },
    {
      "epoch": 2.017543859649123,
      "grad_norm": 0.11767578125,
      "learning_rate": 0.03,
      "loss": 0.2162,
      "num_input_tokens_seen": 74264,
      "step": 115
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 4.6875,
      "learning_rate": 0.02999824208523885,
      "loss": 0.3999,
      "num_input_tokens_seen": 77528,
      "step": 120
    },
    {
      "epoch": 2.192982456140351,
      "grad_norm": 0.2421875,
      "learning_rate": 0.029992968752990647,
      "loss": 0.2404,
      "num_input_tokens_seen": 80600,
      "step": 125
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.02998418123926453,
      "loss": 0.2384,
      "num_input_tokens_seen": 83800,
      "step": 130
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.2158203125,
      "learning_rate": 0.029971881603753848,
      "loss": 0.3961,
      "num_input_tokens_seen": 86552,
      "step": 135
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.02995607272935338,
      "loss": 0.3283,
      "num_input_tokens_seen": 90008,
      "step": 140
    },
    {
      "epoch": 2.543859649122807,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.02993675832148361,
      "loss": 0.3304,
      "num_input_tokens_seen": 93496,
      "step": 145
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.154296875,
      "learning_rate": 0.029913942907222237,
      "loss": 0.2329,
      "num_input_tokens_seen": 96888,
      "step": 150
    },
    {
      "epoch": 2.719298245614035,
      "grad_norm": 0.158203125,
      "learning_rate": 0.029887631834243058,
      "loss": 0.1809,
      "num_input_tokens_seen": 100728,
      "step": 155
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.029857831269562547,
      "loss": 0.3177,
      "num_input_tokens_seen": 103960,
      "step": 160
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.076171875,
      "learning_rate": 0.029824548198094384,
      "loss": 0.2628,
      "num_input_tokens_seen": 106648,
      "step": 165
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.029787790421012244,
      "loss": 0.1865,
      "num_input_tokens_seen": 110136,
      "step": 170
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.17734654247760773,
      "eval_runtime": 0.7444,
      "eval_samples_per_second": 33.584,
      "eval_steps_per_second": 9.403,
      "num_input_tokens_seen": 110296,
      "step": 171
    },
    {
      "epoch": 3.0701754385964914,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.029747566553921325,
      "loss": 0.1585,
      "num_input_tokens_seen": 112984,
      "step": 175
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.029703886024838914,
      "loss": 0.21,
      "num_input_tokens_seen": 116120,
      "step": 180
    },
    {
      "epoch": 3.245614035087719,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.0296567590719846,
      "loss": 0.1993,
      "num_input_tokens_seen": 119576,
      "step": 185
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.029606196741380517,
      "loss": 0.1112,
      "num_input_tokens_seen": 122968,
      "step": 190
    },
    {
      "epoch": 3.4210526315789473,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.029552210884262308,
      "loss": 0.1778,
      "num_input_tokens_seen": 126104,
      "step": 195
    },
    {
      "epoch": 3.5087719298245617,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.029494814154301326,
      "loss": 0.2974,
      "num_input_tokens_seen": 129368,
      "step": 200
    },
    {
      "epoch": 3.5964912280701755,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.029434020004638753,
      "loss": 0.1908,
      "num_input_tokens_seen": 132888,
      "step": 205
    },
    {
      "epoch": 3.6842105263157894,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.029369842684732334,
      "loss": 0.216,
      "num_input_tokens_seen": 135832,
      "step": 210
    },
    {
      "epoch": 3.7719298245614032,
      "grad_norm": 0.1005859375,
      "learning_rate": 0.02930229723701646,
      "loss": 0.3242,
      "num_input_tokens_seen": 138936,
      "step": 215
    },
    {
      "epoch": 3.8596491228070176,
      "grad_norm": 0.138671875,
      "learning_rate": 0.029231399493376414,
      "loss": 0.2246,
      "num_input_tokens_seen": 142488,
      "step": 220
    },
    {
      "epoch": 3.9473684210526314,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.02915716607143754,
      "loss": 0.2287,
      "num_input_tokens_seen": 146808,
      "step": 225
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.22634856402873993,
      "eval_runtime": 0.7401,
      "eval_samples_per_second": 33.778,
      "eval_steps_per_second": 9.458,
      "num_input_tokens_seen": 147784,
      "step": 228
    },
    {
      "epoch": 4.035087719298246,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.029079614370670265,
      "loss": 0.2226,
      "num_input_tokens_seen": 149448,
      "step": 230
    },
    {
      "epoch": 4.12280701754386,
      "grad_norm": 0.087890625,
      "learning_rate": 0.028998762568311857,
      "loss": 0.1728,
      "num_input_tokens_seen": 152488,
      "step": 235
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.028914629615105897,
      "loss": 0.1703,
      "num_input_tokens_seen": 155208,
      "step": 240
    },
    {
      "epoch": 4.298245614035087,
      "grad_norm": 0.17578125,
      "learning_rate": 0.028827235230860424,
      "loss": 0.2546,
      "num_input_tokens_seen": 158088,
      "step": 245
    },
    {
      "epoch": 4.385964912280702,
      "grad_norm": 0.06103515625,
      "learning_rate": 0.02873659989982586,
      "loss": 0.2404,
      "num_input_tokens_seen": 161032,
      "step": 250
    },
    {
      "epoch": 4.473684210526316,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.02864274486589371,
      "loss": 0.1903,
      "num_input_tokens_seen": 164744,
      "step": 255
    },
    {
      "epoch": 4.56140350877193,
      "grad_norm": 0.04638671875,
      "learning_rate": 0.028545692127617244,
      "loss": 0.1672,
      "num_input_tokens_seen": 167592,
      "step": 260
    },
    {
      "epoch": 4.649122807017544,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0284454644330553,
      "loss": 0.1302,
      "num_input_tokens_seen": 170696,
      "step": 265
    },
    {
      "epoch": 4.7368421052631575,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.02834208527444037,
      "loss": 0.1811,
      "num_input_tokens_seen": 174024,
      "step": 270
    },
    {
      "epoch": 4.824561403508772,
      "grad_norm": 0.0751953125,
      "learning_rate": 0.028235578882672318,
      "loss": 0.2531,
      "num_input_tokens_seen": 178152,
      "step": 275
    },
    {
      "epoch": 4.912280701754386,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.028125970221638905,
      "loss": 0.1738,
      "num_input_tokens_seen": 181768,
      "step": 280
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.5,
      "learning_rate": 0.028013284982364554,
      "loss": 0.2082,
      "num_input_tokens_seen": 184368,
      "step": 285
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.1401529610157013,
      "eval_runtime": 0.7421,
      "eval_samples_per_second": 33.687,
      "eval_steps_per_second": 9.432,
      "num_input_tokens_seen": 184368,
      "step": 285
    },
    {
      "epoch": 5.087719298245614,
      "grad_norm": 0.091796875,
      "learning_rate": 0.027897549576988666,
      "loss": 0.2181,
      "num_input_tokens_seen": 187312,
      "step": 290
    },
    {
      "epoch": 5.175438596491228,
      "grad_norm": 0.078125,
      "learning_rate": 0.027778791132574907,
      "loss": 0.1867,
      "num_input_tokens_seen": 190736,
      "step": 295
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.02765703748475293,
      "loss": 0.2,
      "num_input_tokens_seen": 194480,
      "step": 300
    },
    {
      "epoch": 5.350877192982456,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.027532317171194046,
      "loss": 0.1937,
      "num_input_tokens_seen": 198576,
      "step": 305
    },
    {
      "epoch": 5.43859649122807,
      "grad_norm": 0.083984375,
      "learning_rate": 0.027404659424922272,
      "loss": 0.1219,
      "num_input_tokens_seen": 201808,
      "step": 310
    },
    {
      "epoch": 5.526315789473684,
      "grad_norm": 0.0625,
      "learning_rate": 0.027274094167462487,
      "loss": 0.1492,
      "num_input_tokens_seen": 204912,
      "step": 315
    },
    {
      "epoch": 5.614035087719298,
      "grad_norm": 0.09716796875,
      "learning_rate": 0.02714065200182714,
      "loss": 0.21,
      "num_input_tokens_seen": 207760,
      "step": 320
    },
    {
      "epoch": 5.701754385964913,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.02700436420534326,
      "loss": 0.1753,
      "num_input_tokens_seen": 210544,
      "step": 325
    },
    {
      "epoch": 5.7894736842105265,
      "grad_norm": 0.0546875,
      "learning_rate": 0.02686526272232141,
      "loss": 0.2484,
      "num_input_tokens_seen": 214480,
      "step": 330
    },
    {
      "epoch": 5.87719298245614,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.026723380156568298,
      "loss": 0.1863,
      "num_input_tokens_seen": 218064,
      "step": 335
    },
    {
      "epoch": 5.964912280701754,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.026578749763744813,
      "loss": 0.2529,
      "num_input_tokens_seen": 220848,
      "step": 340
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.15308572351932526,
      "eval_runtime": 0.7432,
      "eval_samples_per_second": 33.638,
      "eval_steps_per_second": 9.419,
      "num_input_tokens_seen": 221536,
      "step": 342
    },
    {
      "epoch": 6.052631578947368,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.026431405443571282,
      "loss": 0.0901,
      "num_input_tokens_seen": 223616,
      "step": 345
    },
    {
      "epoch": 6.140350877192983,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.02628138173188176,
      "loss": 0.3285,
      "num_input_tokens_seen": 227008,
      "step": 350
    },
    {
      "epoch": 6.228070175438597,
      "grad_norm": 0.07763671875,
      "learning_rate": 0.026128713792529224,
      "loss": 0.1935,
      "num_input_tokens_seen": 230752,
      "step": 355
    },
    {
      "epoch": 6.315789473684211,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.025973437409143554,
      "loss": 0.2851,
      "num_input_tokens_seen": 234208,
      "step": 360
    },
    {
      "epoch": 6.4035087719298245,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.025815588976744273,
      "loss": 0.1699,
      "num_input_tokens_seen": 237664,
      "step": 365
    },
    {
      "epoch": 6.491228070175438,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.02565520549320996,
      "loss": 0.1707,
      "num_input_tokens_seen": 240704,
      "step": 370
    },
    {
      "epoch": 6.578947368421053,
      "grad_norm": 0.064453125,
      "learning_rate": 0.02549232455060637,
      "loss": 0.1825,
      "num_input_tokens_seen": 244256,
      "step": 375
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.025326984326375274,
      "loss": 0.1599,
      "num_input_tokens_seen": 246816,
      "step": 380
    },
    {
      "epoch": 6.754385964912281,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.025159223574386116,
      "loss": 0.2119,
      "num_input_tokens_seen": 249728,
      "step": 385
    },
    {
      "epoch": 6.842105263157895,
      "grad_norm": 0.111328125,
      "learning_rate": 0.02498908161585253,
      "loss": 0.2225,
      "num_input_tokens_seen": 252480,
      "step": 390
    },
    {
      "epoch": 6.9298245614035086,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.024816598330115895,
      "loss": 0.0837,
      "num_input_tokens_seen": 256480,
      "step": 395
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.15708060562610626,
      "eval_runtime": 0.7416,
      "eval_samples_per_second": 33.712,
      "eval_steps_per_second": 9.439,
      "num_input_tokens_seen": 258720,
      "step": 399
    },
    {
      "epoch": 7.017543859649122,
      "grad_norm": 0.0390625,
      "learning_rate": 0.024641814145298088,
      "loss": 0.1405,
      "num_input_tokens_seen": 259424,
      "step": 400
    },
    {
      "epoch": 7.105263157894737,
      "grad_norm": 0.041015625,
      "learning_rate": 0.024464770028825585,
      "loss": 0.0942,
      "num_input_tokens_seen": 262656,
      "step": 405
    },
    {
      "epoch": 7.192982456140351,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.024285507477827137,
      "loss": 0.1099,
      "num_input_tokens_seen": 265408,
      "step": 410
    },
    {
      "epoch": 7.280701754385965,
      "grad_norm": 0.17578125,
      "learning_rate": 0.02410406850940735,
      "loss": 0.304,
      "num_input_tokens_seen": 268992,
      "step": 415
    },
    {
      "epoch": 7.368421052631579,
      "grad_norm": 0.134765625,
      "learning_rate": 0.02392049565079834,
      "loss": 0.2995,
      "num_input_tokens_seen": 272288,
      "step": 420
    },
    {
      "epoch": 7.456140350877193,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.023734831929391822,
      "loss": 0.1281,
      "num_input_tokens_seen": 275104,
      "step": 425
    },
    {
      "epoch": 7.543859649122807,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.02354712086265399,
      "loss": 0.1621,
      "num_input_tokens_seen": 278272,
      "step": 430
    },
    {
      "epoch": 7.631578947368421,
      "grad_norm": 0.076171875,
      "learning_rate": 0.023357406447925527,
      "loss": 0.2195,
      "num_input_tokens_seen": 282048,
      "step": 435
    },
    {
      "epoch": 7.719298245614035,
      "grad_norm": 0.0732421875,
      "learning_rate": 0.023165733152109094,
      "loss": 0.2152,
      "num_input_tokens_seen": 285536,
      "step": 440
    },
    {
      "epoch": 7.807017543859649,
      "grad_norm": 0.140625,
      "learning_rate": 0.022972145901246837,
      "loss": 0.1531,
      "num_input_tokens_seen": 288416,
      "step": 445
    },
    {
      "epoch": 7.894736842105263,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.022776690069990207,
      "loss": 0.2291,
      "num_input_tokens_seen": 291968,
      "step": 450
    },
    {
      "epoch": 7.982456140350877,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.022579411470964646,
      "loss": 0.1676,
      "num_input_tokens_seen": 295296,
      "step": 455
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.129821315407753,
      "eval_runtime": 0.7407,
      "eval_samples_per_second": 33.753,
      "eval_steps_per_second": 9.451,
      "num_input_tokens_seen": 295408,
      "step": 456
    },
    {
      "epoch": 8.070175438596491,
      "grad_norm": 0.255859375,
      "learning_rate": 0.022380356344031675,
      "loss": 0.1459,
      "num_input_tokens_seen": 298480,
      "step": 460
    },
    {
      "epoch": 8.157894736842104,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.02217957134545074,
      "loss": 0.2508,
      "num_input_tokens_seen": 301904,
      "step": 465
    },
    {
      "epoch": 8.24561403508772,
      "grad_norm": 0.1240234375,
      "learning_rate": 0.02197710353694355,
      "loss": 0.151,
      "num_input_tokens_seen": 305488,
      "step": 470
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.02177300037466334,
      "loss": 0.0978,
      "num_input_tokens_seen": 308336,
      "step": 475
    },
    {
      "epoch": 8.421052631578947,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.02156730969807168,
      "loss": 0.1797,
      "num_input_tokens_seen": 311600,
      "step": 480
    },
    {
      "epoch": 8.508771929824562,
      "grad_norm": 0.09765625,
      "learning_rate": 0.021360079718725448,
      "loss": 0.1841,
      "num_input_tokens_seen": 315216,
      "step": 485
    },
    {
      "epoch": 8.596491228070175,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.021151359008976602,
      "loss": 0.2088,
      "num_input_tokens_seen": 318640,
      "step": 490
    },
    {
      "epoch": 8.68421052631579,
      "grad_norm": 0.2001953125,
      "learning_rate": 0.02094119649058735,
      "loss": 0.1501,
      "num_input_tokens_seen": 321680,
      "step": 495
    },
    {
      "epoch": 8.771929824561404,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.020729641423263476,
      "loss": 0.1501,
      "num_input_tokens_seen": 324464,
      "step": 500
    },
    {
      "epoch": 8.859649122807017,
      "grad_norm": 0.0537109375,
      "learning_rate": 0.0205167433931084,
      "loss": 0.173,
      "num_input_tokens_seen": 327888,
      "step": 505
    },
    {
      "epoch": 8.947368421052632,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.020302552301000754,
      "loss": 0.205,
      "num_input_tokens_seen": 331312,
      "step": 510
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.13734634220600128,
      "eval_runtime": 0.7435,
      "eval_samples_per_second": 33.626,
      "eval_steps_per_second": 9.415,
      "num_input_tokens_seen": 332648,
      "step": 513
    },
    {
      "epoch": 9.035087719298245,
      "grad_norm": 0.045166015625,
      "learning_rate": 0.02008711835089822,
      "loss": 0.1623,
      "num_input_tokens_seen": 334120,
      "step": 515
    },
    {
      "epoch": 9.12280701754386,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.019870492038070255,
      "loss": 0.1595,
      "num_input_tokens_seen": 338280,
      "step": 520
    },
    {
      "epoch": 9.210526315789474,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.01965272413726258,
      "loss": 0.1964,
      "num_input_tokens_seen": 341064,
      "step": 525
    },
    {
      "epoch": 9.298245614035087,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.01943386569079618,
      "loss": 0.1241,
      "num_input_tokens_seen": 344552,
      "step": 530
    },
    {
      "epoch": 9.385964912280702,
      "grad_norm": 0.099609375,
      "learning_rate": 0.01921396799660354,
      "loss": 0.1272,
      "num_input_tokens_seen": 347592,
      "step": 535
    },
    {
      "epoch": 9.473684210526315,
      "grad_norm": 0.15234375,
      "learning_rate": 0.018993082596205,
      "loss": 0.2197,
      "num_input_tokens_seen": 350440,
      "step": 540
    },
    {
      "epoch": 9.56140350877193,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.018771261262628014,
      "loss": 0.1533,
      "num_input_tokens_seen": 353992,
      "step": 545
    },
    {
      "epoch": 9.649122807017545,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.018548555988272136,
      "loss": 0.1385,
      "num_input_tokens_seen": 357544,
      "step": 550
    },
    {
      "epoch": 9.736842105263158,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.018325018972722578,
      "loss": 0.1268,
      "num_input_tokens_seen": 360872,
      "step": 555
    },
    {
      "epoch": 9.824561403508772,
      "grad_norm": 0.125,
      "learning_rate": 0.01810070261051526,
      "loss": 0.2355,
      "num_input_tokens_seen": 364200,
      "step": 560
    },
    {
      "epoch": 9.912280701754385,
      "grad_norm": 0.068359375,
      "learning_rate": 0.01787565947885608,
      "loss": 0.1584,
      "num_input_tokens_seen": 367656,
      "step": 565
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.01764994232529744,
      "loss": 0.2853,
      "num_input_tokens_seen": 369976,
      "step": 570
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.12465743720531464,
      "eval_runtime": 0.7405,
      "eval_samples_per_second": 33.759,
      "eval_steps_per_second": 9.452,
      "num_input_tokens_seen": 369976,
      "step": 570
    },
    {
      "epoch": 10.087719298245615,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.01742360405537482,
      "loss": 0.2618,
      "num_input_tokens_seen": 373304,
      "step": 575
    },
    {
      "epoch": 10.175438596491228,
      "grad_norm": 0.061279296875,
      "learning_rate": 0.017196697720206326,
      "loss": 0.1646,
      "num_input_tokens_seen": 376632,
      "step": 580
    },
    {
      "epoch": 10.263157894736842,
      "grad_norm": 0.11474609375,
      "learning_rate": 0.01696927650405807,
      "loss": 0.1935,
      "num_input_tokens_seen": 379608,
      "step": 585
    },
    {
      "epoch": 10.350877192982455,
      "grad_norm": 0.04443359375,
      "learning_rate": 0.016741393711878452,
      "loss": 0.1146,
      "num_input_tokens_seen": 382296,
      "step": 590
    },
    {
      "epoch": 10.43859649122807,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.016513102756804024,
      "loss": 0.1293,
      "num_input_tokens_seen": 385880,
      "step": 595
    },
    {
      "epoch": 10.526315789473685,
      "grad_norm": 0.06689453125,
      "learning_rate": 0.016284457147640083,
      "loss": 0.17,
      "num_input_tokens_seen": 388824,
      "step": 600
    },
    {
      "epoch": 10.614035087719298,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.016055510476318827,
      "loss": 0.1254,
      "num_input_tokens_seen": 391640,
      "step": 605
    },
    {
      "epoch": 10.701754385964913,
      "grad_norm": 0.099609375,
      "learning_rate": 0.015826316405337982,
      "loss": 0.1194,
      "num_input_tokens_seen": 394968,
      "step": 610
    },
    {
      "epoch": 10.789473684210526,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.015596928655182963,
      "loss": 0.0889,
      "num_input_tokens_seen": 398520,
      "step": 615
    },
    {
      "epoch": 10.87719298245614,
      "grad_norm": 0.072265625,
      "learning_rate": 0.015367400991735372,
      "loss": 0.1441,
      "num_input_tokens_seen": 402328,
      "step": 620
    },
    {
      "epoch": 10.964912280701755,
      "grad_norm": 0.035400390625,
      "learning_rate": 0.015137787213670897,
      "loss": 0.106,
      "num_input_tokens_seen": 405880,
      "step": 625
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.12050420790910721,
      "eval_runtime": 0.7396,
      "eval_samples_per_second": 33.803,
      "eval_steps_per_second": 9.465,
      "num_input_tokens_seen": 406840,
      "step": 627
    },
    {
      "epoch": 11.052631578947368,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.014908141139849508,
      "loss": 0.1113,
      "num_input_tokens_seen": 408632,
      "step": 630
    },
    {
      "epoch": 11.140350877192983,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.014678516596700955,
      "loss": 0.1159,
      "num_input_tokens_seen": 411544,
      "step": 635
    },
    {
      "epoch": 11.228070175438596,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.014448967405608415,
      "loss": 0.1659,
      "num_input_tokens_seen": 414648,
      "step": 640
    },
    {
      "epoch": 11.31578947368421,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.014219547370293413,
      "loss": 0.1448,
      "num_input_tokens_seen": 418968,
      "step": 645
    },
    {
      "epoch": 11.403508771929825,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.013990310264204829,
      "loss": 0.1033,
      "num_input_tokens_seen": 422584,
      "step": 650
    },
    {
      "epoch": 11.491228070175438,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.013761309817915016,
      "loss": 0.1087,
      "num_input_tokens_seen": 426136,
      "step": 655
    },
    {
      "epoch": 11.578947368421053,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.013532599706525941,
      "loss": 0.1414,
      "num_input_tokens_seen": 429752,
      "step": 660
    },
    {
      "epoch": 11.666666666666666,
      "grad_norm": 0.150390625,
      "learning_rate": 0.013304233537088392,
      "loss": 0.2471,
      "num_input_tokens_seen": 432920,
      "step": 665
    },
    {
      "epoch": 11.75438596491228,
      "grad_norm": 0.2099609375,
      "learning_rate": 0.013076264836037051,
      "loss": 0.1626,
      "num_input_tokens_seen": 435608,
      "step": 670
    },
    {
      "epoch": 11.842105263157894,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.012848747036644558,
      "loss": 0.2155,
      "num_input_tokens_seen": 438744,
      "step": 675
    },
    {
      "epoch": 11.929824561403509,
      "grad_norm": 0.083984375,
      "learning_rate": 0.012621733466497287,
      "loss": 0.1006,
      "num_input_tokens_seen": 442200,
      "step": 680
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.08342564105987549,
      "eval_runtime": 0.7433,
      "eval_samples_per_second": 33.633,
      "eval_steps_per_second": 9.417,
      "num_input_tokens_seen": 444728,
      "step": 684
    },
    {
      "epoch": 12.017543859649123,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.012395277334996044,
      "loss": 0.1252,
      "num_input_tokens_seen": 445624,
      "step": 685
    },
    {
      "epoch": 12.105263157894736,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.012169431720884335,
      "loss": 0.0541,
      "num_input_tokens_seen": 448632,
      "step": 690
    },
    {
      "epoch": 12.192982456140351,
      "grad_norm": 0.0234375,
      "learning_rate": 0.01194424955980734,
      "loss": 0.1043,
      "num_input_tokens_seen": 452248,
      "step": 695
    },
    {
      "epoch": 12.280701754385966,
      "grad_norm": 0.11376953125,
      "learning_rate": 0.011719783631904363,
      "loss": 0.1166,
      "num_input_tokens_seen": 455544,
      "step": 700
    },
    {
      "epoch": 12.368421052631579,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.01149608654943782,
      "loss": 0.1715,
      "num_input_tokens_seen": 458520,
      "step": 705
    },
    {
      "epoch": 12.456140350877194,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.011273210744461479,
      "loss": 0.069,
      "num_input_tokens_seen": 462264,
      "step": 710
    },
    {
      "epoch": 12.543859649122806,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.011051208456531014,
      "loss": 0.108,
      "num_input_tokens_seen": 465624,
      "step": 715
    },
    {
      "epoch": 12.631578947368421,
      "grad_norm": 0.162109375,
      "learning_rate": 0.010830131720459601,
      "loss": 0.1374,
      "num_input_tokens_seen": 468536,
      "step": 720
    },
    {
      "epoch": 12.719298245614034,
      "grad_norm": 0.016357421875,
      "learning_rate": 0.010610032354121612,
      "loss": 0.0853,
      "num_input_tokens_seen": 472408,
      "step": 725
    },
    {
      "epoch": 12.807017543859649,
      "grad_norm": 0.0186767578125,
      "learning_rate": 0.01039096194630704,
      "loss": 0.1083,
      "num_input_tokens_seen": 475416,
      "step": 730
    },
    {
      "epoch": 12.894736842105264,
      "grad_norm": 0.232421875,
      "learning_rate": 0.010172971844629716,
      "loss": 0.1537,
      "num_input_tokens_seen": 478712,
      "step": 735
    },
    {
      "epoch": 12.982456140350877,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.009956113143491957,
      "loss": 0.1631,
      "num_input_tokens_seen": 481592,
      "step": 740
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.14919985830783844,
      "eval_runtime": 0.7423,
      "eval_samples_per_second": 33.681,
      "eval_steps_per_second": 9.431,
      "num_input_tokens_seen": 481720,
      "step": 741
    },
    {
      "epoch": 13.070175438596491,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.009740436672108685,
      "loss": 0.0392,
      "num_input_tokens_seen": 484856,
      "step": 745
    },
    {
      "epoch": 13.157894736842104,
      "grad_norm": 0.10546875,
      "learning_rate": 0.009525992982593583,
      "loss": 0.1222,
      "num_input_tokens_seen": 487480,
      "step": 750
    },
    {
      "epoch": 13.24561403508772,
      "grad_norm": 0.072265625,
      "learning_rate": 0.009312832338110292,
      "loss": 0.098,
      "num_input_tokens_seen": 490520,
      "step": 755
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 0.11767578125,
      "learning_rate": 0.009101004701091252,
      "loss": 0.1119,
      "num_input_tokens_seen": 493976,
      "step": 760
    },
    {
      "epoch": 13.421052631578947,
      "grad_norm": 0.01495361328125,
      "learning_rate": 0.008890559721527138,
      "loss": 0.1298,
      "num_input_tokens_seen": 497080,
      "step": 765
    },
    {
      "epoch": 13.508771929824562,
      "grad_norm": 0.263671875,
      "learning_rate": 0.008681546725329408,
      "loss": 0.075,
      "num_input_tokens_seen": 500792,
      "step": 770
    },
    {
      "epoch": 13.596491228070175,
      "grad_norm": 0.2109375,
      "learning_rate": 0.008474014702768904,
      "loss": 0.1238,
      "num_input_tokens_seen": 503832,
      "step": 775
    },
    {
      "epoch": 13.68421052631579,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.008268012296993067,
      "loss": 0.0857,
      "num_input_tokens_seen": 507256,
      "step": 780
    },
    {
      "epoch": 13.771929824561404,
      "grad_norm": 0.1142578125,
      "learning_rate": 0.008063587792624567,
      "loss": 0.0511,
      "num_input_tokens_seen": 510616,
      "step": 785
    },
    {
      "epoch": 13.859649122807017,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.007860789104443896,
      "loss": 0.1149,
      "num_input_tokens_seen": 514200,
      "step": 790
    },
    {
      "epoch": 13.947368421052632,
      "grad_norm": 0.4140625,
      "learning_rate": 0.0076596637661587325,
      "loss": 0.1947,
      "num_input_tokens_seen": 517400,
      "step": 795
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.16880536079406738,
      "eval_runtime": 0.7451,
      "eval_samples_per_second": 33.552,
      "eval_steps_per_second": 9.394,
      "num_input_tokens_seen": 518664,
      "step": 798
    },
    {
      "epoch": 14.035087719298245,
      "grad_norm": 0.134765625,
      "learning_rate": 0.007460258919262529,
      "loss": 0.1395,
      "num_input_tokens_seen": 519848,
      "step": 800
    },
    {
      "epoch": 14.12280701754386,
      "grad_norm": 0.031005859375,
      "learning_rate": 0.007262621301985144,
      "loss": 0.1472,
      "num_input_tokens_seen": 523176,
      "step": 805
    },
    {
      "epoch": 14.210526315789474,
      "grad_norm": 0.205078125,
      "learning_rate": 0.007066797238337862,
      "loss": 0.1038,
      "num_input_tokens_seen": 526280,
      "step": 810
    },
    {
      "epoch": 14.298245614035087,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.006872832627255643,
      "loss": 0.0905,
      "num_input_tokens_seen": 529160,
      "step": 815
    },
    {
      "epoch": 14.385964912280702,
      "grad_norm": 0.474609375,
      "learning_rate": 0.006680772931838868,
      "loss": 0.1048,
      "num_input_tokens_seen": 533256,
      "step": 820
    },
    {
      "epoch": 14.473684210526315,
      "grad_norm": 0.171875,
      "learning_rate": 0.00649066316869736,
      "loss": 0.0905,
      "num_input_tokens_seen": 536712,
      "step": 825
    },
    {
      "epoch": 14.56140350877193,
      "grad_norm": 0.04296875,
      "learning_rate": 0.0063025478973989585,
      "loss": 0.0904,
      "num_input_tokens_seen": 539912,
      "step": 830
    },
    {
      "epoch": 14.649122807017545,
      "grad_norm": 0.080078125,
      "learning_rate": 0.006116471210025301,
      "loss": 0.0864,
      "num_input_tokens_seen": 543336,
      "step": 835
    },
    {
      "epoch": 14.736842105263158,
      "grad_norm": 0.060302734375,
      "learning_rate": 0.005932476720837105,
      "loss": 0.0591,
      "num_input_tokens_seen": 547496,
      "step": 840
    },
    {
      "epoch": 14.824561403508772,
      "grad_norm": 0.109375,
      "learning_rate": 0.005750607556051514,
      "loss": 0.1668,
      "num_input_tokens_seen": 550280,
      "step": 845
    },
    {
      "epoch": 14.912280701754385,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.0055709063437337685,
      "loss": 0.0702,
      "num_input_tokens_seen": 553672,
      "step": 850
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.185546875,
      "learning_rate": 0.005393415203805707,
      "loss": 0.0517,
      "num_input_tokens_seen": 555728,
      "step": 855
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.08917699009180069,
      "eval_runtime": 0.7506,
      "eval_samples_per_second": 33.305,
      "eval_steps_per_second": 9.325,
      "num_input_tokens_seen": 555728,
      "step": 855
    },
    {
      "epoch": 15.087719298245615,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.005218175738173303,
      "loss": 0.0408,
      "num_input_tokens_seen": 559312,
      "step": 860
    },
    {
      "epoch": 15.175438596491228,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.005045229020975681,
      "loss": 0.0717,
      "num_input_tokens_seen": 563056,
      "step": 865
    },
    {
      "epoch": 15.263157894736842,
      "grad_norm": 0.1298828125,
      "learning_rate": 0.004874615588957773,
      "loss": 0.1417,
      "num_input_tokens_seen": 566224,
      "step": 870
    },
    {
      "epoch": 15.350877192982455,
      "grad_norm": 0.212890625,
      "learning_rate": 0.0047063754319689975,
      "loss": 0.0827,
      "num_input_tokens_seen": 569776,
      "step": 875
    },
    {
      "epoch": 15.43859649122807,
      "grad_norm": 0.16796875,
      "learning_rate": 0.004540547983590069,
      "loss": 0.0437,
      "num_input_tokens_seen": 573360,
      "step": 880
    },
    {
      "epoch": 15.526315789473685,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.0043771721118902335,
      "loss": 0.0388,
      "num_input_tokens_seen": 576144,
      "step": 885
    },
    {
      "epoch": 15.614035087719298,
      "grad_norm": 0.240234375,
      "learning_rate": 0.004216286110317013,
      "loss": 0.0481,
      "num_input_tokens_seen": 579088,
      "step": 890
    },
    {
      "epoch": 15.701754385964913,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.00405792768872069,
      "loss": 0.0502,
      "num_input_tokens_seen": 582032,
      "step": 895
    },
    {
      "epoch": 15.789473684210526,
      "grad_norm": 0.044189453125,
      "learning_rate": 0.003902133964515502,
      "loss": 0.0272,
      "num_input_tokens_seen": 585584,
      "step": 900
    },
    {
      "epoch": 15.87719298245614,
      "grad_norm": 0.1103515625,
      "learning_rate": 0.00374894145397979,
      "loss": 0.0533,
      "num_input_tokens_seen": 588944,
      "step": 905
    },
    {
      "epoch": 15.964912280701755,
      "grad_norm": 0.28125,
      "learning_rate": 0.0035983860636969525,
      "loss": 0.036,
      "num_input_tokens_seen": 592112,
      "step": 910
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.10122097283601761,
      "eval_runtime": 0.7415,
      "eval_samples_per_second": 33.716,
      "eval_steps_per_second": 9.441,
      "num_input_tokens_seen": 593096,
      "step": 912
    },
    {
      "epoch": 16.05263157894737,
      "grad_norm": 0.2041015625,
      "learning_rate": 0.003450503082139393,
      "loss": 0.1277,
      "num_input_tokens_seen": 594440,
      "step": 915
    },
    {
      "epoch": 16.140350877192983,
      "grad_norm": 1.4453125,
      "learning_rate": 0.003305327171397263,
      "loss": 0.1257,
      "num_input_tokens_seen": 598440,
      "step": 920
    },
    {
      "epoch": 16.228070175438596,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.003162892359054098,
      "loss": 0.0796,
      "num_input_tokens_seen": 602152,
      "step": 925
    },
    {
      "epoch": 16.31578947368421,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.0030232320302111047,
      "loss": 0.061,
      "num_input_tokens_seen": 605736,
      "step": 930
    },
    {
      "epoch": 16.403508771929825,
      "grad_norm": 0.0206298828125,
      "learning_rate": 0.0028863789196621093,
      "loss": 0.0292,
      "num_input_tokens_seen": 608520,
      "step": 935
    },
    {
      "epoch": 16.49122807017544,
      "grad_norm": 0.00885009765625,
      "learning_rate": 0.0027523651042208564,
      "loss": 0.0129,
      "num_input_tokens_seen": 612040,
      "step": 940
    },
    {
      "epoch": 16.57894736842105,
      "grad_norm": 0.42578125,
      "learning_rate": 0.0026212219952026,
      "loss": 0.0482,
      "num_input_tokens_seen": 614920,
      "step": 945
    },
    {
      "epoch": 16.666666666666668,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.0024929803310616222,
      "loss": 0.0229,
      "num_input_tokens_seen": 618440,
      "step": 950
    },
    {
      "epoch": 16.75438596491228,
      "grad_norm": 0.12255859375,
      "learning_rate": 0.002367670170186516,
      "loss": 0.0582,
      "num_input_tokens_seen": 621320,
      "step": 955
    },
    {
      "epoch": 16.842105263157894,
      "grad_norm": 0.0177001953125,
      "learning_rate": 0.00224532088385481,
      "loss": 0.0224,
      "num_input_tokens_seen": 624648,
      "step": 960
    },
    {
      "epoch": 16.92982456140351,
      "grad_norm": 0.08203125,
      "learning_rate": 0.002125961149348706,
      "loss": 0.0262,
      "num_input_tokens_seen": 628232,
      "step": 965
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.11464478075504303,
      "eval_runtime": 0.7524,
      "eval_samples_per_second": 33.228,
      "eval_steps_per_second": 9.304,
      "num_input_tokens_seen": 629760,
      "step": 969
    },
    {
      "epoch": 17.017543859649123,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.0020096189432334192,
      "loss": 0.0581,
      "num_input_tokens_seen": 630528,
      "step": 970
    },
    {
      "epoch": 17.105263157894736,
      "grad_norm": 0.044189453125,
      "learning_rate": 0.001896321534799823,
      "loss": 0.0182,
      "num_input_tokens_seen": 633600,
      "step": 975
    },
    {
      "epoch": 17.19298245614035,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0017860954796727994,
      "loss": 0.0213,
      "num_input_tokens_seen": 637024,
      "step": 980
    },
    {
      "epoch": 17.280701754385966,
      "grad_norm": 0.0118408203125,
      "learning_rate": 0.0016789666135869374,
      "loss": 0.0481,
      "num_input_tokens_seen": 640160,
      "step": 985
    },
    {
      "epoch": 17.36842105263158,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.0015749600463309049,
      "loss": 0.0442,
      "num_input_tokens_seen": 643040,
      "step": 990
    },
    {
      "epoch": 17.45614035087719,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.0014741001558620163,
      "loss": 0.0175,
      "num_input_tokens_seen": 646336,
      "step": 995
    },
    {
      "epoch": 17.54385964912281,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.0013764105825923066,
      "loss": 0.0325,
      "num_input_tokens_seen": 649984,
      "step": 1000
    },
    {
      "epoch": 17.63157894736842,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0012819142238474862,
      "loss": 0.0514,
      "num_input_tokens_seen": 653632,
      "step": 1005
    },
    {
      "epoch": 17.719298245614034,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.0011906332285000793,
      "loss": 0.0158,
      "num_input_tokens_seen": 656960,
      "step": 1010
    },
    {
      "epoch": 17.80701754385965,
      "grad_norm": 0.02783203125,
      "learning_rate": 0.0011025889917779736,
      "loss": 0.0141,
      "num_input_tokens_seen": 659840,
      "step": 1015
    },
    {
      "epoch": 17.894736842105264,
      "grad_norm": 0.111328125,
      "learning_rate": 0.0010178021502496165,
      "loss": 0.0237,
      "num_input_tokens_seen": 663872,
      "step": 1020
    },
    {
      "epoch": 17.982456140350877,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0009362925769870394,
      "loss": 0.0158,
      "num_input_tokens_seen": 667264,
      "step": 1025
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.10041998326778412,
      "eval_runtime": 0.7419,
      "eval_samples_per_second": 33.695,
      "eval_steps_per_second": 9.435,
      "num_input_tokens_seen": 667432,
      "step": 1026
    },
    {
      "epoch": 18.07017543859649,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0008580793769078487,
      "loss": 0.0242,
      "num_input_tokens_seen": 669544,
      "step": 1030
    },
    {
      "epoch": 18.157894736842106,
      "grad_norm": 0.04150390625,
      "learning_rate": 0.0007831808822972391,
      "loss": 0.0193,
      "num_input_tokens_seen": 672776,
      "step": 1035
    },
    {
      "epoch": 18.24561403508772,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.0007116146485111063,
      "loss": 0.0089,
      "num_input_tokens_seen": 676008,
      "step": 1040
    },
    {
      "epoch": 18.333333333333332,
      "grad_norm": 0.00933837890625,
      "learning_rate": 0.0006433974498612882,
      "loss": 0.0161,
      "num_input_tokens_seen": 679848,
      "step": 1045
    },
    {
      "epoch": 18.42105263157895,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.0005785452756838482,
      "loss": 0.011,
      "num_input_tokens_seen": 683144,
      "step": 1050
    },
    {
      "epoch": 18.50877192982456,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.0005170733265913585,
      "loss": 0.0204,
      "num_input_tokens_seen": 686280,
      "step": 1055
    },
    {
      "epoch": 18.596491228070175,
      "grad_norm": 0.00927734375,
      "learning_rate": 0.0004589960109100444,
      "loss": 0.0134,
      "num_input_tokens_seen": 689480,
      "step": 1060
    },
    {
      "epoch": 18.68421052631579,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00040432694130264286,
      "loss": 0.0541,
      "num_input_tokens_seen": 692296,
      "step": 1065
    },
    {
      "epoch": 18.771929824561404,
      "grad_norm": 0.00933837890625,
      "learning_rate": 0.0003530789315777466,
      "loss": 0.0109,
      "num_input_tokens_seen": 695912,
      "step": 1070
    },
    {
      "epoch": 18.859649122807017,
      "grad_norm": 0.10498046875,
      "learning_rate": 0.000305263993686391,
      "loss": 0.022,
      "num_input_tokens_seen": 699432,
      "step": 1075
    },
    {
      "epoch": 18.94736842105263,
      "grad_norm": 0.1884765625,
      "learning_rate": 0.000260893334906595,
      "loss": 0.0446,
      "num_input_tokens_seen": 703080,
      "step": 1080
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.09905951470136642,
      "eval_runtime": 0.7596,
      "eval_samples_per_second": 32.911,
      "eval_steps_per_second": 9.215,
      "num_input_tokens_seen": 704816,
      "step": 1083
    },
    {
      "epoch": 19.035087719298247,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.00021997735521649408,
      "loss": 0.0284,
      "num_input_tokens_seen": 706192,
      "step": 1085
    },
    {
      "epoch": 19.12280701754386,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.00018252564485670973,
      "loss": 0.0271,
      "num_input_tokens_seen": 709072,
      "step": 1090
    },
    {
      "epoch": 19.210526315789473,
      "grad_norm": 0.01220703125,
      "learning_rate": 0.00014854698208250638,
      "loss": 0.0134,
      "num_input_tokens_seen": 712496,
      "step": 1095
    },
    {
      "epoch": 19.29824561403509,
      "grad_norm": 0.045166015625,
      "learning_rate": 0.00011804933110626359,
      "loss": 0.016,
      "num_input_tokens_seen": 715056,
      "step": 1100
    },
    {
      "epoch": 19.385964912280702,
      "grad_norm": 0.021240234375,
      "learning_rate": 9.103984023075773e-05,
      "loss": 0.0323,
      "num_input_tokens_seen": 718416,
      "step": 1105
    },
    {
      "epoch": 19.473684210526315,
      "grad_norm": 0.2734375,
      "learning_rate": 6.752484017368553e-05,
      "loss": 0.0613,
      "num_input_tokens_seen": 721744,
      "step": 1110
    },
    {
      "epoch": 19.56140350877193,
      "grad_norm": 0.06982421875,
      "learning_rate": 4.750984258380608e-05,
      "loss": 0.0148,
      "num_input_tokens_seen": 725328,
      "step": 1115
    },
    {
      "epoch": 19.649122807017545,
      "grad_norm": 0.021240234375,
      "learning_rate": 3.099953874908079e-05,
      "loss": 0.0154,
      "num_input_tokens_seen": 728976,
      "step": 1120
    },
    {
      "epoch": 19.736842105263158,
      "grad_norm": 0.1904296875,
      "learning_rate": 1.7997798497084714e-05,
      "loss": 0.0241,
      "num_input_tokens_seen": 732368,
      "step": 1125
    },
    {
      "epoch": 19.82456140350877,
      "grad_norm": 0.0634765625,
      "learning_rate": 8.50766928796709e-06,
      "loss": 0.0123,
      "num_input_tokens_seen": 735440,
      "step": 1130
    },
    {
      "epoch": 19.912280701754387,
      "grad_norm": 0.02783203125,
      "learning_rate": 2.5313755001593604e-06,
      "loss": 0.0076,
      "num_input_tokens_seen": 739120,
      "step": 1135
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.01007080078125,
      "learning_rate": 7.03179089989181e-08,
      "loss": 0.0233,
      "num_input_tokens_seen": 742296,
      "step": 1140
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.09988410025835037,
      "eval_runtime": 0.7421,
      "eval_samples_per_second": 33.687,
      "eval_steps_per_second": 9.432,
      "num_input_tokens_seen": 742296,
      "step": 1140
    },
    {
      "epoch": 20.0,
      "num_input_tokens_seen": 742296,
      "step": 1140,
      "total_flos": 3.342525338596147e+16,
      "train_loss": 0.25635436321363636,
      "train_runtime": 289.0272,
      "train_samples_per_second": 15.569,
      "train_steps_per_second": 3.944
    }
  ],
  "logging_steps": 5,
  "max_steps": 1140,
  "num_input_tokens_seen": 742296,
  "num_train_epochs": 20,
  "save_steps": 57,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.342525338596147e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}