| { |
| "best_global_step": 1140, |
| "best_metric": 0.10313867032527924, |
| "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_cb_42_1760623601/checkpoint-1140", |
| "epoch": 20.0, |
| "eval_steps": 57, |
| "global_step": 1140, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.08771929824561403, |
| "grad_norm": 3.354792833328247, |
| "learning_rate": 1.7543859649122807e-06, |
| "loss": 1.0538, |
| "num_input_tokens_seen": 3232, |
| "step": 5, |
| "train_runtime": 4.0544, |
| "train_tokens_per_second": 797.153 |
| }, |
| { |
| "epoch": 0.17543859649122806, |
| "grad_norm": 3.0176336765289307, |
| "learning_rate": 3.9473684210526315e-06, |
| "loss": 1.0309, |
| "num_input_tokens_seen": 6432, |
| "step": 10, |
| "train_runtime": 5.4768, |
| "train_tokens_per_second": 1174.404 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 2.661266565322876, |
| "learning_rate": 6.140350877192982e-06, |
| "loss": 1.0359, |
| "num_input_tokens_seen": 9376, |
| "step": 15, |
| "train_runtime": 6.6968, |
| "train_tokens_per_second": 1400.074 |
| }, |
| { |
| "epoch": 0.3508771929824561, |
| "grad_norm": 3.1714117527008057, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 1.0625, |
| "num_input_tokens_seen": 12448, |
| "step": 20, |
| "train_runtime": 7.7696, |
| "train_tokens_per_second": 1602.146 |
| }, |
| { |
| "epoch": 0.43859649122807015, |
| "grad_norm": 3.6407687664031982, |
| "learning_rate": 1.0526315789473684e-05, |
| "loss": 1.1393, |
| "num_input_tokens_seen": 15456, |
| "step": 25, |
| "train_runtime": 8.7928, |
| "train_tokens_per_second": 1757.808 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 3.01566481590271, |
| "learning_rate": 1.2719298245614037e-05, |
| "loss": 1.0549, |
| "num_input_tokens_seen": 18592, |
| "step": 30, |
| "train_runtime": 9.8688, |
| "train_tokens_per_second": 1883.908 |
| }, |
| { |
| "epoch": 0.6140350877192983, |
| "grad_norm": 3.68526291847229, |
| "learning_rate": 1.4912280701754386e-05, |
| "loss": 1.1235, |
| "num_input_tokens_seen": 23136, |
| "step": 35, |
| "train_runtime": 11.2885, |
| "train_tokens_per_second": 2049.515 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 3.1010959148406982, |
| "learning_rate": 1.7105263157894737e-05, |
| "loss": 1.1976, |
| "num_input_tokens_seen": 26496, |
| "step": 40, |
| "train_runtime": 12.4133, |
| "train_tokens_per_second": 2134.492 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 2.408888816833496, |
| "learning_rate": 1.929824561403509e-05, |
| "loss": 1.1655, |
| "num_input_tokens_seen": 29728, |
| "step": 45, |
| "train_runtime": 13.5113, |
| "train_tokens_per_second": 2200.238 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 2.720629930496216, |
| "learning_rate": 2.149122807017544e-05, |
| "loss": 1.184, |
| "num_input_tokens_seen": 33024, |
| "step": 50, |
| "train_runtime": 14.5961, |
| "train_tokens_per_second": 2262.517 |
| }, |
| { |
| "epoch": 0.9649122807017544, |
| "grad_norm": 3.56002140045166, |
| "learning_rate": 2.368421052631579e-05, |
| "loss": 1.0884, |
| "num_input_tokens_seen": 35904, |
| "step": 55, |
| "train_runtime": 15.6262, |
| "train_tokens_per_second": 2297.68 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.0677411556243896, |
| "eval_runtime": 0.5825, |
| "eval_samples_per_second": 42.919, |
| "eval_steps_per_second": 12.017, |
| "num_input_tokens_seen": 36480, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 2.2148914337158203, |
| "learning_rate": 2.5877192982456143e-05, |
| "loss": 0.9972, |
| "num_input_tokens_seen": 38016, |
| "step": 60, |
| "train_runtime": 19.5204, |
| "train_tokens_per_second": 1947.499 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 2.6269009113311768, |
| "learning_rate": 2.8070175438596492e-05, |
| "loss": 1.1312, |
| "num_input_tokens_seen": 40800, |
| "step": 65, |
| "train_runtime": 20.511, |
| "train_tokens_per_second": 1989.179 |
| }, |
| { |
| "epoch": 1.2280701754385965, |
| "grad_norm": 3.3006234169006348, |
| "learning_rate": 3.0263157894736844e-05, |
| "loss": 1.1464, |
| "num_input_tokens_seen": 43648, |
| "step": 70, |
| "train_runtime": 21.543, |
| "train_tokens_per_second": 2026.084 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 3.3827505111694336, |
| "learning_rate": 3.24561403508772e-05, |
| "loss": 1.1615, |
| "num_input_tokens_seen": 46112, |
| "step": 75, |
| "train_runtime": 22.4546, |
| "train_tokens_per_second": 2053.561 |
| }, |
| { |
| "epoch": 1.4035087719298245, |
| "grad_norm": 2.930447578430176, |
| "learning_rate": 3.4649122807017546e-05, |
| "loss": 1.1548, |
| "num_input_tokens_seen": 49632, |
| "step": 80, |
| "train_runtime": 23.6151, |
| "train_tokens_per_second": 2101.707 |
| }, |
| { |
| "epoch": 1.4912280701754386, |
| "grad_norm": 2.7595036029815674, |
| "learning_rate": 3.6842105263157895e-05, |
| "loss": 1.0904, |
| "num_input_tokens_seen": 53024, |
| "step": 85, |
| "train_runtime": 24.7506, |
| "train_tokens_per_second": 2142.33 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 4.10291051864624, |
| "learning_rate": 3.9035087719298244e-05, |
| "loss": 1.0917, |
| "num_input_tokens_seen": 56672, |
| "step": 90, |
| "train_runtime": 25.9484, |
| "train_tokens_per_second": 2184.027 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 2.9336965084075928, |
| "learning_rate": 4.12280701754386e-05, |
| "loss": 1.1794, |
| "num_input_tokens_seen": 59296, |
| "step": 95, |
| "train_runtime": 26.8826, |
| "train_tokens_per_second": 2205.741 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 3.761521339416504, |
| "learning_rate": 4.342105263157895e-05, |
| "loss": 1.098, |
| "num_input_tokens_seen": 62528, |
| "step": 100, |
| "train_runtime": 28.0143, |
| "train_tokens_per_second": 2232.007 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 2.3180630207061768, |
| "learning_rate": 4.56140350877193e-05, |
| "loss": 0.6296, |
| "num_input_tokens_seen": 65856, |
| "step": 105, |
| "train_runtime": 29.1276, |
| "train_tokens_per_second": 2260.95 |
| }, |
| { |
| "epoch": 1.9298245614035088, |
| "grad_norm": 3.160735607147217, |
| "learning_rate": 4.780701754385965e-05, |
| "loss": 0.7728, |
| "num_input_tokens_seen": 69664, |
| "step": 110, |
| "train_runtime": 30.3849, |
| "train_tokens_per_second": 2292.717 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.47614240646362305, |
| "eval_runtime": 0.5864, |
| "eval_samples_per_second": 42.63, |
| "eval_steps_per_second": 11.936, |
| "num_input_tokens_seen": 72112, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.017543859649123, |
| "grad_norm": 2.040980339050293, |
| "learning_rate": 5e-05, |
| "loss": 0.7205, |
| "num_input_tokens_seen": 72848, |
| "step": 115, |
| "train_runtime": 33.2194, |
| "train_tokens_per_second": 2192.937 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 2.362532615661621, |
| "learning_rate": 4.999707014206475e-05, |
| "loss": 0.4737, |
| "num_input_tokens_seen": 75888, |
| "step": 120, |
| "train_runtime": 34.3036, |
| "train_tokens_per_second": 2212.246 |
| }, |
| { |
| "epoch": 2.192982456140351, |
| "grad_norm": 0.6496001482009888, |
| "learning_rate": 4.9988281254984414e-05, |
| "loss": 0.2419, |
| "num_input_tokens_seen": 78896, |
| "step": 125, |
| "train_runtime": 35.3364, |
| "train_tokens_per_second": 2232.71 |
| }, |
| { |
| "epoch": 2.280701754385965, |
| "grad_norm": 0.8470844626426697, |
| "learning_rate": 4.997363539877422e-05, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 82288, |
| "step": 130, |
| "train_runtime": 36.469, |
| "train_tokens_per_second": 2256.384 |
| }, |
| { |
| "epoch": 2.3684210526315788, |
| "grad_norm": 2.7049901485443115, |
| "learning_rate": 4.9953136006256415e-05, |
| "loss": 0.3056, |
| "num_input_tokens_seen": 85936, |
| "step": 135, |
| "train_runtime": 37.6794, |
| "train_tokens_per_second": 2280.717 |
| }, |
| { |
| "epoch": 2.456140350877193, |
| "grad_norm": 0.4993942081928253, |
| "learning_rate": 4.9926787882255636e-05, |
| "loss": 0.46, |
| "num_input_tokens_seen": 88784, |
| "step": 140, |
| "train_runtime": 38.6737, |
| "train_tokens_per_second": 2295.723 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "grad_norm": 1.035400390625, |
| "learning_rate": 4.9894597202472696e-05, |
| "loss": 0.2249, |
| "num_input_tokens_seen": 91696, |
| "step": 145, |
| "train_runtime": 39.6938, |
| "train_tokens_per_second": 2310.085 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 1.3134247064590454, |
| "learning_rate": 4.985657151203706e-05, |
| "loss": 0.1069, |
| "num_input_tokens_seen": 95024, |
| "step": 150, |
| "train_runtime": 40.8049, |
| "train_tokens_per_second": 2328.738 |
| }, |
| { |
| "epoch": 2.719298245614035, |
| "grad_norm": 0.3608320653438568, |
| "learning_rate": 4.9812719723738435e-05, |
| "loss": 0.1625, |
| "num_input_tokens_seen": 98512, |
| "step": 155, |
| "train_runtime": 41.9799, |
| "train_tokens_per_second": 2346.65 |
| }, |
| { |
| "epoch": 2.807017543859649, |
| "grad_norm": 0.513091504573822, |
| "learning_rate": 4.976305211593758e-05, |
| "loss": 0.1937, |
| "num_input_tokens_seen": 101744, |
| "step": 160, |
| "train_runtime": 43.0862, |
| "train_tokens_per_second": 2361.407 |
| }, |
| { |
| "epoch": 2.8947368421052633, |
| "grad_norm": 0.9133481979370117, |
| "learning_rate": 4.970758033015731e-05, |
| "loss": 0.1252, |
| "num_input_tokens_seen": 105264, |
| "step": 165, |
| "train_runtime": 44.2516, |
| "train_tokens_per_second": 2378.76 |
| }, |
| { |
| "epoch": 2.982456140350877, |
| "grad_norm": 0.6826978921890259, |
| "learning_rate": 4.9646317368353743e-05, |
| "loss": 0.1341, |
| "num_input_tokens_seen": 108592, |
| "step": 170, |
| "train_runtime": 45.3979, |
| "train_tokens_per_second": 2392.003 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.1234603002667427, |
| "eval_runtime": 0.5871, |
| "eval_samples_per_second": 42.579, |
| "eval_steps_per_second": 11.922, |
| "num_input_tokens_seen": 108712, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.0701754385964914, |
| "grad_norm": 1.365899920463562, |
| "learning_rate": 4.957927758986888e-05, |
| "loss": 0.6538, |
| "num_input_tokens_seen": 111976, |
| "step": 175, |
| "train_runtime": 48.1579, |
| "train_tokens_per_second": 2325.186 |
| }, |
| { |
| "epoch": 3.1578947368421053, |
| "grad_norm": 0.25882503390312195, |
| "learning_rate": 4.9506476708064865e-05, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 114568, |
| "step": 180, |
| "train_runtime": 49.0895, |
| "train_tokens_per_second": 2333.859 |
| }, |
| { |
| "epoch": 3.245614035087719, |
| "grad_norm": 0.8045045733451843, |
| "learning_rate": 4.9427931786641e-05, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 117384, |
| "step": 185, |
| "train_runtime": 50.0762, |
| "train_tokens_per_second": 2344.106 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 1.152528166770935, |
| "learning_rate": 4.93436612356342e-05, |
| "loss": 0.1629, |
| "num_input_tokens_seen": 120968, |
| "step": 190, |
| "train_runtime": 51.2816, |
| "train_tokens_per_second": 2358.898 |
| }, |
| { |
| "epoch": 3.4210526315789473, |
| "grad_norm": 2.5998055934906006, |
| "learning_rate": 4.925368480710385e-05, |
| "loss": 0.1379, |
| "num_input_tokens_seen": 124616, |
| "step": 195, |
| "train_runtime": 52.4735, |
| "train_tokens_per_second": 2374.837 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 0.23191165924072266, |
| "learning_rate": 4.915802359050222e-05, |
| "loss": 0.2213, |
| "num_input_tokens_seen": 128104, |
| "step": 200, |
| "train_runtime": 53.658, |
| "train_tokens_per_second": 2387.417 |
| }, |
| { |
| "epoch": 3.5964912280701755, |
| "grad_norm": 0.7694133520126343, |
| "learning_rate": 4.905670000773126e-05, |
| "loss": 0.1228, |
| "num_input_tokens_seen": 131528, |
| "step": 205, |
| "train_runtime": 54.7944, |
| "train_tokens_per_second": 2400.393 |
| }, |
| { |
| "epoch": 3.6842105263157894, |
| "grad_norm": 3.1285037994384766, |
| "learning_rate": 4.894973780788722e-05, |
| "loss": 0.2468, |
| "num_input_tokens_seen": 134600, |
| "step": 210, |
| "train_runtime": 55.8504, |
| "train_tokens_per_second": 2410.011 |
| }, |
| { |
| "epoch": 3.7719298245614032, |
| "grad_norm": 0.4966896176338196, |
| "learning_rate": 4.88371620616941e-05, |
| "loss": 0.1399, |
| "num_input_tokens_seen": 137896, |
| "step": 215, |
| "train_runtime": 56.9513, |
| "train_tokens_per_second": 2421.296 |
| }, |
| { |
| "epoch": 3.8596491228070176, |
| "grad_norm": 0.8622232675552368, |
| "learning_rate": 4.871899915562736e-05, |
| "loss": 0.1856, |
| "num_input_tokens_seen": 141064, |
| "step": 220, |
| "train_runtime": 58.0238, |
| "train_tokens_per_second": 2431.141 |
| }, |
| { |
| "epoch": 3.9473684210526314, |
| "grad_norm": 0.5208907723426819, |
| "learning_rate": 4.8595276785729236e-05, |
| "loss": 0.0575, |
| "num_input_tokens_seen": 144104, |
| "step": 225, |
| "train_runtime": 59.0531, |
| "train_tokens_per_second": 2440.242 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.12084977328777313, |
| "eval_runtime": 0.588, |
| "eval_samples_per_second": 42.516, |
| "eval_steps_per_second": 11.905, |
| "num_input_tokens_seen": 145296, |
| "step": 228 |
| }, |
| { |
| "epoch": 4.035087719298246, |
| "grad_norm": 2.5595507621765137, |
| "learning_rate": 4.846602395111711e-05, |
| "loss": 0.2, |
| "num_input_tokens_seen": 146576, |
| "step": 230, |
| "train_runtime": 61.6276, |
| "train_tokens_per_second": 2378.416 |
| }, |
| { |
| "epoch": 4.12280701754386, |
| "grad_norm": 2.71787166595459, |
| "learning_rate": 4.833127094718643e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 149456, |
| "step": 235, |
| "train_runtime": 62.6238, |
| "train_tokens_per_second": 2386.567 |
| }, |
| { |
| "epoch": 4.2105263157894735, |
| "grad_norm": 0.4318540692329407, |
| "learning_rate": 4.819104935850983e-05, |
| "loss": 0.1248, |
| "num_input_tokens_seen": 152496, |
| "step": 240, |
| "train_runtime": 63.7523, |
| "train_tokens_per_second": 2392.006 |
| }, |
| { |
| "epoch": 4.298245614035087, |
| "grad_norm": 0.40779757499694824, |
| "learning_rate": 4.804539205143405e-05, |
| "loss": 0.1094, |
| "num_input_tokens_seen": 155504, |
| "step": 245, |
| "train_runtime": 64.7945, |
| "train_tokens_per_second": 2399.955 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 0.5033511519432068, |
| "learning_rate": 4.789433316637644e-05, |
| "loss": 0.2048, |
| "num_input_tokens_seen": 158640, |
| "step": 250, |
| "train_runtime": 65.8462, |
| "train_tokens_per_second": 2409.25 |
| }, |
| { |
| "epoch": 4.473684210526316, |
| "grad_norm": 0.175580233335495, |
| "learning_rate": 4.7737908109822854e-05, |
| "loss": 0.1292, |
| "num_input_tokens_seen": 161296, |
| "step": 255, |
| "train_runtime": 66.7852, |
| "train_tokens_per_second": 2415.145 |
| }, |
| { |
| "epoch": 4.56140350877193, |
| "grad_norm": 0.5260064601898193, |
| "learning_rate": 4.757615354602874e-05, |
| "loss": 0.3014, |
| "num_input_tokens_seen": 164656, |
| "step": 260, |
| "train_runtime": 67.92, |
| "train_tokens_per_second": 2424.263 |
| }, |
| { |
| "epoch": 4.649122807017544, |
| "grad_norm": 0.22309179604053497, |
| "learning_rate": 4.7409107388425504e-05, |
| "loss": 0.1858, |
| "num_input_tokens_seen": 168240, |
| "step": 265, |
| "train_runtime": 69.1058, |
| "train_tokens_per_second": 2434.528 |
| }, |
| { |
| "epoch": 4.7368421052631575, |
| "grad_norm": 1.1554216146469116, |
| "learning_rate": 4.723680879073396e-05, |
| "loss": 0.0617, |
| "num_input_tokens_seen": 171568, |
| "step": 270, |
| "train_runtime": 70.2239, |
| "train_tokens_per_second": 2443.156 |
| }, |
| { |
| "epoch": 4.824561403508772, |
| "grad_norm": 0.8314106464385986, |
| "learning_rate": 4.70592981377872e-05, |
| "loss": 0.0912, |
| "num_input_tokens_seen": 174320, |
| "step": 275, |
| "train_runtime": 71.1856, |
| "train_tokens_per_second": 2448.81 |
| }, |
| { |
| "epoch": 4.912280701754386, |
| "grad_norm": 0.33456218242645264, |
| "learning_rate": 4.6876617036064844e-05, |
| "loss": 0.0936, |
| "num_input_tokens_seen": 178448, |
| "step": 280, |
| "train_runtime": 72.5198, |
| "train_tokens_per_second": 2460.681 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.03806276246905327, |
| "learning_rate": 4.668880830394093e-05, |
| "loss": 0.0977, |
| "num_input_tokens_seen": 181408, |
| "step": 285, |
| "train_runtime": 73.6465, |
| "train_tokens_per_second": 2463.226 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.11310870945453644, |
| "eval_runtime": 0.5808, |
| "eval_samples_per_second": 43.044, |
| "eval_steps_per_second": 12.052, |
| "num_input_tokens_seen": 181408, |
| "step": 285 |
| }, |
| { |
| "epoch": 5.087719298245614, |
| "grad_norm": 0.6238030195236206, |
| "learning_rate": 4.649591596164778e-05, |
| "loss": 0.0658, |
| "num_input_tokens_seen": 184352, |
| "step": 290, |
| "train_runtime": 76.2491, |
| "train_tokens_per_second": 2417.761 |
| }, |
| { |
| "epoch": 5.175438596491228, |
| "grad_norm": 0.5384188890457153, |
| "learning_rate": 4.629798522095818e-05, |
| "loss": 0.0977, |
| "num_input_tokens_seen": 187680, |
| "step": 295, |
| "train_runtime": 77.4202, |
| "train_tokens_per_second": 2424.174 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 0.6808844804763794, |
| "learning_rate": 4.6095062474588225e-05, |
| "loss": 0.1278, |
| "num_input_tokens_seen": 190880, |
| "step": 300, |
| "train_runtime": 78.5301, |
| "train_tokens_per_second": 2430.66 |
| }, |
| { |
| "epoch": 5.350877192982456, |
| "grad_norm": 0.35950031876564026, |
| "learning_rate": 4.588719528532342e-05, |
| "loss": 0.1046, |
| "num_input_tokens_seen": 194592, |
| "step": 305, |
| "train_runtime": 79.752, |
| "train_tokens_per_second": 2439.965 |
| }, |
| { |
| "epoch": 5.43859649122807, |
| "grad_norm": 0.3269582688808441, |
| "learning_rate": 4.5674432374870455e-05, |
| "loss": 0.075, |
| "num_input_tokens_seen": 198720, |
| "step": 310, |
| "train_runtime": 81.0899, |
| "train_tokens_per_second": 2450.614 |
| }, |
| { |
| "epoch": 5.526315789473684, |
| "grad_norm": 0.5484315752983093, |
| "learning_rate": 4.545682361243748e-05, |
| "loss": 0.379, |
| "num_input_tokens_seen": 201856, |
| "step": 315, |
| "train_runtime": 82.1892, |
| "train_tokens_per_second": 2455.991 |
| }, |
| { |
| "epoch": 5.614035087719298, |
| "grad_norm": 0.9835972189903259, |
| "learning_rate": 4.5234420003045236e-05, |
| "loss": 0.1695, |
| "num_input_tokens_seen": 204800, |
| "step": 320, |
| "train_runtime": 83.2065, |
| "train_tokens_per_second": 2461.345 |
| }, |
| { |
| "epoch": 5.701754385964913, |
| "grad_norm": 1.7251195907592773, |
| "learning_rate": 4.5007273675572104e-05, |
| "loss": 0.2308, |
| "num_input_tokens_seen": 208064, |
| "step": 325, |
| "train_runtime": 84.3235, |
| "train_tokens_per_second": 2467.45 |
| }, |
| { |
| "epoch": 5.7894736842105265, |
| "grad_norm": 1.3619377613067627, |
| "learning_rate": 4.4775437870535685e-05, |
| "loss": 0.1602, |
| "num_input_tokens_seen": 211168, |
| "step": 330, |
| "train_runtime": 85.3782, |
| "train_tokens_per_second": 2473.325 |
| }, |
| { |
| "epoch": 5.87719298245614, |
| "grad_norm": 0.35113874077796936, |
| "learning_rate": 4.4538966927613836e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 214144, |
| "step": 335, |
| "train_runtime": 86.4216, |
| "train_tokens_per_second": 2477.9 |
| }, |
| { |
| "epoch": 5.964912280701754, |
| "grad_norm": 0.24905243515968323, |
| "learning_rate": 4.4297916272908024e-05, |
| "loss": 0.1258, |
| "num_input_tokens_seen": 217216, |
| "step": 340, |
| "train_runtime": 87.4816, |
| "train_tokens_per_second": 2482.992 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.1171339601278305, |
| "eval_runtime": 0.5892, |
| "eval_samples_per_second": 42.433, |
| "eval_steps_per_second": 11.881, |
| "num_input_tokens_seen": 217760, |
| "step": 342 |
| }, |
| { |
| "epoch": 6.052631578947368, |
| "grad_norm": 0.6464580297470093, |
| "learning_rate": 4.405234240595214e-05, |
| "loss": 0.0566, |
| "num_input_tokens_seen": 219808, |
| "step": 345, |
| "train_runtime": 90.143, |
| "train_tokens_per_second": 2438.438 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 0.3178783357143402, |
| "learning_rate": 4.3802302886469606e-05, |
| "loss": 0.2094, |
| "num_input_tokens_seen": 223232, |
| "step": 350, |
| "train_runtime": 91.3086, |
| "train_tokens_per_second": 2444.807 |
| }, |
| { |
| "epoch": 6.228070175438597, |
| "grad_norm": 0.7545506954193115, |
| "learning_rate": 4.3547856320882044e-05, |
| "loss": 0.206, |
| "num_input_tokens_seen": 227008, |
| "step": 355, |
| "train_runtime": 92.5365, |
| "train_tokens_per_second": 2453.172 |
| }, |
| { |
| "epoch": 6.315789473684211, |
| "grad_norm": 0.19895721971988678, |
| "learning_rate": 4.328906234857259e-05, |
| "loss": 0.1101, |
| "num_input_tokens_seen": 230016, |
| "step": 360, |
| "train_runtime": 93.5746, |
| "train_tokens_per_second": 2458.103 |
| }, |
| { |
| "epoch": 6.4035087719298245, |
| "grad_norm": 1.6584346294403076, |
| "learning_rate": 4.302598162790712e-05, |
| "loss": 0.1999, |
| "num_input_tokens_seen": 233632, |
| "step": 365, |
| "train_runtime": 94.7787, |
| "train_tokens_per_second": 2465.027 |
| }, |
| { |
| "epoch": 6.491228070175438, |
| "grad_norm": 1.0977205038070679, |
| "learning_rate": 4.27586758220166e-05, |
| "loss": 0.1736, |
| "num_input_tokens_seen": 236960, |
| "step": 370, |
| "train_runtime": 95.8823, |
| "train_tokens_per_second": 2471.362 |
| }, |
| { |
| "epoch": 6.578947368421053, |
| "grad_norm": 1.279994249343872, |
| "learning_rate": 4.2487207584343955e-05, |
| "loss": 0.116, |
| "num_input_tokens_seen": 240160, |
| "step": 375, |
| "train_runtime": 96.9751, |
| "train_tokens_per_second": 2476.511 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 1.1907933950424194, |
| "learning_rate": 4.2211640543958796e-05, |
| "loss": 0.1461, |
| "num_input_tokens_seen": 243264, |
| "step": 380, |
| "train_runtime": 98.0739, |
| "train_tokens_per_second": 2480.417 |
| }, |
| { |
| "epoch": 6.754385964912281, |
| "grad_norm": 0.3227393925189972, |
| "learning_rate": 4.193203929064353e-05, |
| "loss": 0.1582, |
| "num_input_tokens_seen": 246560, |
| "step": 385, |
| "train_runtime": 99.1776, |
| "train_tokens_per_second": 2486.046 |
| }, |
| { |
| "epoch": 6.842105263157895, |
| "grad_norm": 0.8687011003494263, |
| "learning_rate": 4.164846935975421e-05, |
| "loss": 0.1395, |
| "num_input_tokens_seen": 249408, |
| "step": 390, |
| "train_runtime": 100.1683, |
| "train_tokens_per_second": 2489.89 |
| }, |
| { |
| "epoch": 6.9298245614035086, |
| "grad_norm": 0.2596314251422882, |
| "learning_rate": 4.136099721685983e-05, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 252192, |
| "step": 395, |
| "train_runtime": 101.1549, |
| "train_tokens_per_second": 2493.126 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.11278051882982254, |
| "eval_runtime": 0.585, |
| "eval_samples_per_second": 42.735, |
| "eval_steps_per_second": 11.966, |
| "num_input_tokens_seen": 254568, |
| "step": 399 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 0.154341921210289, |
| "learning_rate": 4.1069690242163484e-05, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 255432, |
| "step": 400, |
| "train_runtime": 103.8416, |
| "train_tokens_per_second": 2459.823 |
| }, |
| { |
| "epoch": 7.105263157894737, |
| "grad_norm": 0.5618856549263, |
| "learning_rate": 4.0774616714709316e-05, |
| "loss": 0.1658, |
| "num_input_tokens_seen": 258408, |
| "step": 405, |
| "train_runtime": 104.9452, |
| "train_tokens_per_second": 2462.314 |
| }, |
| { |
| "epoch": 7.192982456140351, |
| "grad_norm": 0.22434987127780914, |
| "learning_rate": 4.047584579637857e-05, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 262024, |
| "step": 410, |
| "train_runtime": 106.1142, |
| "train_tokens_per_second": 2469.264 |
| }, |
| { |
| "epoch": 7.280701754385965, |
| "grad_norm": 0.10421310365200043, |
| "learning_rate": 4.0173447515678916e-05, |
| "loss": 0.0636, |
| "num_input_tokens_seen": 265032, |
| "step": 415, |
| "train_runtime": 107.1595, |
| "train_tokens_per_second": 2473.248 |
| }, |
| { |
| "epoch": 7.368421052631579, |
| "grad_norm": 3.0621981620788574, |
| "learning_rate": 3.986749275133057e-05, |
| "loss": 0.3187, |
| "num_input_tokens_seen": 267656, |
| "step": 420, |
| "train_runtime": 108.1184, |
| "train_tokens_per_second": 2475.583 |
| }, |
| { |
| "epoch": 7.456140350877193, |
| "grad_norm": 0.2600061297416687, |
| "learning_rate": 3.955805321565304e-05, |
| "loss": 0.0868, |
| "num_input_tokens_seen": 271592, |
| "step": 425, |
| "train_runtime": 109.3564, |
| "train_tokens_per_second": 2483.548 |
| }, |
| { |
| "epoch": 7.543859649122807, |
| "grad_norm": 0.14819173514842987, |
| "learning_rate": 3.9245201437756654e-05, |
| "loss": 0.0824, |
| "num_input_tokens_seen": 275016, |
| "step": 430, |
| "train_runtime": 110.5238, |
| "train_tokens_per_second": 2488.296 |
| }, |
| { |
| "epoch": 7.631578947368421, |
| "grad_norm": 0.38911575078964233, |
| "learning_rate": 3.892901074654255e-05, |
| "loss": 0.1303, |
| "num_input_tokens_seen": 278248, |
| "step": 435, |
| "train_runtime": 111.6434, |
| "train_tokens_per_second": 2492.293 |
| }, |
| { |
| "epoch": 7.719298245614035, |
| "grad_norm": 0.18203085660934448, |
| "learning_rate": 3.860955525351516e-05, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 281320, |
| "step": 440, |
| "train_runtime": 112.6743, |
| "train_tokens_per_second": 2496.753 |
| }, |
| { |
| "epoch": 7.807017543859649, |
| "grad_norm": 0.27270665764808655, |
| "learning_rate": 3.82869098354114e-05, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 284360, |
| "step": 445, |
| "train_runtime": 113.7167, |
| "train_tokens_per_second": 2500.601 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 1.1799060106277466, |
| "learning_rate": 3.796115011665034e-05, |
| "loss": 0.1744, |
| "num_input_tokens_seen": 287944, |
| "step": 450, |
| "train_runtime": 114.905, |
| "train_tokens_per_second": 2505.931 |
| }, |
| { |
| "epoch": 7.982456140350877, |
| "grad_norm": 1.0825549364089966, |
| "learning_rate": 3.763235245160775e-05, |
| "loss": 0.3172, |
| "num_input_tokens_seen": 290696, |
| "step": 455, |
| "train_runtime": 115.87, |
| "train_tokens_per_second": 2508.811 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.11445556581020355, |
| "eval_runtime": 0.5874, |
| "eval_samples_per_second": 42.564, |
| "eval_steps_per_second": 11.918, |
| "num_input_tokens_seen": 291000, |
| "step": 456 |
| }, |
| { |
| "epoch": 8.070175438596491, |
| "grad_norm": 0.09401629120111465, |
| "learning_rate": 3.7300593906719464e-05, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 293432, |
| "step": 460, |
| "train_runtime": 119.3961, |
| "train_tokens_per_second": 2457.635 |
| }, |
| { |
| "epoch": 8.157894736842104, |
| "grad_norm": 0.689762532711029, |
| "learning_rate": 3.69659522424179e-05, |
| "loss": 0.0841, |
| "num_input_tokens_seen": 296888, |
| "step": 465, |
| "train_runtime": 120.5629, |
| "train_tokens_per_second": 2462.516 |
| }, |
| { |
| "epoch": 8.24561403508772, |
| "grad_norm": 0.9293464422225952, |
| "learning_rate": 3.662850589490592e-05, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 300312, |
| "step": 470, |
| "train_runtime": 121.7003, |
| "train_tokens_per_second": 2467.636 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.3482949435710907, |
| "learning_rate": 3.628833395777224e-05, |
| "loss": 0.1068, |
| "num_input_tokens_seen": 303544, |
| "step": 475, |
| "train_runtime": 122.7941, |
| "train_tokens_per_second": 2471.975 |
| }, |
| { |
| "epoch": 8.421052631578947, |
| "grad_norm": 1.2682899236679077, |
| "learning_rate": 3.59455161634528e-05, |
| "loss": 0.1337, |
| "num_input_tokens_seen": 306872, |
| "step": 480, |
| "train_runtime": 123.9208, |
| "train_tokens_per_second": 2476.356 |
| }, |
| { |
| "epoch": 8.508771929824562, |
| "grad_norm": 0.7482170462608337, |
| "learning_rate": 3.560013286454242e-05, |
| "loss": 0.1132, |
| "num_input_tokens_seen": 309592, |
| "step": 485, |
| "train_runtime": 124.9127, |
| "train_tokens_per_second": 2478.468 |
| }, |
| { |
| "epoch": 8.596491228070175, |
| "grad_norm": 0.5950433015823364, |
| "learning_rate": 3.5252265014961006e-05, |
| "loss": 0.0949, |
| "num_input_tokens_seen": 312728, |
| "step": 490, |
| "train_runtime": 125.9779, |
| "train_tokens_per_second": 2482.403 |
| }, |
| { |
| "epoch": 8.68421052631579, |
| "grad_norm": 1.1345596313476562, |
| "learning_rate": 3.490199415097892e-05, |
| "loss": 0.2452, |
| "num_input_tokens_seen": 316088, |
| "step": 495, |
| "train_runtime": 127.1117, |
| "train_tokens_per_second": 2486.694 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 0.1347583383321762, |
| "learning_rate": 3.45494023721058e-05, |
| "loss": 0.1226, |
| "num_input_tokens_seen": 319352, |
| "step": 500, |
| "train_runtime": 128.2476, |
| "train_tokens_per_second": 2490.121 |
| }, |
| { |
| "epoch": 8.859649122807017, |
| "grad_norm": 0.5796114206314087, |
| "learning_rate": 3.4194572321847336e-05, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 323256, |
| "step": 505, |
| "train_runtime": 129.5565, |
| "train_tokens_per_second": 2495.098 |
| }, |
| { |
| "epoch": 8.947368421052632, |
| "grad_norm": 0.4608415961265564, |
| "learning_rate": 3.383758716833459e-05, |
| "loss": 0.0538, |
| "num_input_tokens_seen": 326200, |
| "step": 510, |
| "train_runtime": 130.551, |
| "train_tokens_per_second": 2498.641 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.11233614385128021, |
| "eval_runtime": 0.5858, |
| "eval_samples_per_second": 42.679, |
| "eval_steps_per_second": 11.95, |
| "num_input_tokens_seen": 327792, |
| "step": 513 |
| }, |
| { |
| "epoch": 9.035087719298245, |
| "grad_norm": 1.1988062858581543, |
| "learning_rate": 3.347853058483037e-05, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 329008, |
| "step": 515, |
| "train_runtime": 133.5482, |
| "train_tokens_per_second": 2463.59 |
| }, |
| { |
| "epoch": 9.12280701754386, |
| "grad_norm": 0.12025721371173859, |
| "learning_rate": 3.311748673011709e-05, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 332304, |
| "step": 520, |
| "train_runtime": 134.7048, |
| "train_tokens_per_second": 2466.905 |
| }, |
| { |
| "epoch": 9.210526315789474, |
| "grad_norm": 1.076543927192688, |
| "learning_rate": 3.275454022877097e-05, |
| "loss": 0.1037, |
| "num_input_tokens_seen": 335792, |
| "step": 525, |
| "train_runtime": 135.8434, |
| "train_tokens_per_second": 2471.906 |
| }, |
| { |
| "epoch": 9.298245614035087, |
| "grad_norm": 0.06491763144731522, |
| "learning_rate": 3.238977615132697e-05, |
| "loss": 0.1265, |
| "num_input_tokens_seen": 338672, |
| "step": 530, |
| "train_runtime": 136.8441, |
| "train_tokens_per_second": 2474.875 |
| }, |
| { |
| "epoch": 9.385964912280702, |
| "grad_norm": 0.24254095554351807, |
| "learning_rate": 3.202327999433924e-05, |
| "loss": 0.0537, |
| "num_input_tokens_seen": 341968, |
| "step": 535, |
| "train_runtime": 137.946, |
| "train_tokens_per_second": 2478.999 |
| }, |
| { |
| "epoch": 9.473684210526315, |
| "grad_norm": 0.9142289161682129, |
| "learning_rate": 3.165513766034167e-05, |
| "loss": 0.18, |
| "num_input_tokens_seen": 345232, |
| "step": 540, |
| "train_runtime": 139.0867, |
| "train_tokens_per_second": 2482.135 |
| }, |
| { |
| "epoch": 9.56140350877193, |
| "grad_norm": 0.5542981028556824, |
| "learning_rate": 3.128543543771336e-05, |
| "loss": 0.0903, |
| "num_input_tokens_seen": 348656, |
| "step": 545, |
| "train_runtime": 140.2208, |
| "train_tokens_per_second": 2486.478 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.9289880394935608, |
| "learning_rate": 3.091425998045356e-05, |
| "loss": 0.363, |
| "num_input_tokens_seen": 351248, |
| "step": 550, |
| "train_runtime": 141.1596, |
| "train_tokens_per_second": 2488.304 |
| }, |
| { |
| "epoch": 9.736842105263158, |
| "grad_norm": 0.8378196954727173, |
| "learning_rate": 3.0541698287870965e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 355024, |
| "step": 555, |
| "train_runtime": 142.4167, |
| "train_tokens_per_second": 2492.854 |
| }, |
| { |
| "epoch": 9.824561403508772, |
| "grad_norm": 0.09416269510984421, |
| "learning_rate": 3.01678376841921e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 357776, |
| "step": 560, |
| "train_runtime": 143.3826, |
| "train_tokens_per_second": 2495.253 |
| }, |
| { |
| "epoch": 9.912280701754385, |
| "grad_norm": 0.16007107496261597, |
| "learning_rate": 2.9792765798093465e-05, |
| "loss": 0.1419, |
| "num_input_tokens_seen": 360784, |
| "step": 565, |
| "train_runtime": 144.4207, |
| "train_tokens_per_second": 2498.145 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.0040969848632812, |
| "learning_rate": 2.94165705421624e-05, |
| "loss": 0.1347, |
| "num_input_tokens_seen": 363864, |
| "step": 570, |
| "train_runtime": 145.5797, |
| "train_tokens_per_second": 2499.414 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.10720973461866379, |
| "eval_runtime": 0.5841, |
| "eval_samples_per_second": 42.802, |
| "eval_steps_per_second": 11.984, |
| "num_input_tokens_seen": 363864, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.087719298245615, |
| "grad_norm": 0.06115713343024254, |
| "learning_rate": 2.9039340092291373e-05, |
| "loss": 0.2521, |
| "num_input_tokens_seen": 368120, |
| "step": 575, |
| "train_runtime": 148.456, |
| "train_tokens_per_second": 2479.657 |
| }, |
| { |
| "epoch": 10.175438596491228, |
| "grad_norm": 0.9393717050552368, |
| "learning_rate": 2.8661162867010543e-05, |
| "loss": 0.1127, |
| "num_input_tokens_seen": 371416, |
| "step": 580, |
| "train_runtime": 149.5706, |
| "train_tokens_per_second": 2483.216 |
| }, |
| { |
| "epoch": 10.263157894736842, |
| "grad_norm": 0.436775803565979, |
| "learning_rate": 2.8282127506763456e-05, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 374168, |
| "step": 585, |
| "train_runtime": 150.5321, |
| "train_tokens_per_second": 2485.637 |
| }, |
| { |
| "epoch": 10.350877192982455, |
| "grad_norm": 0.4993436634540558, |
| "learning_rate": 2.7902322853130757e-05, |
| "loss": 0.0503, |
| "num_input_tokens_seen": 377496, |
| "step": 590, |
| "train_runtime": 151.6608, |
| "train_tokens_per_second": 2489.08 |
| }, |
| { |
| "epoch": 10.43859649122807, |
| "grad_norm": 2.0963611602783203, |
| "learning_rate": 2.752183792800671e-05, |
| "loss": 0.256, |
| "num_input_tokens_seen": 381592, |
| "step": 595, |
| "train_runtime": 152.9709, |
| "train_tokens_per_second": 2494.539 |
| }, |
| { |
| "epoch": 10.526315789473685, |
| "grad_norm": 0.2084749937057495, |
| "learning_rate": 2.7140761912733474e-05, |
| "loss": 0.0612, |
| "num_input_tokens_seen": 384760, |
| "step": 600, |
| "train_runtime": 154.0824, |
| "train_tokens_per_second": 2497.105 |
| }, |
| { |
| "epoch": 10.614035087719298, |
| "grad_norm": 0.20247477293014526, |
| "learning_rate": 2.6759184127198046e-05, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 387512, |
| "step": 605, |
| "train_runtime": 155.0647, |
| "train_tokens_per_second": 2499.035 |
| }, |
| { |
| "epoch": 10.701754385964913, |
| "grad_norm": 1.0483530759811401, |
| "learning_rate": 2.6377194008896637e-05, |
| "loss": 0.082, |
| "num_input_tokens_seen": 390680, |
| "step": 610, |
| "train_runtime": 156.1362, |
| "train_tokens_per_second": 2502.175 |
| }, |
| { |
| "epoch": 10.789473684210526, |
| "grad_norm": 0.9714593291282654, |
| "learning_rate": 2.5994881091971605e-05, |
| "loss": 0.1216, |
| "num_input_tokens_seen": 393944, |
| "step": 615, |
| "train_runtime": 157.2556, |
| "train_tokens_per_second": 2505.119 |
| }, |
| { |
| "epoch": 10.87719298245614, |
| "grad_norm": 1.19658362865448, |
| "learning_rate": 2.5612334986225623e-05, |
| "loss": 0.12, |
| "num_input_tokens_seen": 396632, |
| "step": 620, |
| "train_runtime": 158.2119, |
| "train_tokens_per_second": 2506.967 |
| }, |
| { |
| "epoch": 10.964912280701755, |
| "grad_norm": 0.9653410911560059, |
| "learning_rate": 2.5229645356118163e-05, |
| "loss": 0.1298, |
| "num_input_tokens_seen": 399288, |
| "step": 625, |
| "train_runtime": 159.1947, |
| "train_tokens_per_second": 2508.173 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.10691339522600174, |
| "eval_runtime": 0.587, |
| "eval_samples_per_second": 42.586, |
| "eval_steps_per_second": 11.924, |
| "num_input_tokens_seen": 400104, |
| "step": 627 |
| }, |
| { |
| "epoch": 11.052631578947368, |
| "grad_norm": 0.33754128217697144, |
| "learning_rate": 2.4846901899749185e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 401928, |
| "step": 630, |
| "train_runtime": 162.1813, |
| "train_tokens_per_second": 2478.263 |
| }, |
| { |
| "epoch": 11.140350877192983, |
| "grad_norm": 0.5505395531654358, |
| "learning_rate": 2.4464194327834926e-05, |
| "loss": 0.1293, |
| "num_input_tokens_seen": 404360, |
| "step": 635, |
| "train_runtime": 163.1461, |
| "train_tokens_per_second": 2478.515 |
| }, |
| { |
| "epoch": 11.228070175438596, |
| "grad_norm": 0.8922757506370544, |
| "learning_rate": 2.4081612342680694e-05, |
| "loss": 0.0894, |
| "num_input_tokens_seen": 407496, |
| "step": 640, |
| "train_runtime": 164.2456, |
| "train_tokens_per_second": 2481.016 |
| }, |
| { |
| "epoch": 11.31578947368421, |
| "grad_norm": 0.22152556478977203, |
| "learning_rate": 2.369924561715569e-05, |
| "loss": 0.0581, |
| "num_input_tokens_seen": 410760, |
| "step": 645, |
| "train_runtime": 165.3215, |
| "train_tokens_per_second": 2484.613 |
| }, |
| { |
| "epoch": 11.403508771929825, |
| "grad_norm": 1.2716141939163208, |
| "learning_rate": 2.3317183773674718e-05, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 413832, |
| "step": 650, |
| "train_runtime": 166.3533, |
| "train_tokens_per_second": 2487.67 |
| }, |
| { |
| "epoch": 11.491228070175438, |
| "grad_norm": 0.23000480234622955, |
| "learning_rate": 2.2935516363191693e-05, |
| "loss": 0.1275, |
| "num_input_tokens_seen": 417320, |
| "step": 655, |
| "train_runtime": 167.5146, |
| "train_tokens_per_second": 2491.246 |
| }, |
| { |
| "epoch": 11.578947368421053, |
| "grad_norm": 0.5997104644775391, |
| "learning_rate": 2.2554332844209904e-05, |
| "loss": 0.0887, |
| "num_input_tokens_seen": 420232, |
| "step": 660, |
| "train_runtime": 168.5486, |
| "train_tokens_per_second": 2493.239 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 0.38558098673820496, |
| "learning_rate": 2.2173722561813987e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 423528, |
| "step": 665, |
| "train_runtime": 169.6363, |
| "train_tokens_per_second": 2496.683 |
| }, |
| { |
| "epoch": 11.75438596491228, |
| "grad_norm": 0.6237869262695312, |
| "learning_rate": 2.179377472672842e-05, |
| "loss": 0.1143, |
| "num_input_tokens_seen": 426920, |
| "step": 670, |
| "train_runtime": 170.7838, |
| "train_tokens_per_second": 2499.768 |
| }, |
| { |
| "epoch": 11.842105263157894, |
| "grad_norm": 0.6095879673957825, |
| "learning_rate": 2.1414578394407597e-05, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 431016, |
| "step": 675, |
| "train_runtime": 172.1147, |
| "train_tokens_per_second": 2504.236 |
| }, |
| { |
| "epoch": 11.929824561403509, |
| "grad_norm": 2.0376551151275635, |
| "learning_rate": 2.1036222444162147e-05, |
| "loss": 0.2924, |
| "num_input_tokens_seen": 434600, |
| "step": 680, |
| "train_runtime": 173.3197, |
| "train_tokens_per_second": 2507.506 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.10777657479047775, |
| "eval_runtime": 0.5862, |
| "eval_samples_per_second": 42.649, |
| "eval_steps_per_second": 11.942, |
| "num_input_tokens_seen": 436440, |
| "step": 684 |
| }, |
| { |
| "epoch": 12.017543859649123, |
| "grad_norm": 0.6392739415168762, |
| "learning_rate": 2.0658795558326743e-05, |
| "loss": 0.0623, |
| "num_input_tokens_seen": 437016, |
| "step": 685, |
| "train_runtime": 175.7733, |
| "train_tokens_per_second": 2486.249 |
| }, |
| { |
| "epoch": 12.105263157894736, |
| "grad_norm": 0.3666384518146515, |
| "learning_rate": 2.0282386201473894e-05, |
| "loss": 0.041, |
| "num_input_tokens_seen": 440280, |
| "step": 690, |
| "train_runtime": 176.8869, |
| "train_tokens_per_second": 2489.048 |
| }, |
| { |
| "epoch": 12.192982456140351, |
| "grad_norm": 0.843946635723114, |
| "learning_rate": 1.99070825996789e-05, |
| "loss": 0.0815, |
| "num_input_tokens_seen": 443800, |
| "step": 695, |
| "train_runtime": 178.0941, |
| "train_tokens_per_second": 2491.941 |
| }, |
| { |
| "epoch": 12.280701754385966, |
| "grad_norm": 0.1081356629729271, |
| "learning_rate": 1.9532972719840607e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 446168, |
| "step": 700, |
| "train_runtime": 178.9971, |
| "train_tokens_per_second": 2492.599 |
| }, |
| { |
| "epoch": 12.368421052631579, |
| "grad_norm": 1.2140501737594604, |
| "learning_rate": 1.9160144249063035e-05, |
| "loss": 0.4138, |
| "num_input_tokens_seen": 449560, |
| "step": 705, |
| "train_runtime": 180.1421, |
| "train_tokens_per_second": 2495.586 |
| }, |
| { |
| "epoch": 12.456140350877194, |
| "grad_norm": 0.10039330273866653, |
| "learning_rate": 1.8788684574102467e-05, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 452824, |
| "step": 710, |
| "train_runtime": 181.246, |
| "train_tokens_per_second": 2498.394 |
| }, |
| { |
| "epoch": 12.543859649122806, |
| "grad_norm": 1.3357945680618286, |
| "learning_rate": 1.8418680760885027e-05, |
| "loss": 0.166, |
| "num_input_tokens_seen": 456120, |
| "step": 715, |
| "train_runtime": 182.3751, |
| "train_tokens_per_second": 2500.999 |
| }, |
| { |
| "epoch": 12.631578947368421, |
| "grad_norm": 0.7184349894523621, |
| "learning_rate": 1.805021953409934e-05, |
| "loss": 0.1237, |
| "num_input_tokens_seen": 459736, |
| "step": 720, |
| "train_runtime": 183.5708, |
| "train_tokens_per_second": 2504.407 |
| }, |
| { |
| "epoch": 12.719298245614034, |
| "grad_norm": 0.8796572089195251, |
| "learning_rate": 1.7683387256869353e-05, |
| "loss": 0.1099, |
| "num_input_tokens_seen": 462584, |
| "step": 725, |
| "train_runtime": 184.56, |
| "train_tokens_per_second": 2506.415 |
| }, |
| { |
| "epoch": 12.807017543859649, |
| "grad_norm": 0.0641786977648735, |
| "learning_rate": 1.7318269910511736e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 465688, |
| "step": 730, |
| "train_runtime": 185.6328, |
| "train_tokens_per_second": 2508.652 |
| }, |
| { |
| "epoch": 12.894736842105264, |
| "grad_norm": 0.7770268321037292, |
| "learning_rate": 1.6954953074382863e-05, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 468856, |
| "step": 735, |
| "train_runtime": 186.7038, |
| "train_tokens_per_second": 2511.229 |
| }, |
| { |
| "epoch": 12.982456140350877, |
| "grad_norm": 0.7374283075332642, |
| "learning_rate": 1.659352190581993e-05, |
| "loss": 0.1389, |
| "num_input_tokens_seen": 471768, |
| "step": 740, |
| "train_runtime": 187.7037, |
| "train_tokens_per_second": 2513.365 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.10621943324804306, |
| "eval_runtime": 0.5863, |
| "eval_samples_per_second": 42.641, |
| "eval_steps_per_second": 11.94, |
| "num_input_tokens_seen": 471944, |
| "step": 741 |
| }, |
| { |
| "epoch": 13.070175438596491, |
| "grad_norm": 0.05659092217683792, |
| "learning_rate": 1.6234061120181142e-05, |
| "loss": 0.2374, |
| "num_input_tokens_seen": 474344, |
| "step": 745, |
| "train_runtime": 190.9684, |
| "train_tokens_per_second": 2483.888 |
| }, |
| { |
| "epoch": 13.157894736842104, |
| "grad_norm": 1.235370397567749, |
| "learning_rate": 1.5876654970989308e-05, |
| "loss": 0.1211, |
| "num_input_tokens_seen": 478120, |
| "step": 750, |
| "train_runtime": 192.2311, |
| "train_tokens_per_second": 2487.215 |
| }, |
| { |
| "epoch": 13.24561403508772, |
| "grad_norm": 0.5860034227371216, |
| "learning_rate": 1.552138723018382e-05, |
| "loss": 0.1409, |
| "num_input_tokens_seen": 480840, |
| "step": 755, |
| "train_runtime": 193.1775, |
| "train_tokens_per_second": 2489.109 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.6669791340827942, |
| "learning_rate": 1.5168341168485423e-05, |
| "loss": 0.1713, |
| "num_input_tokens_seen": 484552, |
| "step": 760, |
| "train_runtime": 194.4098, |
| "train_tokens_per_second": 2492.425 |
| }, |
| { |
| "epoch": 13.421052631578947, |
| "grad_norm": 0.3427886366844177, |
| "learning_rate": 1.4817599535878565e-05, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 487304, |
| "step": 765, |
| "train_runtime": 195.3955, |
| "train_tokens_per_second": 2493.936 |
| }, |
| { |
| "epoch": 13.508771929824562, |
| "grad_norm": 0.4120057225227356, |
| "learning_rate": 1.4469244542215682e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 490536, |
| "step": 770, |
| "train_runtime": 196.4949, |
| "train_tokens_per_second": 2496.431 |
| }, |
| { |
| "epoch": 13.596491228070175, |
| "grad_norm": 0.5584914684295654, |
| "learning_rate": 1.4123357837948175e-05, |
| "loss": 0.0849, |
| "num_input_tokens_seen": 493128, |
| "step": 775, |
| "train_runtime": 197.4396, |
| "train_tokens_per_second": 2497.614 |
| }, |
| { |
| "epoch": 13.68421052631579, |
| "grad_norm": 0.942362368106842, |
| "learning_rate": 1.3780020494988446e-05, |
| "loss": 0.2508, |
| "num_input_tokens_seen": 496136, |
| "step": 780, |
| "train_runtime": 198.4693, |
| "train_tokens_per_second": 2499.813 |
| }, |
| { |
| "epoch": 13.771929824561404, |
| "grad_norm": 0.29723063111305237, |
| "learning_rate": 1.3439312987707615e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 499432, |
| "step": 785, |
| "train_runtime": 199.6161, |
| "train_tokens_per_second": 2501.962 |
| }, |
| { |
| "epoch": 13.859649122807017, |
| "grad_norm": 1.1332062482833862, |
| "learning_rate": 1.3101315174073162e-05, |
| "loss": 0.1816, |
| "num_input_tokens_seen": 502984, |
| "step": 790, |
| "train_runtime": 200.7801, |
| "train_tokens_per_second": 2505.149 |
| }, |
| { |
| "epoch": 13.947368421052632, |
| "grad_norm": 0.8299708962440491, |
| "learning_rate": 1.2766106276931223e-05, |
| "loss": 0.0521, |
| "num_input_tokens_seen": 506376, |
| "step": 795, |
| "train_runtime": 201.9139, |
| "train_tokens_per_second": 2507.881 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.10609572380781174, |
| "eval_runtime": 0.5874, |
| "eval_samples_per_second": 42.562, |
| "eval_steps_per_second": 11.917, |
| "num_input_tokens_seen": 508424, |
| "step": 798 |
| }, |
| { |
| "epoch": 14.035087719298245, |
| "grad_norm": 1.2610480785369873, |
| "learning_rate": 1.243376486543755e-05, |
| "loss": 0.1066, |
| "num_input_tokens_seen": 509672, |
| "step": 800, |
| "train_runtime": 204.608, |
| "train_tokens_per_second": 2490.968 |
| }, |
| { |
| "epoch": 14.12280701754386, |
| "grad_norm": 0.10829579085111618, |
| "learning_rate": 1.2104368836641908e-05, |
| "loss": 0.1141, |
| "num_input_tokens_seen": 513320, |
| "step": 805, |
| "train_runtime": 205.8216, |
| "train_tokens_per_second": 2494.004 |
| }, |
| { |
| "epoch": 14.210526315789474, |
| "grad_norm": 0.283528596162796, |
| "learning_rate": 1.1777995397229771e-05, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 516296, |
| "step": 810, |
| "train_runtime": 206.8214, |
| "train_tokens_per_second": 2496.337 |
| }, |
| { |
| "epoch": 14.298245614035087, |
| "grad_norm": 1.0561656951904297, |
| "learning_rate": 1.1454721045426073e-05, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 520040, |
| "step": 815, |
| "train_runtime": 208.0835, |
| "train_tokens_per_second": 2499.189 |
| }, |
| { |
| "epoch": 14.385964912280702, |
| "grad_norm": 0.39123740792274475, |
| "learning_rate": 1.113462155306478e-05, |
| "loss": 0.0417, |
| "num_input_tokens_seen": 523272, |
| "step": 820, |
| "train_runtime": 209.1813, |
| "train_tokens_per_second": 2501.524 |
| }, |
| { |
| "epoch": 14.473684210526315, |
| "grad_norm": 0.12004759907722473, |
| "learning_rate": 1.0817771947828934e-05, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 527112, |
| "step": 825, |
| "train_runtime": 210.4461, |
| "train_tokens_per_second": 2504.736 |
| }, |
| { |
| "epoch": 14.56140350877193, |
| "grad_norm": 0.46222877502441406, |
| "learning_rate": 1.0504246495664932e-05, |
| "loss": 0.0922, |
| "num_input_tokens_seen": 530568, |
| "step": 830, |
| "train_runtime": 211.5932, |
| "train_tokens_per_second": 2507.491 |
| }, |
| { |
| "epoch": 14.649122807017545, |
| "grad_norm": 1.991820216178894, |
| "learning_rate": 1.0194118683375503e-05, |
| "loss": 0.1317, |
| "num_input_tokens_seen": 533192, |
| "step": 835, |
| "train_runtime": 212.5471, |
| "train_tokens_per_second": 2508.582 |
| }, |
| { |
| "epoch": 14.736842105263158, |
| "grad_norm": 0.1747927963733673, |
| "learning_rate": 9.887461201395176e-06, |
| "loss": 0.0789, |
| "num_input_tokens_seen": 536232, |
| "step": 840, |
| "train_runtime": 213.6015, |
| "train_tokens_per_second": 2510.432 |
| }, |
| { |
| "epoch": 14.824561403508772, |
| "grad_norm": 0.7753216028213501, |
| "learning_rate": 9.584345926752524e-06, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 539624, |
| "step": 845, |
| "train_runtime": 214.7284, |
| "train_tokens_per_second": 2513.053 |
| }, |
| { |
| "epoch": 14.912280701754385, |
| "grad_norm": 1.4348740577697754, |
| "learning_rate": 9.284843906222948e-06, |
| "loss": 0.2411, |
| "num_input_tokens_seen": 542664, |
| "step": 850, |
| "train_runtime": 215.7733, |
| "train_tokens_per_second": 2514.973 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 1.8949453830718994, |
| "learning_rate": 8.98902533967618e-06, |
| "loss": 0.1696, |
| "num_input_tokens_seen": 545352, |
| "step": 855, |
| "train_runtime": 216.8312, |
| "train_tokens_per_second": 2515.1 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.10493011027574539, |
| "eval_runtime": 0.5851, |
| "eval_samples_per_second": 42.728, |
| "eval_steps_per_second": 11.964, |
| "num_input_tokens_seen": 545352, |
| "step": 855 |
| }, |
| { |
| "epoch": 15.087719298245615, |
| "grad_norm": 0.3690445125102997, |
| "learning_rate": 8.696959563622174e-06, |
| "loss": 0.064, |
| "num_input_tokens_seen": 548488, |
| "step": 860, |
| "train_runtime": 219.4293, |
| "train_tokens_per_second": 2499.611 |
| }, |
| { |
| "epoch": 15.175438596491228, |
| "grad_norm": 3.105729103088379, |
| "learning_rate": 8.40871503495947e-06, |
| "loss": 0.1508, |
| "num_input_tokens_seen": 552008, |
| "step": 865, |
| "train_runtime": 220.6298, |
| "train_tokens_per_second": 2501.964 |
| }, |
| { |
| "epoch": 15.263157894736842, |
| "grad_norm": 0.8427509665489197, |
| "learning_rate": 8.124359314929622e-06, |
| "loss": 0.1135, |
| "num_input_tokens_seen": 555944, |
| "step": 870, |
| "train_runtime": 221.9416, |
| "train_tokens_per_second": 2504.911 |
| }, |
| { |
| "epoch": 15.350877192982455, |
| "grad_norm": 0.44307586550712585, |
| "learning_rate": 7.843959053281663e-06, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 558888, |
| "step": 875, |
| "train_runtime": 222.9853, |
| "train_tokens_per_second": 2506.39 |
| }, |
| { |
| "epoch": 15.43859649122807, |
| "grad_norm": 0.9975228905677795, |
| "learning_rate": 7.5675799726501155e-06, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 562536, |
| "step": 880, |
| "train_runtime": 224.1818, |
| "train_tokens_per_second": 2509.285 |
| }, |
| { |
| "epoch": 15.526315789473685, |
| "grad_norm": 0.6071796417236328, |
| "learning_rate": 7.295286853150391e-06, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 565160, |
| "step": 885, |
| "train_runtime": 225.1357, |
| "train_tokens_per_second": 2510.308 |
| }, |
| { |
| "epoch": 15.614035087719298, |
| "grad_norm": 0.6162160038948059, |
| "learning_rate": 7.027143517195023e-06, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 568232, |
| "step": 890, |
| "train_runtime": 226.1893, |
| "train_tokens_per_second": 2512.197 |
| }, |
| { |
| "epoch": 15.701754385964913, |
| "grad_norm": 0.6943656206130981, |
| "learning_rate": 6.763212814534484e-06, |
| "loss": 0.2381, |
| "num_input_tokens_seen": 571048, |
| "step": 895, |
| "train_runtime": 227.1756, |
| "train_tokens_per_second": 2513.686 |
| }, |
| { |
| "epoch": 15.789473684210526, |
| "grad_norm": 0.7030104994773865, |
| "learning_rate": 6.503556607525838e-06, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 574664, |
| "step": 900, |
| "train_runtime": 228.3878, |
| "train_tokens_per_second": 2516.177 |
| }, |
| { |
| "epoch": 15.87719298245614, |
| "grad_norm": 0.8077787756919861, |
| "learning_rate": 6.248235756632984e-06, |
| "loss": 0.1518, |
| "num_input_tokens_seen": 577608, |
| "step": 905, |
| "train_runtime": 229.4146, |
| "train_tokens_per_second": 2517.747 |
| }, |
| { |
| "epoch": 15.964912280701755, |
| "grad_norm": 0.8616611361503601, |
| "learning_rate": 5.997310106161589e-06, |
| "loss": 0.1024, |
| "num_input_tokens_seen": 580680, |
| "step": 910, |
| "train_runtime": 230.4206, |
| "train_tokens_per_second": 2520.088 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.10666210204362869, |
| "eval_runtime": 0.5897, |
| "eval_samples_per_second": 42.392, |
| "eval_steps_per_second": 11.87, |
| "num_input_tokens_seen": 581368, |
| "step": 912 |
| }, |
| { |
| "epoch": 16.05263157894737, |
| "grad_norm": 0.42227837443351746, |
| "learning_rate": 5.7508384702323226e-06, |
| "loss": 0.0354, |
| "num_input_tokens_seen": 583544, |
| "step": 915, |
| "train_runtime": 232.9828, |
| "train_tokens_per_second": 2504.666 |
| }, |
| { |
| "epoch": 16.140350877192983, |
| "grad_norm": 0.22992517054080963, |
| "learning_rate": 5.508878618995439e-06, |
| "loss": 0.1062, |
| "num_input_tokens_seen": 586424, |
| "step": 920, |
| "train_runtime": 234.019, |
| "train_tokens_per_second": 2505.882 |
| }, |
| { |
| "epoch": 16.228070175438596, |
| "grad_norm": 0.6025696396827698, |
| "learning_rate": 5.271487265090163e-06, |
| "loss": 0.1369, |
| "num_input_tokens_seen": 589976, |
| "step": 925, |
| "train_runtime": 235.2016, |
| "train_tokens_per_second": 2508.384 |
| }, |
| { |
| "epoch": 16.31578947368421, |
| "grad_norm": 0.1732168048620224, |
| "learning_rate": 5.038720050351842e-06, |
| "loss": 0.0959, |
| "num_input_tokens_seen": 593048, |
| "step": 930, |
| "train_runtime": 236.2527, |
| "train_tokens_per_second": 2510.227 |
| }, |
| { |
| "epoch": 16.403508771929825, |
| "grad_norm": 0.4299314320087433, |
| "learning_rate": 4.810631532770182e-06, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 596472, |
| "step": 935, |
| "train_runtime": 237.3915, |
| "train_tokens_per_second": 2512.609 |
| }, |
| { |
| "epoch": 16.49122807017544, |
| "grad_norm": 0.14484725892543793, |
| "learning_rate": 4.587275173701428e-06, |
| "loss": 0.2379, |
| "num_input_tokens_seen": 599544, |
| "step": 940, |
| "train_runtime": 238.4376, |
| "train_tokens_per_second": 2514.469 |
| }, |
| { |
| "epoch": 16.57894736842105, |
| "grad_norm": 1.2683119773864746, |
| "learning_rate": 4.368703325337667e-06, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 603032, |
| "step": 945, |
| "train_runtime": 239.5986, |
| "train_tokens_per_second": 2516.842 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 0.46702468395233154, |
| "learning_rate": 4.154967218436037e-06, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 605560, |
| "step": 950, |
| "train_runtime": 240.5398, |
| "train_tokens_per_second": 2517.504 |
| }, |
| { |
| "epoch": 16.75438596491228, |
| "grad_norm": 0.1284426748752594, |
| "learning_rate": 3.94611695031086e-06, |
| "loss": 0.1545, |
| "num_input_tokens_seen": 609176, |
| "step": 955, |
| "train_runtime": 241.7744, |
| "train_tokens_per_second": 2519.605 |
| }, |
| { |
| "epoch": 16.842105263157894, |
| "grad_norm": 0.9069837927818298, |
| "learning_rate": 3.74220147309135e-06, |
| "loss": 0.1051, |
| "num_input_tokens_seen": 612024, |
| "step": 960, |
| "train_runtime": 242.7614, |
| "train_tokens_per_second": 2521.092 |
| }, |
| { |
| "epoch": 16.92982456140351, |
| "grad_norm": 0.25688862800598145, |
| "learning_rate": 3.543268582247844e-06, |
| "loss": 0.0598, |
| "num_input_tokens_seen": 615064, |
| "step": 965, |
| "train_runtime": 243.8119, |
| "train_tokens_per_second": 2522.698 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.10478480905294418, |
| "eval_runtime": 0.5911, |
| "eval_samples_per_second": 42.294, |
| "eval_steps_per_second": 11.842, |
| "num_input_tokens_seen": 616776, |
| "step": 969 |
| }, |
| { |
| "epoch": 17.017543859649123, |
| "grad_norm": 0.20608949661254883, |
| "learning_rate": 3.3493649053890326e-06, |
| "loss": 0.132, |
| "num_input_tokens_seen": 617640, |
| "step": 970, |
| "train_runtime": 246.4873, |
| "train_tokens_per_second": 2505.768 |
| }, |
| { |
| "epoch": 17.105263157894736, |
| "grad_norm": 1.5790821313858032, |
| "learning_rate": 3.1605358913330385e-06, |
| "loss": 0.1705, |
| "num_input_tokens_seen": 620424, |
| "step": 975, |
| "train_runtime": 247.4926, |
| "train_tokens_per_second": 2506.839 |
| }, |
| { |
| "epoch": 17.19298245614035, |
| "grad_norm": 0.38555988669395447, |
| "learning_rate": 2.9768257994546662e-06, |
| "loss": 0.0891, |
| "num_input_tokens_seen": 623176, |
| "step": 980, |
| "train_runtime": 248.4814, |
| "train_tokens_per_second": 2507.938 |
| }, |
| { |
| "epoch": 17.280701754385966, |
| "grad_norm": 0.05342285707592964, |
| "learning_rate": 2.7982776893115627e-06, |
| "loss": 0.029, |
| "num_input_tokens_seen": 626600, |
| "step": 985, |
| "train_runtime": 249.6185, |
| "train_tokens_per_second": 2510.231 |
| }, |
| { |
| "epoch": 17.36842105263158, |
| "grad_norm": 0.2652755081653595, |
| "learning_rate": 2.624933410551508e-06, |
| "loss": 0.036, |
| "num_input_tokens_seen": 629992, |
| "step": 990, |
| "train_runtime": 250.7293, |
| "train_tokens_per_second": 2512.638 |
| }, |
| { |
| "epoch": 17.45614035087719, |
| "grad_norm": 1.1502621173858643, |
| "learning_rate": 2.456833593103361e-06, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 633128, |
| "step": 995, |
| "train_runtime": 251.8025, |
| "train_tokens_per_second": 2514.383 |
| }, |
| { |
| "epoch": 17.54385964912281, |
| "grad_norm": 1.4065945148468018, |
| "learning_rate": 2.2940176376538445e-06, |
| "loss": 0.098, |
| "num_input_tokens_seen": 635880, |
| "step": 1000, |
| "train_runtime": 252.7529, |
| "train_tokens_per_second": 2515.817 |
| }, |
| { |
| "epoch": 17.63157894736842, |
| "grad_norm": 0.12156794220209122, |
| "learning_rate": 2.136523706412477e-06, |
| "loss": 0.075, |
| "num_input_tokens_seen": 639208, |
| "step": 1005, |
| "train_runtime": 253.8752, |
| "train_tokens_per_second": 2517.804 |
| }, |
| { |
| "epoch": 17.719298245614034, |
| "grad_norm": 0.29780030250549316, |
| "learning_rate": 1.984388714166799e-06, |
| "loss": 0.2354, |
| "num_input_tokens_seen": 642152, |
| "step": 1010, |
| "train_runtime": 254.896, |
| "train_tokens_per_second": 2519.27 |
| }, |
| { |
| "epoch": 17.80701754385965, |
| "grad_norm": 0.4195615351200104, |
| "learning_rate": 1.837648319629956e-06, |
| "loss": 0.1131, |
| "num_input_tokens_seen": 645800, |
| "step": 1015, |
| "train_runtime": 256.1081, |
| "train_tokens_per_second": 2521.591 |
| }, |
| { |
| "epoch": 17.894736842105264, |
| "grad_norm": 1.0894429683685303, |
| "learning_rate": 1.6963369170826943e-06, |
| "loss": 0.2088, |
| "num_input_tokens_seen": 649736, |
| "step": 1020, |
| "train_runtime": 257.3817, |
| "train_tokens_per_second": 2524.406 |
| }, |
| { |
| "epoch": 17.982456140350877, |
| "grad_norm": 0.8357132077217102, |
| "learning_rate": 1.5604876283117326e-06, |
| "loss": 0.0994, |
| "num_input_tokens_seen": 653064, |
| "step": 1025, |
| "train_runtime": 258.4933, |
| "train_tokens_per_second": 2526.425 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.10452605038881302, |
| "eval_runtime": 0.5882, |
| "eval_samples_per_second": 42.499, |
| "eval_steps_per_second": 11.9, |
| "num_input_tokens_seen": 653152, |
| "step": 1026 |
| }, |
| { |
| "epoch": 18.07017543859649, |
| "grad_norm": 0.881624698638916, |
| "learning_rate": 1.4301322948464147e-06, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 655424, |
| "step": 1030, |
| "train_runtime": 261.0397, |
| "train_tokens_per_second": 2510.822 |
| }, |
| { |
| "epoch": 18.157894736842106, |
| "grad_norm": 0.88265061378479, |
| "learning_rate": 1.3053014704953987e-06, |
| "loss": 0.1471, |
| "num_input_tokens_seen": 658848, |
| "step": 1035, |
| "train_runtime": 262.1635, |
| "train_tokens_per_second": 2513.119 |
| }, |
| { |
| "epoch": 18.24561403508772, |
| "grad_norm": 0.20473460853099823, |
| "learning_rate": 1.1860244141851773e-06, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 661888, |
| "step": 1040, |
| "train_runtime": 263.2303, |
| "train_tokens_per_second": 2514.483 |
| }, |
| { |
| "epoch": 18.333333333333332, |
| "grad_norm": 0.7834029197692871, |
| "learning_rate": 1.0723290831021471e-06, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 665664, |
| "step": 1045, |
| "train_runtime": 264.4893, |
| "train_tokens_per_second": 2516.79 |
| }, |
| { |
| "epoch": 18.42105263157895, |
| "grad_norm": 0.6192132234573364, |
| "learning_rate": 9.642421261397472e-07, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 668896, |
| "step": 1050, |
| "train_runtime": 265.5927, |
| "train_tokens_per_second": 2518.503 |
| }, |
| { |
| "epoch": 18.50877192982456, |
| "grad_norm": 0.48073986172676086, |
| "learning_rate": 8.617888776522642e-07, |
| "loss": 0.064, |
| "num_input_tokens_seen": 672352, |
| "step": 1055, |
| "train_runtime": 266.7674, |
| "train_tokens_per_second": 2520.368 |
| }, |
| { |
| "epoch": 18.596491228070175, |
| "grad_norm": 0.39590880274772644, |
| "learning_rate": 7.649933515167407e-07, |
| "loss": 0.1197, |
| "num_input_tokens_seen": 675392, |
| "step": 1060, |
| "train_runtime": 267.8115, |
| "train_tokens_per_second": 2521.893 |
| }, |
| { |
| "epoch": 18.68421052631579, |
| "grad_norm": 0.5385900735855103, |
| "learning_rate": 6.738782355044049e-07, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 678080, |
| "step": 1065, |
| "train_runtime": 268.7582, |
| "train_tokens_per_second": 2523.012 |
| }, |
| { |
| "epoch": 18.771929824561404, |
| "grad_norm": 0.2719675302505493, |
| "learning_rate": 5.88464885962911e-07, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 681440, |
| "step": 1070, |
| "train_runtime": 269.8918, |
| "train_tokens_per_second": 2524.864 |
| }, |
| { |
| "epoch": 18.859649122807017, |
| "grad_norm": 0.6708596348762512, |
| "learning_rate": 5.087733228106517e-07, |
| "loss": 0.0769, |
| "num_input_tokens_seen": 684608, |
| "step": 1075, |
| "train_runtime": 270.9537, |
| "train_tokens_per_second": 2526.661 |
| }, |
| { |
| "epoch": 18.94736842105263, |
| "grad_norm": 0.47063499689102173, |
| "learning_rate": 4.3482222484432513e-07, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 688480, |
| "step": 1080, |
| "train_runtime": 272.2182, |
| "train_tokens_per_second": 2529.148 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.1085146814584732, |
| "eval_runtime": 0.5884, |
| "eval_samples_per_second": 42.488, |
| "eval_steps_per_second": 11.897, |
| "num_input_tokens_seen": 689856, |
| "step": 1083 |
| }, |
| { |
| "epoch": 19.035087719298247, |
| "grad_norm": 0.8871979117393494, |
| "learning_rate": 3.666289253608235e-07, |
| "loss": 0.5679, |
| "num_input_tokens_seen": 691200, |
| "step": 1085, |
| "train_runtime": 274.7464, |
| "train_tokens_per_second": 2515.774 |
| }, |
| { |
| "epoch": 19.12280701754386, |
| "grad_norm": 2.014510154724121, |
| "learning_rate": 3.0420940809451624e-07, |
| "loss": 0.2126, |
| "num_input_tokens_seen": 694240, |
| "step": 1090, |
| "train_runtime": 275.8315, |
| "train_tokens_per_second": 2516.899 |
| }, |
| { |
| "epoch": 19.210526315789473, |
| "grad_norm": 0.20251081883907318, |
| "learning_rate": 2.47578303470844e-07, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 697472, |
| "step": 1095, |
| "train_runtime": 276.9291, |
| "train_tokens_per_second": 2518.594 |
| }, |
| { |
| "epoch": 19.29824561403509, |
| "grad_norm": 0.9537681341171265, |
| "learning_rate": 1.96748885177106e-07, |
| "loss": 0.2066, |
| "num_input_tokens_seen": 699872, |
| "step": 1100, |
| "train_runtime": 277.8148, |
| "train_tokens_per_second": 2519.204 |
| }, |
| { |
| "epoch": 19.385964912280702, |
| "grad_norm": 0.5312708616256714, |
| "learning_rate": 1.517330670512629e-07, |
| "loss": 0.2041, |
| "num_input_tokens_seen": 704736, |
| "step": 1105, |
| "train_runtime": 279.3407, |
| "train_tokens_per_second": 2522.855 |
| }, |
| { |
| "epoch": 19.473684210526315, |
| "grad_norm": 0.2024543583393097, |
| "learning_rate": 1.125414002894759e-07, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 707968, |
| "step": 1110, |
| "train_runtime": 280.4278, |
| "train_tokens_per_second": 2524.6 |
| }, |
| { |
| "epoch": 19.56140350877193, |
| "grad_norm": 0.7978918552398682, |
| "learning_rate": 7.918307097301014e-08, |
| "loss": 0.1165, |
| "num_input_tokens_seen": 711136, |
| "step": 1115, |
| "train_runtime": 281.5081, |
| "train_tokens_per_second": 2526.166 |
| }, |
| { |
| "epoch": 19.649122807017545, |
| "grad_norm": 0.37032362818717957, |
| "learning_rate": 5.166589791513465e-08, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 714432, |
| "step": 1120, |
| "train_runtime": 282.6136, |
| "train_tokens_per_second": 2527.947 |
| }, |
| { |
| "epoch": 19.736842105263158, |
| "grad_norm": 0.10990393161773682, |
| "learning_rate": 2.999633082847453e-08, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 717440, |
| "step": 1125, |
| "train_runtime": 283.6603, |
| "train_tokens_per_second": 2529.223 |
| }, |
| { |
| "epoch": 19.82456140350877, |
| "grad_norm": 0.7865080833435059, |
| "learning_rate": 1.4179448813278484e-08, |
| "loss": 0.1254, |
| "num_input_tokens_seen": 720480, |
| "step": 1130, |
| "train_runtime": 284.7252, |
| "train_tokens_per_second": 2530.44 |
| }, |
| { |
| "epoch": 19.912280701754387, |
| "grad_norm": 0.35326337814331055, |
| "learning_rate": 4.218959166932268e-09, |
| "loss": 0.1746, |
| "num_input_tokens_seen": 723552, |
| "step": 1135, |
| "train_runtime": 285.8181, |
| "train_tokens_per_second": 2531.512 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.42669206857681274, |
| "learning_rate": 1.1719651499819683e-10, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 725992, |
| "step": 1140, |
| "train_runtime": 286.7994, |
| "train_tokens_per_second": 2531.358 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.10313867032527924, |
| "eval_runtime": 0.5919, |
| "eval_samples_per_second": 42.24, |
| "eval_steps_per_second": 11.827, |
| "num_input_tokens_seen": 725992, |
| "step": 1140 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 725992, |
| "step": 1140, |
| "total_flos": 3.269194804985856e+16, |
| "train_loss": 0.22796724926222833, |
| "train_runtime": 288.2339, |
| "train_samples_per_second": 15.612, |
| "train_steps_per_second": 3.955 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1140, |
| "num_input_tokens_seen": 725992, |
| "num_train_epochs": 20, |
| "save_steps": 57, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.269194804985856e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|