| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 144, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013888888888888888, |
| "grad_norm": 3.640625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.962890625, |
| "memory/device_reserved (GiB)": 20.04, |
| "memory/max_active (GiB)": 14.89, |
| "memory/max_allocated (GiB)": 14.89, |
| "ppl": 7.11988, |
| "step": 1, |
| "tokens/total": 131072, |
| "tokens/train_per_sec_per_gpu": 73.21, |
| "tokens/trainable": 70017 |
| }, |
| { |
| "epoch": 0.027777777777777776, |
| "grad_norm": 2.71875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.71484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 5.55581, |
| "step": 2, |
| "tokens/total": 262144, |
| "tokens/train_per_sec_per_gpu": 116.98, |
| "tokens/trainable": 138493 |
| }, |
| { |
| "epoch": 0.041666666666666664, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5576171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.7475, |
| "step": 3, |
| "tokens/total": 393216, |
| "tokens/train_per_sec_per_gpu": 126.05, |
| "tokens/trainable": 212680 |
| }, |
| { |
| "epoch": 0.05555555555555555, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.685546875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 5.3954, |
| "step": 4, |
| "tokens/total": 524288, |
| "tokens/train_per_sec_per_gpu": 120.16, |
| "tokens/trainable": 281918 |
| }, |
| { |
| "epoch": 0.06944444444444445, |
| "grad_norm": 0.8671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3330078125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.79243, |
| "step": 5, |
| "tokens/total": 655360, |
| "tokens/train_per_sec_per_gpu": 115.11, |
| "tokens/trainable": 349530 |
| }, |
| { |
| "epoch": 0.08333333333333333, |
| "grad_norm": 0.80078125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5693359375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.80346, |
| "step": 6, |
| "tokens/total": 786432, |
| "tokens/train_per_sec_per_gpu": 123.48, |
| "tokens/trainable": 420905 |
| }, |
| { |
| "epoch": 0.09722222222222222, |
| "grad_norm": 0.77734375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.400390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.05678, |
| "step": 7, |
| "tokens/total": 917504, |
| "tokens/train_per_sec_per_gpu": 125.22, |
| "tokens/trainable": 493455 |
| }, |
| { |
| "epoch": 0.1111111111111111, |
| "grad_norm": 0.71875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4931640625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.45116, |
| "step": 8, |
| "tokens/total": 1048576, |
| "tokens/train_per_sec_per_gpu": 128.89, |
| "tokens/trainable": 567667 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.71484375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.01736, |
| "step": 9, |
| "tokens/total": 1179648, |
| "tokens/train_per_sec_per_gpu": 114.58, |
| "tokens/trainable": 633584 |
| }, |
| { |
| "epoch": 0.1388888888888889, |
| "grad_norm": 0.64453125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5556640625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.73823, |
| "step": 10, |
| "tokens/total": 1310720, |
| "tokens/train_per_sec_per_gpu": 109.47, |
| "tokens/trainable": 697682 |
| }, |
| { |
| "epoch": 0.1527777777777778, |
| "grad_norm": 0.55078125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.521484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.57902, |
| "step": 11, |
| "tokens/total": 1441792, |
| "tokens/train_per_sec_per_gpu": 122.62, |
| "tokens/trainable": 768737 |
| }, |
| { |
| "epoch": 0.16666666666666666, |
| "grad_norm": 0.55859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 5.32214, |
| "step": 12, |
| "tokens/total": 1572864, |
| "tokens/train_per_sec_per_gpu": 112.17, |
| "tokens/trainable": 832992 |
| }, |
| { |
| "epoch": 0.18055555555555555, |
| "grad_norm": 0.5390625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.572265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.81755, |
| "step": 13, |
| "tokens/total": 1703936, |
| "tokens/train_per_sec_per_gpu": 107.49, |
| "tokens/trainable": 895620 |
| }, |
| { |
| "epoch": 0.19444444444444445, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5751953125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.83169, |
| "step": 14, |
| "tokens/total": 1835008, |
| "tokens/train_per_sec_per_gpu": 109.22, |
| "tokens/trainable": 958297 |
| }, |
| { |
| "epoch": 0.20833333333333334, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.32275390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.75374, |
| "step": 15, |
| "tokens/total": 1966080, |
| "tokens/train_per_sec_per_gpu": 106.17, |
| "tokens/trainable": 1019137 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 0.44921875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2939453125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.64715, |
| "step": 16, |
| "tokens/total": 2097152, |
| "tokens/train_per_sec_per_gpu": 113.27, |
| "tokens/trainable": 1084516 |
| }, |
| { |
| "epoch": 0.2361111111111111, |
| "grad_norm": 0.451171875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3359375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.80356, |
| "step": 17, |
| "tokens/total": 2228224, |
| "tokens/train_per_sec_per_gpu": 112.42, |
| "tokens/trainable": 1148795 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.4453125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.48828125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.42948, |
| "step": 18, |
| "tokens/total": 2359296, |
| "tokens/train_per_sec_per_gpu": 110.8, |
| "tokens/trainable": 1213448 |
| }, |
| { |
| "epoch": 0.2638888888888889, |
| "grad_norm": 0.44140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4228515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.14893, |
| "step": 19, |
| "tokens/total": 2490368, |
| "tokens/train_per_sec_per_gpu": 116.43, |
| "tokens/trainable": 1280709 |
| }, |
| { |
| "epoch": 0.2777777777777778, |
| "grad_norm": 0.42578125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4150390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.11665, |
| "step": 20, |
| "tokens/total": 2621440, |
| "tokens/train_per_sec_per_gpu": 110.64, |
| "tokens/trainable": 1345180 |
| }, |
| { |
| "epoch": 0.2916666666666667, |
| "grad_norm": 0.4453125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4140625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.11263, |
| "step": 21, |
| "tokens/total": 2752512, |
| "tokens/train_per_sec_per_gpu": 104.17, |
| "tokens/trainable": 1406772 |
| }, |
| { |
| "epoch": 0.3055555555555556, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5234375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.58797, |
| "step": 22, |
| "tokens/total": 2883584, |
| "tokens/train_per_sec_per_gpu": 124.41, |
| "tokens/trainable": 1477071 |
| }, |
| { |
| "epoch": 0.3194444444444444, |
| "grad_norm": 0.41796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4541015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.28064, |
| "step": 23, |
| "tokens/total": 3014656, |
| "tokens/train_per_sec_per_gpu": 114.72, |
| "tokens/trainable": 1542448 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.37109375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.31640625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.72999, |
| "step": 24, |
| "tokens/total": 3145728, |
| "tokens/train_per_sec_per_gpu": 123.91, |
| "tokens/trainable": 1613259 |
| }, |
| { |
| "epoch": 0.3472222222222222, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.322265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.75191, |
| "step": 25, |
| "tokens/total": 3276800, |
| "tokens/train_per_sec_per_gpu": 110.91, |
| "tokens/trainable": 1676091 |
| }, |
| { |
| "epoch": 0.3611111111111111, |
| "grad_norm": 0.462890625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.541015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.66933, |
| "step": 26, |
| "tokens/total": 3407872, |
| "tokens/train_per_sec_per_gpu": 102.95, |
| "tokens/trainable": 1735704 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.416015625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.23095703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.42451, |
| "step": 27, |
| "tokens/total": 3538944, |
| "tokens/train_per_sec_per_gpu": 116.23, |
| "tokens/trainable": 1801362 |
| }, |
| { |
| "epoch": 0.3888888888888889, |
| "grad_norm": 0.451171875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.37890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.97056, |
| "step": 28, |
| "tokens/total": 3670016, |
| "tokens/train_per_sec_per_gpu": 105.41, |
| "tokens/trainable": 1862049 |
| }, |
| { |
| "epoch": 0.4027777777777778, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3388671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.81472, |
| "step": 29, |
| "tokens/total": 3801088, |
| "tokens/train_per_sec_per_gpu": 102.86, |
| "tokens/trainable": 1921034 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.51171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.53452, |
| "step": 30, |
| "tokens/total": 3932160, |
| "tokens/train_per_sec_per_gpu": 118.56, |
| "tokens/trainable": 1988755 |
| }, |
| { |
| "epoch": 0.4305555555555556, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.29833984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.66321, |
| "step": 31, |
| "tokens/total": 4063232, |
| "tokens/train_per_sec_per_gpu": 112.58, |
| "tokens/trainable": 2053308 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.376953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.234375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.43623, |
| "step": 32, |
| "tokens/total": 4194304, |
| "tokens/train_per_sec_per_gpu": 113.73, |
| "tokens/trainable": 2117979 |
| }, |
| { |
| "epoch": 0.4583333333333333, |
| "grad_norm": 0.41015625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4833984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.4079, |
| "step": 33, |
| "tokens/total": 4325376, |
| "tokens/train_per_sec_per_gpu": 116.07, |
| "tokens/trainable": 2183699 |
| }, |
| { |
| "epoch": 0.4722222222222222, |
| "grad_norm": 0.427734375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4267578125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.16517, |
| "step": 34, |
| "tokens/total": 4456448, |
| "tokens/train_per_sec_per_gpu": 110.79, |
| "tokens/trainable": 2247732 |
| }, |
| { |
| "epoch": 0.4861111111111111, |
| "grad_norm": 0.373046875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.29296875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.64359, |
| "step": 35, |
| "tokens/total": 4587520, |
| "tokens/train_per_sec_per_gpu": 116.38, |
| "tokens/trainable": 2313478 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.3828125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2060546875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.34028, |
| "step": 36, |
| "tokens/total": 4718592, |
| "tokens/train_per_sec_per_gpu": 99.48, |
| "tokens/trainable": 2370449 |
| }, |
| { |
| "epoch": 0.5138888888888888, |
| "grad_norm": 0.369140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3896484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.01344, |
| "step": 37, |
| "tokens/total": 4849664, |
| "tokens/train_per_sec_per_gpu": 109.69, |
| "tokens/trainable": 2434107 |
| }, |
| { |
| "epoch": 0.5277777777777778, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3388671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.81472, |
| "step": 38, |
| "tokens/total": 4980736, |
| "tokens/train_per_sec_per_gpu": 109.51, |
| "tokens/trainable": 2497225 |
| }, |
| { |
| "epoch": 0.5416666666666666, |
| "grad_norm": 0.447265625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.44921875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.25979, |
| "step": 39, |
| "tokens/total": 5111808, |
| "tokens/train_per_sec_per_gpu": 110.86, |
| "tokens/trainable": 2559802 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4765625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.37787, |
| "step": 40, |
| "tokens/total": 5242880, |
| "tokens/train_per_sec_per_gpu": 125.32, |
| "tokens/trainable": 2630929 |
| }, |
| { |
| "epoch": 0.5694444444444444, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.373046875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.94736, |
| "step": 41, |
| "tokens/total": 5373952, |
| "tokens/train_per_sec_per_gpu": 118.02, |
| "tokens/trainable": 2698283 |
| }, |
| { |
| "epoch": 0.5833333333333334, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.35791015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.88806, |
| "step": 42, |
| "tokens/total": 5505024, |
| "tokens/train_per_sec_per_gpu": 118.85, |
| "tokens/trainable": 2766688 |
| }, |
| { |
| "epoch": 0.5972222222222222, |
| "grad_norm": 0.3671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.544921875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.68761, |
| "step": 43, |
| "tokens/total": 5636096, |
| "tokens/train_per_sec_per_gpu": 120.3, |
| "tokens/trainable": 2836890 |
| }, |
| { |
| "epoch": 0.6111111111111112, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.24609375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.47674, |
| "step": 44, |
| "tokens/total": 5767168, |
| "tokens/train_per_sec_per_gpu": 113.9, |
| "tokens/trainable": 2901473 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 0.376953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.28271484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.60642, |
| "step": 45, |
| "tokens/total": 5898240, |
| "tokens/train_per_sec_per_gpu": 104.13, |
| "tokens/trainable": 2961017 |
| }, |
| { |
| "epoch": 0.6388888888888888, |
| "grad_norm": 0.341796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.228515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.41615, |
| "step": 46, |
| "tokens/total": 6029312, |
| "tokens/train_per_sec_per_gpu": 115.84, |
| "tokens/trainable": 3028498 |
| }, |
| { |
| "epoch": 0.6527777777777778, |
| "grad_norm": 0.361328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3037109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.68294, |
| "step": 47, |
| "tokens/total": 6160384, |
| "tokens/train_per_sec_per_gpu": 114.28, |
| "tokens/trainable": 3093372 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.361328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.263671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.53839, |
| "step": 48, |
| "tokens/total": 6291456, |
| "tokens/train_per_sec_per_gpu": 110.62, |
| "tokens/trainable": 3156151 |
| }, |
| { |
| "epoch": 0.6805555555555556, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.39697265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.04294, |
| "step": 49, |
| "tokens/total": 6422528, |
| "tokens/train_per_sec_per_gpu": 104.84, |
| "tokens/trainable": 3217411 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.392578125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.02521, |
| "step": 50, |
| "tokens/total": 6553600, |
| "tokens/train_per_sec_per_gpu": 104.7, |
| "tokens/trainable": 3277230 |
| }, |
| { |
| "epoch": 0.7083333333333334, |
| "grad_norm": 0.39453125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.5087890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.52125, |
| "step": 51, |
| "tokens/total": 6684672, |
| "tokens/train_per_sec_per_gpu": 115.14, |
| "tokens/trainable": 3343478 |
| }, |
| { |
| "epoch": 0.7222222222222222, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.46337890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.32053, |
| "step": 52, |
| "tokens/total": 6815744, |
| "tokens/train_per_sec_per_gpu": 118.03, |
| "tokens/trainable": 3412165 |
| }, |
| { |
| "epoch": 0.7361111111111112, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.337890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.811, |
| "step": 53, |
| "tokens/total": 6946816, |
| "tokens/train_per_sec_per_gpu": 103.45, |
| "tokens/trainable": 3472780 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.12158203125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.06971, |
| "step": 54, |
| "tokens/total": 7077888, |
| "tokens/train_per_sec_per_gpu": 108.55, |
| "tokens/trainable": 3535299 |
| }, |
| { |
| "epoch": 0.7638888888888888, |
| "grad_norm": 0.3515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3837890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.98999, |
| "step": 55, |
| "tokens/total": 7208960, |
| "tokens/train_per_sec_per_gpu": 122.32, |
| "tokens/trainable": 3606644 |
| }, |
| { |
| "epoch": 0.7777777777777778, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.232421875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.42953, |
| "step": 56, |
| "tokens/total": 7340032, |
| "tokens/train_per_sec_per_gpu": 120.64, |
| "tokens/trainable": 3676279 |
| }, |
| { |
| "epoch": 0.7916666666666666, |
| "grad_norm": 0.380859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.443359375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.2349, |
| "step": 57, |
| "tokens/total": 7471104, |
| "tokens/train_per_sec_per_gpu": 117.13, |
| "tokens/trainable": 3742502 |
| }, |
| { |
| "epoch": 0.8055555555555556, |
| "grad_norm": 0.390625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.26171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.53149, |
| "step": 58, |
| "tokens/total": 7602176, |
| "tokens/train_per_sec_per_gpu": 110.65, |
| "tokens/trainable": 3805375 |
| }, |
| { |
| "epoch": 0.8194444444444444, |
| "grad_norm": 0.353515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.11279296875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.04285, |
| "step": 59, |
| "tokens/total": 7733248, |
| "tokens/train_per_sec_per_gpu": 111.67, |
| "tokens/trainable": 3869265 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 0.326171875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2236328125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.39952, |
| "step": 60, |
| "tokens/total": 7864320, |
| "tokens/train_per_sec_per_gpu": 125.63, |
| "tokens/trainable": 3941933 |
| }, |
| { |
| "epoch": 0.8472222222222222, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4287109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.17332, |
| "step": 61, |
| "tokens/total": 7995392, |
| "tokens/train_per_sec_per_gpu": 127.75, |
| "tokens/trainable": 4014729 |
| }, |
| { |
| "epoch": 0.8611111111111112, |
| "grad_norm": 0.396484375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.28369140625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.60994, |
| "step": 62, |
| "tokens/total": 8126464, |
| "tokens/train_per_sec_per_gpu": 103.96, |
| "tokens/trainable": 4073934 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3369140625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.80728, |
| "step": 63, |
| "tokens/total": 8257536, |
| "tokens/train_per_sec_per_gpu": 104.09, |
| "tokens/trainable": 4133735 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.37890625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4873046875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.42515, |
| "step": 64, |
| "tokens/total": 8388608, |
| "tokens/train_per_sec_per_gpu": 104.66, |
| "tokens/trainable": 4193797 |
| }, |
| { |
| "epoch": 0.9027777777777778, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2041015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.33376, |
| "step": 65, |
| "tokens/total": 8519680, |
| "tokens/train_per_sec_per_gpu": 117.74, |
| "tokens/trainable": 4261340 |
| }, |
| { |
| "epoch": 0.9166666666666666, |
| "grad_norm": 0.384765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.220703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.38957, |
| "step": 66, |
| "tokens/total": 8650752, |
| "tokens/train_per_sec_per_gpu": 100.0, |
| "tokens/trainable": 4319643 |
| }, |
| { |
| "epoch": 0.9305555555555556, |
| "grad_norm": 0.341796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2412109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.4598, |
| "step": 67, |
| "tokens/total": 8781824, |
| "tokens/train_per_sec_per_gpu": 121.85, |
| "tokens/trainable": 4391039 |
| }, |
| { |
| "epoch": 0.9444444444444444, |
| "grad_norm": 0.361328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.228515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.41615, |
| "step": 68, |
| "tokens/total": 8912896, |
| "tokens/train_per_sec_per_gpu": 105.16, |
| "tokens/trainable": 4450874 |
| }, |
| { |
| "epoch": 0.9583333333333334, |
| "grad_norm": 0.380859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.158203125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.18421, |
| "step": 69, |
| "tokens/total": 9043968, |
| "tokens/train_per_sec_per_gpu": 116.88, |
| "tokens/trainable": 4517777 |
| }, |
| { |
| "epoch": 0.9722222222222222, |
| "grad_norm": 0.361328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.25390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.504, |
| "step": 70, |
| "tokens/total": 9175040, |
| "tokens/train_per_sec_per_gpu": 110.04, |
| "tokens/trainable": 4581277 |
| }, |
| { |
| "epoch": 0.9861111111111112, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.33056640625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.78319, |
| "step": 71, |
| "tokens/total": 9306112, |
| "tokens/train_per_sec_per_gpu": 118.96, |
| "tokens/trainable": 4650130 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.33984375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3134765625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.71908, |
| "step": 72, |
| "tokens/total": 9437184, |
| "tokens/train_per_sec_per_gpu": 131.5, |
| "tokens/trainable": 4725570 |
| }, |
| { |
| "epoch": 1.0138888888888888, |
| "grad_norm": 0.361328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.314453125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.72271, |
| "step": 73, |
| "tokens/total": 9568256, |
| "tokens/train_per_sec_per_gpu": 122.77, |
| "tokens/trainable": 4795587 |
| }, |
| { |
| "epoch": 1.0277777777777777, |
| "grad_norm": 0.34375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.244140625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.46995, |
| "step": 74, |
| "tokens/total": 9699328, |
| "tokens/train_per_sec_per_gpu": 119.83, |
| "tokens/trainable": 4864063 |
| }, |
| { |
| "epoch": 1.0416666666666667, |
| "grad_norm": 0.3203125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.21533203125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.37141, |
| "step": 75, |
| "tokens/total": 9830400, |
| "tokens/train_per_sec_per_gpu": 129.56, |
| "tokens/trainable": 4938250 |
| }, |
| { |
| "epoch": 1.0555555555555556, |
| "grad_norm": 0.34375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3935546875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.02915, |
| "step": 76, |
| "tokens/total": 9961472, |
| "tokens/train_per_sec_per_gpu": 120.33, |
| "tokens/trainable": 5007488 |
| }, |
| { |
| "epoch": 1.0694444444444444, |
| "grad_norm": 0.341796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.06982421875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 2.91487, |
| "step": 77, |
| "tokens/total": 10092544, |
| "tokens/train_per_sec_per_gpu": 117.34, |
| "tokens/trainable": 5075100 |
| }, |
| { |
| "epoch": 1.0833333333333333, |
| "grad_norm": 0.33984375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.32666015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.76844, |
| "step": 78, |
| "tokens/total": 10223616, |
| "tokens/train_per_sec_per_gpu": 124.04, |
| "tokens/trainable": 5146475 |
| }, |
| { |
| "epoch": 1.0972222222222223, |
| "grad_norm": 0.37890625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.166015625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.20918, |
| "step": 79, |
| "tokens/total": 10354688, |
| "tokens/train_per_sec_per_gpu": 125.54, |
| "tokens/trainable": 5219025 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 0.33203125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3037109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.68294, |
| "step": 80, |
| "tokens/total": 10485760, |
| "tokens/train_per_sec_per_gpu": 128.95, |
| "tokens/trainable": 5293237 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 0.349609375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1826171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.2629, |
| "step": 81, |
| "tokens/total": 10616832, |
| "tokens/train_per_sec_per_gpu": 115.0, |
| "tokens/trainable": 5359154 |
| }, |
| { |
| "epoch": 1.1388888888888888, |
| "grad_norm": 0.375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.34912109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.85404, |
| "step": 82, |
| "tokens/total": 10747904, |
| "tokens/train_per_sec_per_gpu": 111.51, |
| "tokens/trainable": 5423252 |
| }, |
| { |
| "epoch": 1.1527777777777777, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3427734375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.82965, |
| "step": 83, |
| "tokens/total": 10878976, |
| "tokens/train_per_sec_per_gpu": 123.76, |
| "tokens/trainable": 5494307 |
| }, |
| { |
| "epoch": 1.1666666666666667, |
| "grad_norm": 0.376953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4833984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.4079, |
| "step": 84, |
| "tokens/total": 11010048, |
| "tokens/train_per_sec_per_gpu": 111.84, |
| "tokens/trainable": 5558562 |
| }, |
| { |
| "epoch": 1.1805555555555556, |
| "grad_norm": 0.37109375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3857421875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.99779, |
| "step": 85, |
| "tokens/total": 11141120, |
| "tokens/train_per_sec_per_gpu": 108.82, |
| "tokens/trainable": 5621190 |
| }, |
| { |
| "epoch": 1.1944444444444444, |
| "grad_norm": 0.36328125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.396484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.04097, |
| "step": 86, |
| "tokens/total": 11272192, |
| "tokens/train_per_sec_per_gpu": 111.48, |
| "tokens/trainable": 5683867 |
| }, |
| { |
| "epoch": 1.2083333333333333, |
| "grad_norm": 0.376953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1533203125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.1687, |
| "step": 87, |
| "tokens/total": 11403264, |
| "tokens/train_per_sec_per_gpu": 107.77, |
| "tokens/trainable": 5744707 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 0.384765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1435546875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.1379, |
| "step": 88, |
| "tokens/total": 11534336, |
| "tokens/train_per_sec_per_gpu": 114.83, |
| "tokens/trainable": 5810086 |
| }, |
| { |
| "epoch": 1.2361111111111112, |
| "grad_norm": 0.33984375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1845703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.26928, |
| "step": 89, |
| "tokens/total": 11665408, |
| "tokens/train_per_sec_per_gpu": 112.52, |
| "tokens/trainable": 5874365 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3349609375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.79985, |
| "step": 90, |
| "tokens/total": 11796480, |
| "tokens/train_per_sec_per_gpu": 113.84, |
| "tokens/trainable": 5939018 |
| }, |
| { |
| "epoch": 1.2638888888888888, |
| "grad_norm": 0.375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.287109375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.6223, |
| "step": 91, |
| "tokens/total": 11927552, |
| "tokens/train_per_sec_per_gpu": 117.29, |
| "tokens/trainable": 6006279 |
| }, |
| { |
| "epoch": 1.2777777777777777, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.28515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.61523, |
| "step": 92, |
| "tokens/total": 12058624, |
| "tokens/train_per_sec_per_gpu": 111.81, |
| "tokens/trainable": 6070750 |
| }, |
| { |
| "epoch": 1.2916666666666667, |
| "grad_norm": 0.3671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2763671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.5836, |
| "step": 93, |
| "tokens/total": 12189696, |
| "tokens/train_per_sec_per_gpu": 106.93, |
| "tokens/trainable": 6132342 |
| }, |
| { |
| "epoch": 1.3055555555555556, |
| "grad_norm": 0.451171875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.39453125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.03308, |
| "step": 94, |
| "tokens/total": 12320768, |
| "tokens/train_per_sec_per_gpu": 122.98, |
| "tokens/trainable": 6202641 |
| }, |
| { |
| "epoch": 1.3194444444444444, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.33447265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.79799, |
| "step": 95, |
| "tokens/total": 12451840, |
| "tokens/train_per_sec_per_gpu": 113.22, |
| "tokens/trainable": 6268018 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.318359375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.19970703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.31914, |
| "step": 96, |
| "tokens/total": 12582912, |
| "tokens/train_per_sec_per_gpu": 123.65, |
| "tokens/trainable": 6338829 |
| }, |
| { |
| "epoch": 1.3472222222222223, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.208984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.35008, |
| "step": 97, |
| "tokens/total": 12713984, |
| "tokens/train_per_sec_per_gpu": 111.02, |
| "tokens/trainable": 6401661 |
| }, |
| { |
| "epoch": 1.3611111111111112, |
| "grad_norm": 0.400390625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.41748046875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.12671, |
| "step": 98, |
| "tokens/total": 12845056, |
| "tokens/train_per_sec_per_gpu": 103.36, |
| "tokens/trainable": 6461274 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 0.35546875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.12353515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.07571, |
| "step": 99, |
| "tokens/total": 12976128, |
| "tokens/train_per_sec_per_gpu": 115.06, |
| "tokens/trainable": 6526932 |
| }, |
| { |
| "epoch": 1.3888888888888888, |
| "grad_norm": 0.484375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.263671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.53839, |
| "step": 100, |
| "tokens/total": 13107200, |
| "tokens/train_per_sec_per_gpu": 105.89, |
| "tokens/trainable": 6587619 |
| }, |
| { |
| "epoch": 1.4027777777777777, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.22265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.3962, |
| "step": 101, |
| "tokens/total": 13238272, |
| "tokens/train_per_sec_per_gpu": 104.0, |
| "tokens/trainable": 6646604 |
| }, |
| { |
| "epoch": 1.4166666666666667, |
| "grad_norm": 0.37890625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.40234375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.06472, |
| "step": 102, |
| "tokens/total": 13369344, |
| "tokens/train_per_sec_per_gpu": 119.08, |
| "tokens/trainable": 6714325 |
| }, |
| { |
| "epoch": 1.4305555555555556, |
| "grad_norm": 0.373046875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.19384765625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.29975, |
| "step": 103, |
| "tokens/total": 13500416, |
| "tokens/train_per_sec_per_gpu": 112.86, |
| "tokens/trainable": 6778878 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 0.35546875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.134765625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.11044, |
| "step": 104, |
| "tokens/total": 13631488, |
| "tokens/train_per_sec_per_gpu": 113.25, |
| "tokens/trainable": 6843549 |
| }, |
| { |
| "epoch": 1.4583333333333333, |
| "grad_norm": 0.369140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.37890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.97056, |
| "step": 105, |
| "tokens/total": 13762560, |
| "tokens/train_per_sec_per_gpu": 115.15, |
| "tokens/trainable": 6909269 |
| }, |
| { |
| "epoch": 1.4722222222222223, |
| "grad_norm": 0.3671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.328125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.77396, |
| "step": 106, |
| "tokens/total": 13893632, |
| "tokens/train_per_sec_per_gpu": 112.97, |
| "tokens/trainable": 6973302 |
| }, |
| { |
| "epoch": 1.4861111111111112, |
| "grad_norm": 0.35546875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.201171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.32401, |
| "step": 107, |
| "tokens/total": 14024704, |
| "tokens/train_per_sec_per_gpu": 115.24, |
| "tokens/trainable": 7039048 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.376953125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.107421875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.02655, |
| "step": 108, |
| "tokens/total": 14155776, |
| "tokens/train_per_sec_per_gpu": 98.6, |
| "tokens/trainable": 7096019 |
| }, |
| { |
| "epoch": 1.5138888888888888, |
| "grad_norm": 0.359375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2978515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.66142, |
| "step": 109, |
| "tokens/total": 14286848, |
| "tokens/train_per_sec_per_gpu": 111.28, |
| "tokens/trainable": 7159677 |
| }, |
| { |
| "epoch": 1.5277777777777777, |
| "grad_norm": 0.35546875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.24365234375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.46826, |
| "step": 110, |
| "tokens/total": 14417920, |
| "tokens/train_per_sec_per_gpu": 109.49, |
| "tokens/trainable": 7222795 |
| }, |
| { |
| "epoch": 1.5416666666666665, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.359375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.89376, |
| "step": 111, |
| "tokens/total": 14548992, |
| "tokens/train_per_sec_per_gpu": 110.76, |
| "tokens/trainable": 7285372 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 0.359375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.388671875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.00952, |
| "step": 112, |
| "tokens/total": 14680064, |
| "tokens/train_per_sec_per_gpu": 122.89, |
| "tokens/trainable": 7356499 |
| }, |
| { |
| "epoch": 1.5694444444444444, |
| "grad_norm": 0.34375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.28515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.61523, |
| "step": 113, |
| "tokens/total": 14811136, |
| "tokens/train_per_sec_per_gpu": 117.71, |
| "tokens/trainable": 7423853 |
| }, |
| { |
| "epoch": 1.5833333333333335, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.27392578125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.57486, |
| "step": 114, |
| "tokens/total": 14942208, |
| "tokens/train_per_sec_per_gpu": 119.69, |
| "tokens/trainable": 7492258 |
| }, |
| { |
| "epoch": 1.5972222222222223, |
| "grad_norm": 0.353515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.458984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.30159, |
| "step": 115, |
| "tokens/total": 15073280, |
| "tokens/train_per_sec_per_gpu": 122.55, |
| "tokens/trainable": 7562460 |
| }, |
| { |
| "epoch": 1.6111111111111112, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1708984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.22489, |
| "step": 116, |
| "tokens/total": 15204352, |
| "tokens/train_per_sec_per_gpu": 113.64, |
| "tokens/trainable": 7627043 |
| }, |
| { |
| "epoch": 1.625, |
| "grad_norm": 0.3671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.19775390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.31267, |
| "step": 117, |
| "tokens/total": 15335424, |
| "tokens/train_per_sec_per_gpu": 104.53, |
| "tokens/trainable": 7686587 |
| }, |
| { |
| "epoch": 1.6388888888888888, |
| "grad_norm": 0.341796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.15380859375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.17024, |
| "step": 118, |
| "tokens/total": 15466496, |
| "tokens/train_per_sec_per_gpu": 116.76, |
| "tokens/trainable": 7754068 |
| }, |
| { |
| "epoch": 1.6527777777777777, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2255859375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.40616, |
| "step": 119, |
| "tokens/total": 15597568, |
| "tokens/train_per_sec_per_gpu": 113.3, |
| "tokens/trainable": 7818942 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.341796875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.18701171875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.27727, |
| "step": 120, |
| "tokens/total": 15728640, |
| "tokens/train_per_sec_per_gpu": 110.17, |
| "tokens/trainable": 7881721 |
| }, |
| { |
| "epoch": 1.6805555555555556, |
| "grad_norm": 0.375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.32080078125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.74642, |
| "step": 121, |
| "tokens/total": 15859712, |
| "tokens/train_per_sec_per_gpu": 108.76, |
| "tokens/trainable": 7942981 |
| }, |
| { |
| "epoch": 1.6944444444444444, |
| "grad_norm": 0.380859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3115234375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.71182, |
| "step": 122, |
| "tokens/total": 15990784, |
| "tokens/train_per_sec_per_gpu": 104.29, |
| "tokens/trainable": 8002800 |
| }, |
| { |
| "epoch": 1.7083333333333335, |
| "grad_norm": 0.373046875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4306640625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.18148, |
| "step": 123, |
| "tokens/total": 16121856, |
| "tokens/train_per_sec_per_gpu": 116.36, |
| "tokens/trainable": 8069048 |
| }, |
| { |
| "epoch": 1.7222222222222223, |
| "grad_norm": 0.35546875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.39111328125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.01932, |
| "step": 124, |
| "tokens/total": 16252928, |
| "tokens/train_per_sec_per_gpu": 120.13, |
| "tokens/trainable": 8137735 |
| }, |
| { |
| "epoch": 1.7361111111111112, |
| "grad_norm": 0.39453125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.26318359375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.53666, |
| "step": 125, |
| "tokens/total": 16384000, |
| "tokens/train_per_sec_per_gpu": 105.98, |
| "tokens/trainable": 8198350 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.353515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.05517578125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 2.87248, |
| "step": 126, |
| "tokens/total": 16515072, |
| "tokens/train_per_sec_per_gpu": 110.4, |
| "tokens/trainable": 8260869 |
| }, |
| { |
| "epoch": 1.7638888888888888, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.3173828125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.73364, |
| "step": 127, |
| "tokens/total": 16646144, |
| "tokens/train_per_sec_per_gpu": 125.4, |
| "tokens/trainable": 8332214 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 0.322265625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.16796875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.21545, |
| "step": 128, |
| "tokens/total": 16777216, |
| "tokens/train_per_sec_per_gpu": 124.15, |
| "tokens/trainable": 8401849 |
| }, |
| { |
| "epoch": 1.7916666666666665, |
| "grad_norm": 0.369140625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.95508, |
| "step": 129, |
| "tokens/total": 16908288, |
| "tokens/train_per_sec_per_gpu": 115.16, |
| "tokens/trainable": 8468072 |
| }, |
| { |
| "epoch": 1.8055555555555556, |
| "grad_norm": 0.37109375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.19970703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.31914, |
| "step": 130, |
| "tokens/total": 17039360, |
| "tokens/train_per_sec_per_gpu": 110.93, |
| "tokens/trainable": 8530945 |
| }, |
| { |
| "epoch": 1.8194444444444444, |
| "grad_norm": 0.3515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.05029296875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 2.85849, |
| "step": 131, |
| "tokens/total": 17170432, |
| "tokens/train_per_sec_per_gpu": 113.05, |
| "tokens/trainable": 8594835 |
| }, |
| { |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.326171875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1650390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.20605, |
| "step": 132, |
| "tokens/total": 17301504, |
| "tokens/train_per_sec_per_gpu": 127.28, |
| "tokens/trainable": 8667503 |
| }, |
| { |
| "epoch": 1.8472222222222223, |
| "grad_norm": 0.357421875, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.36572265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.91855, |
| "step": 133, |
| "tokens/total": 17432576, |
| "tokens/train_per_sec_per_gpu": 126.71, |
| "tokens/trainable": 8740299 |
| }, |
| { |
| "epoch": 1.8611111111111112, |
| "grad_norm": 0.380859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.216796875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.37636, |
| "step": 134, |
| "tokens/total": 17563648, |
| "tokens/train_per_sec_per_gpu": 105.63, |
| "tokens/trainable": 8799504 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.365234375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.26953125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.55918, |
| "step": 135, |
| "tokens/total": 17694720, |
| "tokens/train_per_sec_per_gpu": 103.93, |
| "tokens/trainable": 8859305 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 0.380859375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.4208984375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 4.14084, |
| "step": 136, |
| "tokens/total": 17825792, |
| "tokens/train_per_sec_per_gpu": 105.82, |
| "tokens/trainable": 8919367 |
| }, |
| { |
| "epoch": 1.9027777777777777, |
| "grad_norm": 0.3359375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1484375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.15326, |
| "step": 137, |
| "tokens/total": 17956864, |
| "tokens/train_per_sec_per_gpu": 118.21, |
| "tokens/trainable": 8986910 |
| }, |
| { |
| "epoch": 1.9166666666666665, |
| "grad_norm": 0.375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1552734375, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.17489, |
| "step": 138, |
| "tokens/total": 18087936, |
| "tokens/train_per_sec_per_gpu": 103.55, |
| "tokens/trainable": 9045213 |
| }, |
| { |
| "epoch": 1.9305555555555556, |
| "grad_norm": 0.345703125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1845703125, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.26928, |
| "step": 139, |
| "tokens/total": 18219008, |
| "tokens/train_per_sec_per_gpu": 126.11, |
| "tokens/trainable": 9116609 |
| }, |
| { |
| "epoch": 1.9444444444444444, |
| "grad_norm": 0.359375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1650390625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.20605, |
| "step": 140, |
| "tokens/total": 18350080, |
| "tokens/train_per_sec_per_gpu": 105.52, |
| "tokens/trainable": 9176444 |
| }, |
| { |
| "epoch": 1.9583333333333335, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.103515625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.01475, |
| "step": 141, |
| "tokens/total": 18481152, |
| "tokens/train_per_sec_per_gpu": 117.44, |
| "tokens/trainable": 9243347 |
| }, |
| { |
| "epoch": 1.9722222222222223, |
| "grad_norm": 0.353515625, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.1982421875, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.31429, |
| "step": 142, |
| "tokens/total": 18612224, |
| "tokens/train_per_sec_per_gpu": 112.37, |
| "tokens/trainable": 9306847 |
| }, |
| { |
| "epoch": 1.9861111111111112, |
| "grad_norm": 0.33984375, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.27197265625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.56788, |
| "step": 143, |
| "tokens/total": 18743296, |
| "tokens/train_per_sec_per_gpu": 121.56, |
| "tokens/trainable": 9375700 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.330078125, |
| "learning_rate": 9.999999747378752e-06, |
| "loss": 1.2587890625, |
| "memory/device_reserved (GiB)": 22.47, |
| "memory/max_active (GiB)": 17.41, |
| "memory/max_allocated (GiB)": 17.41, |
| "ppl": 3.52116, |
| "step": 144, |
| "tokens/total": 18874368, |
| "tokens/train_per_sec_per_gpu": 132.32, |
| "tokens/trainable": 9451140 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 144, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 72, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1590154275447112e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|