ahhhhh / trainer_state.json
Fizzarolli's picture
Upload folder using huggingface_hub
126646e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 144,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013888888888888888,
"grad_norm": 3.640625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.962890625,
"memory/device_reserved (GiB)": 20.04,
"memory/max_active (GiB)": 14.89,
"memory/max_allocated (GiB)": 14.89,
"ppl": 7.11988,
"step": 1,
"tokens/total": 131072,
"tokens/train_per_sec_per_gpu": 73.21,
"tokens/trainable": 70017
},
{
"epoch": 0.027777777777777776,
"grad_norm": 2.71875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.71484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 5.55581,
"step": 2,
"tokens/total": 262144,
"tokens/train_per_sec_per_gpu": 116.98,
"tokens/trainable": 138493
},
{
"epoch": 0.041666666666666664,
"grad_norm": 1.9140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5576171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.7475,
"step": 3,
"tokens/total": 393216,
"tokens/train_per_sec_per_gpu": 126.05,
"tokens/trainable": 212680
},
{
"epoch": 0.05555555555555555,
"grad_norm": 1.2734375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.685546875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 5.3954,
"step": 4,
"tokens/total": 524288,
"tokens/train_per_sec_per_gpu": 120.16,
"tokens/trainable": 281918
},
{
"epoch": 0.06944444444444445,
"grad_norm": 0.8671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3330078125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.79243,
"step": 5,
"tokens/total": 655360,
"tokens/train_per_sec_per_gpu": 115.11,
"tokens/trainable": 349530
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.80078125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5693359375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.80346,
"step": 6,
"tokens/total": 786432,
"tokens/train_per_sec_per_gpu": 123.48,
"tokens/trainable": 420905
},
{
"epoch": 0.09722222222222222,
"grad_norm": 0.77734375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.400390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.05678,
"step": 7,
"tokens/total": 917504,
"tokens/train_per_sec_per_gpu": 125.22,
"tokens/trainable": 493455
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.71875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4931640625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.45116,
"step": 8,
"tokens/total": 1048576,
"tokens/train_per_sec_per_gpu": 128.89,
"tokens/trainable": 567667
},
{
"epoch": 0.125,
"grad_norm": 0.71484375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.01736,
"step": 9,
"tokens/total": 1179648,
"tokens/train_per_sec_per_gpu": 114.58,
"tokens/trainable": 633584
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.64453125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5556640625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.73823,
"step": 10,
"tokens/total": 1310720,
"tokens/train_per_sec_per_gpu": 109.47,
"tokens/trainable": 697682
},
{
"epoch": 0.1527777777777778,
"grad_norm": 0.55078125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.521484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.57902,
"step": 11,
"tokens/total": 1441792,
"tokens/train_per_sec_per_gpu": 122.62,
"tokens/trainable": 768737
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.55859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 5.32214,
"step": 12,
"tokens/total": 1572864,
"tokens/train_per_sec_per_gpu": 112.17,
"tokens/trainable": 832992
},
{
"epoch": 0.18055555555555555,
"grad_norm": 0.5390625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.572265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.81755,
"step": 13,
"tokens/total": 1703936,
"tokens/train_per_sec_per_gpu": 107.49,
"tokens/trainable": 895620
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.51953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5751953125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.83169,
"step": 14,
"tokens/total": 1835008,
"tokens/train_per_sec_per_gpu": 109.22,
"tokens/trainable": 958297
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.50390625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.32275390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.75374,
"step": 15,
"tokens/total": 1966080,
"tokens/train_per_sec_per_gpu": 106.17,
"tokens/trainable": 1019137
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.44921875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2939453125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.64715,
"step": 16,
"tokens/total": 2097152,
"tokens/train_per_sec_per_gpu": 113.27,
"tokens/trainable": 1084516
},
{
"epoch": 0.2361111111111111,
"grad_norm": 0.451171875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3359375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.80356,
"step": 17,
"tokens/total": 2228224,
"tokens/train_per_sec_per_gpu": 112.42,
"tokens/trainable": 1148795
},
{
"epoch": 0.25,
"grad_norm": 0.4453125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.48828125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.42948,
"step": 18,
"tokens/total": 2359296,
"tokens/train_per_sec_per_gpu": 110.8,
"tokens/trainable": 1213448
},
{
"epoch": 0.2638888888888889,
"grad_norm": 0.44140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4228515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.14893,
"step": 19,
"tokens/total": 2490368,
"tokens/train_per_sec_per_gpu": 116.43,
"tokens/trainable": 1280709
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.42578125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4150390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.11665,
"step": 20,
"tokens/total": 2621440,
"tokens/train_per_sec_per_gpu": 110.64,
"tokens/trainable": 1345180
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.4453125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4140625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.11263,
"step": 21,
"tokens/total": 2752512,
"tokens/train_per_sec_per_gpu": 104.17,
"tokens/trainable": 1406772
},
{
"epoch": 0.3055555555555556,
"grad_norm": 0.4140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5234375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.58797,
"step": 22,
"tokens/total": 2883584,
"tokens/train_per_sec_per_gpu": 124.41,
"tokens/trainable": 1477071
},
{
"epoch": 0.3194444444444444,
"grad_norm": 0.41796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4541015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.28064,
"step": 23,
"tokens/total": 3014656,
"tokens/train_per_sec_per_gpu": 114.72,
"tokens/trainable": 1542448
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.37109375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.31640625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.72999,
"step": 24,
"tokens/total": 3145728,
"tokens/train_per_sec_per_gpu": 123.91,
"tokens/trainable": 1613259
},
{
"epoch": 0.3472222222222222,
"grad_norm": 0.515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.322265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.75191,
"step": 25,
"tokens/total": 3276800,
"tokens/train_per_sec_per_gpu": 110.91,
"tokens/trainable": 1676091
},
{
"epoch": 0.3611111111111111,
"grad_norm": 0.462890625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.541015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.66933,
"step": 26,
"tokens/total": 3407872,
"tokens/train_per_sec_per_gpu": 102.95,
"tokens/trainable": 1735704
},
{
"epoch": 0.375,
"grad_norm": 0.416015625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.23095703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.42451,
"step": 27,
"tokens/total": 3538944,
"tokens/train_per_sec_per_gpu": 116.23,
"tokens/trainable": 1801362
},
{
"epoch": 0.3888888888888889,
"grad_norm": 0.451171875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.37890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.97056,
"step": 28,
"tokens/total": 3670016,
"tokens/train_per_sec_per_gpu": 105.41,
"tokens/trainable": 1862049
},
{
"epoch": 0.4027777777777778,
"grad_norm": 0.408203125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3388671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.81472,
"step": 29,
"tokens/total": 3801088,
"tokens/train_per_sec_per_gpu": 102.86,
"tokens/trainable": 1921034
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.392578125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.51171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.53452,
"step": 30,
"tokens/total": 3932160,
"tokens/train_per_sec_per_gpu": 118.56,
"tokens/trainable": 1988755
},
{
"epoch": 0.4305555555555556,
"grad_norm": 0.392578125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.29833984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.66321,
"step": 31,
"tokens/total": 4063232,
"tokens/train_per_sec_per_gpu": 112.58,
"tokens/trainable": 2053308
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.376953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.234375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.43623,
"step": 32,
"tokens/total": 4194304,
"tokens/train_per_sec_per_gpu": 113.73,
"tokens/trainable": 2117979
},
{
"epoch": 0.4583333333333333,
"grad_norm": 0.41015625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4833984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.4079,
"step": 33,
"tokens/total": 4325376,
"tokens/train_per_sec_per_gpu": 116.07,
"tokens/trainable": 2183699
},
{
"epoch": 0.4722222222222222,
"grad_norm": 0.427734375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4267578125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.16517,
"step": 34,
"tokens/total": 4456448,
"tokens/train_per_sec_per_gpu": 110.79,
"tokens/trainable": 2247732
},
{
"epoch": 0.4861111111111111,
"grad_norm": 0.373046875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.29296875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.64359,
"step": 35,
"tokens/total": 4587520,
"tokens/train_per_sec_per_gpu": 116.38,
"tokens/trainable": 2313478
},
{
"epoch": 0.5,
"grad_norm": 0.3828125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2060546875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.34028,
"step": 36,
"tokens/total": 4718592,
"tokens/train_per_sec_per_gpu": 99.48,
"tokens/trainable": 2370449
},
{
"epoch": 0.5138888888888888,
"grad_norm": 0.369140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3896484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.01344,
"step": 37,
"tokens/total": 4849664,
"tokens/train_per_sec_per_gpu": 109.69,
"tokens/trainable": 2434107
},
{
"epoch": 0.5277777777777778,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3388671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.81472,
"step": 38,
"tokens/total": 4980736,
"tokens/train_per_sec_per_gpu": 109.51,
"tokens/trainable": 2497225
},
{
"epoch": 0.5416666666666666,
"grad_norm": 0.447265625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.44921875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.25979,
"step": 39,
"tokens/total": 5111808,
"tokens/train_per_sec_per_gpu": 110.86,
"tokens/trainable": 2559802
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4765625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.37787,
"step": 40,
"tokens/total": 5242880,
"tokens/train_per_sec_per_gpu": 125.32,
"tokens/trainable": 2630929
},
{
"epoch": 0.5694444444444444,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.373046875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.94736,
"step": 41,
"tokens/total": 5373952,
"tokens/train_per_sec_per_gpu": 118.02,
"tokens/trainable": 2698283
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.35791015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.88806,
"step": 42,
"tokens/total": 5505024,
"tokens/train_per_sec_per_gpu": 118.85,
"tokens/trainable": 2766688
},
{
"epoch": 0.5972222222222222,
"grad_norm": 0.3671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.544921875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.68761,
"step": 43,
"tokens/total": 5636096,
"tokens/train_per_sec_per_gpu": 120.3,
"tokens/trainable": 2836890
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.24609375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.47674,
"step": 44,
"tokens/total": 5767168,
"tokens/train_per_sec_per_gpu": 113.9,
"tokens/trainable": 2901473
},
{
"epoch": 0.625,
"grad_norm": 0.376953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.28271484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.60642,
"step": 45,
"tokens/total": 5898240,
"tokens/train_per_sec_per_gpu": 104.13,
"tokens/trainable": 2961017
},
{
"epoch": 0.6388888888888888,
"grad_norm": 0.341796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.228515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.41615,
"step": 46,
"tokens/total": 6029312,
"tokens/train_per_sec_per_gpu": 115.84,
"tokens/trainable": 3028498
},
{
"epoch": 0.6527777777777778,
"grad_norm": 0.361328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3037109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.68294,
"step": 47,
"tokens/total": 6160384,
"tokens/train_per_sec_per_gpu": 114.28,
"tokens/trainable": 3093372
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.361328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.263671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.53839,
"step": 48,
"tokens/total": 6291456,
"tokens/train_per_sec_per_gpu": 110.62,
"tokens/trainable": 3156151
},
{
"epoch": 0.6805555555555556,
"grad_norm": 0.392578125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.39697265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.04294,
"step": 49,
"tokens/total": 6422528,
"tokens/train_per_sec_per_gpu": 104.84,
"tokens/trainable": 3217411
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.392578125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.02521,
"step": 50,
"tokens/total": 6553600,
"tokens/train_per_sec_per_gpu": 104.7,
"tokens/trainable": 3277230
},
{
"epoch": 0.7083333333333334,
"grad_norm": 0.39453125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.5087890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.52125,
"step": 51,
"tokens/total": 6684672,
"tokens/train_per_sec_per_gpu": 115.14,
"tokens/trainable": 3343478
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.46337890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.32053,
"step": 52,
"tokens/total": 6815744,
"tokens/train_per_sec_per_gpu": 118.03,
"tokens/trainable": 3412165
},
{
"epoch": 0.7361111111111112,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.337890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.811,
"step": 53,
"tokens/total": 6946816,
"tokens/train_per_sec_per_gpu": 103.45,
"tokens/trainable": 3472780
},
{
"epoch": 0.75,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.12158203125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.06971,
"step": 54,
"tokens/total": 7077888,
"tokens/train_per_sec_per_gpu": 108.55,
"tokens/trainable": 3535299
},
{
"epoch": 0.7638888888888888,
"grad_norm": 0.3515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3837890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.98999,
"step": 55,
"tokens/total": 7208960,
"tokens/train_per_sec_per_gpu": 122.32,
"tokens/trainable": 3606644
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.232421875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.42953,
"step": 56,
"tokens/total": 7340032,
"tokens/train_per_sec_per_gpu": 120.64,
"tokens/trainable": 3676279
},
{
"epoch": 0.7916666666666666,
"grad_norm": 0.380859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.443359375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.2349,
"step": 57,
"tokens/total": 7471104,
"tokens/train_per_sec_per_gpu": 117.13,
"tokens/trainable": 3742502
},
{
"epoch": 0.8055555555555556,
"grad_norm": 0.390625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.26171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.53149,
"step": 58,
"tokens/total": 7602176,
"tokens/train_per_sec_per_gpu": 110.65,
"tokens/trainable": 3805375
},
{
"epoch": 0.8194444444444444,
"grad_norm": 0.353515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.11279296875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.04285,
"step": 59,
"tokens/total": 7733248,
"tokens/train_per_sec_per_gpu": 111.67,
"tokens/trainable": 3869265
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.326171875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2236328125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.39952,
"step": 60,
"tokens/total": 7864320,
"tokens/train_per_sec_per_gpu": 125.63,
"tokens/trainable": 3941933
},
{
"epoch": 0.8472222222222222,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4287109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.17332,
"step": 61,
"tokens/total": 7995392,
"tokens/train_per_sec_per_gpu": 127.75,
"tokens/trainable": 4014729
},
{
"epoch": 0.8611111111111112,
"grad_norm": 0.396484375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.28369140625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.60994,
"step": 62,
"tokens/total": 8126464,
"tokens/train_per_sec_per_gpu": 103.96,
"tokens/trainable": 4073934
},
{
"epoch": 0.875,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3369140625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.80728,
"step": 63,
"tokens/total": 8257536,
"tokens/train_per_sec_per_gpu": 104.09,
"tokens/trainable": 4133735
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.37890625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4873046875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.42515,
"step": 64,
"tokens/total": 8388608,
"tokens/train_per_sec_per_gpu": 104.66,
"tokens/trainable": 4193797
},
{
"epoch": 0.9027777777777778,
"grad_norm": 0.3984375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2041015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.33376,
"step": 65,
"tokens/total": 8519680,
"tokens/train_per_sec_per_gpu": 117.74,
"tokens/trainable": 4261340
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.384765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.220703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.38957,
"step": 66,
"tokens/total": 8650752,
"tokens/train_per_sec_per_gpu": 100.0,
"tokens/trainable": 4319643
},
{
"epoch": 0.9305555555555556,
"grad_norm": 0.341796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2412109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.4598,
"step": 67,
"tokens/total": 8781824,
"tokens/train_per_sec_per_gpu": 121.85,
"tokens/trainable": 4391039
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.361328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.228515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.41615,
"step": 68,
"tokens/total": 8912896,
"tokens/train_per_sec_per_gpu": 105.16,
"tokens/trainable": 4450874
},
{
"epoch": 0.9583333333333334,
"grad_norm": 0.380859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.158203125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.18421,
"step": 69,
"tokens/total": 9043968,
"tokens/train_per_sec_per_gpu": 116.88,
"tokens/trainable": 4517777
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.361328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.25390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.504,
"step": 70,
"tokens/total": 9175040,
"tokens/train_per_sec_per_gpu": 110.04,
"tokens/trainable": 4581277
},
{
"epoch": 0.9861111111111112,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.33056640625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.78319,
"step": 71,
"tokens/total": 9306112,
"tokens/train_per_sec_per_gpu": 118.96,
"tokens/trainable": 4650130
},
{
"epoch": 1.0,
"grad_norm": 0.33984375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3134765625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.71908,
"step": 72,
"tokens/total": 9437184,
"tokens/train_per_sec_per_gpu": 131.5,
"tokens/trainable": 4725570
},
{
"epoch": 1.0138888888888888,
"grad_norm": 0.361328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.314453125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.72271,
"step": 73,
"tokens/total": 9568256,
"tokens/train_per_sec_per_gpu": 122.77,
"tokens/trainable": 4795587
},
{
"epoch": 1.0277777777777777,
"grad_norm": 0.34375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.244140625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.46995,
"step": 74,
"tokens/total": 9699328,
"tokens/train_per_sec_per_gpu": 119.83,
"tokens/trainable": 4864063
},
{
"epoch": 1.0416666666666667,
"grad_norm": 0.3203125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.21533203125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.37141,
"step": 75,
"tokens/total": 9830400,
"tokens/train_per_sec_per_gpu": 129.56,
"tokens/trainable": 4938250
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.34375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3935546875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.02915,
"step": 76,
"tokens/total": 9961472,
"tokens/train_per_sec_per_gpu": 120.33,
"tokens/trainable": 5007488
},
{
"epoch": 1.0694444444444444,
"grad_norm": 0.341796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.06982421875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 2.91487,
"step": 77,
"tokens/total": 10092544,
"tokens/train_per_sec_per_gpu": 117.34,
"tokens/trainable": 5075100
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.33984375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.32666015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.76844,
"step": 78,
"tokens/total": 10223616,
"tokens/train_per_sec_per_gpu": 124.04,
"tokens/trainable": 5146475
},
{
"epoch": 1.0972222222222223,
"grad_norm": 0.37890625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.166015625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.20918,
"step": 79,
"tokens/total": 10354688,
"tokens/train_per_sec_per_gpu": 125.54,
"tokens/trainable": 5219025
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.33203125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3037109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.68294,
"step": 80,
"tokens/total": 10485760,
"tokens/train_per_sec_per_gpu": 128.95,
"tokens/trainable": 5293237
},
{
"epoch": 1.125,
"grad_norm": 0.349609375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1826171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.2629,
"step": 81,
"tokens/total": 10616832,
"tokens/train_per_sec_per_gpu": 115.0,
"tokens/trainable": 5359154
},
{
"epoch": 1.1388888888888888,
"grad_norm": 0.375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.34912109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.85404,
"step": 82,
"tokens/total": 10747904,
"tokens/train_per_sec_per_gpu": 111.51,
"tokens/trainable": 5423252
},
{
"epoch": 1.1527777777777777,
"grad_norm": 0.421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3427734375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.82965,
"step": 83,
"tokens/total": 10878976,
"tokens/train_per_sec_per_gpu": 123.76,
"tokens/trainable": 5494307
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.376953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4833984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.4079,
"step": 84,
"tokens/total": 11010048,
"tokens/train_per_sec_per_gpu": 111.84,
"tokens/trainable": 5558562
},
{
"epoch": 1.1805555555555556,
"grad_norm": 0.37109375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3857421875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.99779,
"step": 85,
"tokens/total": 11141120,
"tokens/train_per_sec_per_gpu": 108.82,
"tokens/trainable": 5621190
},
{
"epoch": 1.1944444444444444,
"grad_norm": 0.36328125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.396484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.04097,
"step": 86,
"tokens/total": 11272192,
"tokens/train_per_sec_per_gpu": 111.48,
"tokens/trainable": 5683867
},
{
"epoch": 1.2083333333333333,
"grad_norm": 0.376953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1533203125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.1687,
"step": 87,
"tokens/total": 11403264,
"tokens/train_per_sec_per_gpu": 107.77,
"tokens/trainable": 5744707
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.384765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1435546875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.1379,
"step": 88,
"tokens/total": 11534336,
"tokens/train_per_sec_per_gpu": 114.83,
"tokens/trainable": 5810086
},
{
"epoch": 1.2361111111111112,
"grad_norm": 0.33984375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1845703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.26928,
"step": 89,
"tokens/total": 11665408,
"tokens/train_per_sec_per_gpu": 112.52,
"tokens/trainable": 5874365
},
{
"epoch": 1.25,
"grad_norm": 0.365234375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3349609375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.79985,
"step": 90,
"tokens/total": 11796480,
"tokens/train_per_sec_per_gpu": 113.84,
"tokens/trainable": 5939018
},
{
"epoch": 1.2638888888888888,
"grad_norm": 0.375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.287109375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.6223,
"step": 91,
"tokens/total": 11927552,
"tokens/train_per_sec_per_gpu": 117.29,
"tokens/trainable": 6006279
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.28515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.61523,
"step": 92,
"tokens/total": 12058624,
"tokens/train_per_sec_per_gpu": 111.81,
"tokens/trainable": 6070750
},
{
"epoch": 1.2916666666666667,
"grad_norm": 0.3671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2763671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.5836,
"step": 93,
"tokens/total": 12189696,
"tokens/train_per_sec_per_gpu": 106.93,
"tokens/trainable": 6132342
},
{
"epoch": 1.3055555555555556,
"grad_norm": 0.451171875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.39453125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.03308,
"step": 94,
"tokens/total": 12320768,
"tokens/train_per_sec_per_gpu": 122.98,
"tokens/trainable": 6202641
},
{
"epoch": 1.3194444444444444,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.33447265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.79799,
"step": 95,
"tokens/total": 12451840,
"tokens/train_per_sec_per_gpu": 113.22,
"tokens/trainable": 6268018
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.318359375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.19970703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.31914,
"step": 96,
"tokens/total": 12582912,
"tokens/train_per_sec_per_gpu": 123.65,
"tokens/trainable": 6338829
},
{
"epoch": 1.3472222222222223,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.208984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.35008,
"step": 97,
"tokens/total": 12713984,
"tokens/train_per_sec_per_gpu": 111.02,
"tokens/trainable": 6401661
},
{
"epoch": 1.3611111111111112,
"grad_norm": 0.400390625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.41748046875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.12671,
"step": 98,
"tokens/total": 12845056,
"tokens/train_per_sec_per_gpu": 103.36,
"tokens/trainable": 6461274
},
{
"epoch": 1.375,
"grad_norm": 0.35546875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.12353515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.07571,
"step": 99,
"tokens/total": 12976128,
"tokens/train_per_sec_per_gpu": 115.06,
"tokens/trainable": 6526932
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.484375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.263671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.53839,
"step": 100,
"tokens/total": 13107200,
"tokens/train_per_sec_per_gpu": 105.89,
"tokens/trainable": 6587619
},
{
"epoch": 1.4027777777777777,
"grad_norm": 0.365234375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.22265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.3962,
"step": 101,
"tokens/total": 13238272,
"tokens/train_per_sec_per_gpu": 104.0,
"tokens/trainable": 6646604
},
{
"epoch": 1.4166666666666667,
"grad_norm": 0.37890625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.40234375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.06472,
"step": 102,
"tokens/total": 13369344,
"tokens/train_per_sec_per_gpu": 119.08,
"tokens/trainable": 6714325
},
{
"epoch": 1.4305555555555556,
"grad_norm": 0.373046875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.19384765625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.29975,
"step": 103,
"tokens/total": 13500416,
"tokens/train_per_sec_per_gpu": 112.86,
"tokens/trainable": 6778878
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.35546875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.134765625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.11044,
"step": 104,
"tokens/total": 13631488,
"tokens/train_per_sec_per_gpu": 113.25,
"tokens/trainable": 6843549
},
{
"epoch": 1.4583333333333333,
"grad_norm": 0.369140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.37890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.97056,
"step": 105,
"tokens/total": 13762560,
"tokens/train_per_sec_per_gpu": 115.15,
"tokens/trainable": 6909269
},
{
"epoch": 1.4722222222222223,
"grad_norm": 0.3671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.328125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.77396,
"step": 106,
"tokens/total": 13893632,
"tokens/train_per_sec_per_gpu": 112.97,
"tokens/trainable": 6973302
},
{
"epoch": 1.4861111111111112,
"grad_norm": 0.35546875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.201171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.32401,
"step": 107,
"tokens/total": 14024704,
"tokens/train_per_sec_per_gpu": 115.24,
"tokens/trainable": 7039048
},
{
"epoch": 1.5,
"grad_norm": 0.376953125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.107421875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.02655,
"step": 108,
"tokens/total": 14155776,
"tokens/train_per_sec_per_gpu": 98.6,
"tokens/trainable": 7096019
},
{
"epoch": 1.5138888888888888,
"grad_norm": 0.359375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2978515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.66142,
"step": 109,
"tokens/total": 14286848,
"tokens/train_per_sec_per_gpu": 111.28,
"tokens/trainable": 7159677
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.35546875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.24365234375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.46826,
"step": 110,
"tokens/total": 14417920,
"tokens/train_per_sec_per_gpu": 109.49,
"tokens/trainable": 7222795
},
{
"epoch": 1.5416666666666665,
"grad_norm": 0.412109375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.359375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.89376,
"step": 111,
"tokens/total": 14548992,
"tokens/train_per_sec_per_gpu": 110.76,
"tokens/trainable": 7285372
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.359375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.388671875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.00952,
"step": 112,
"tokens/total": 14680064,
"tokens/train_per_sec_per_gpu": 122.89,
"tokens/trainable": 7356499
},
{
"epoch": 1.5694444444444444,
"grad_norm": 0.34375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.28515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.61523,
"step": 113,
"tokens/total": 14811136,
"tokens/train_per_sec_per_gpu": 117.71,
"tokens/trainable": 7423853
},
{
"epoch": 1.5833333333333335,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.27392578125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.57486,
"step": 114,
"tokens/total": 14942208,
"tokens/train_per_sec_per_gpu": 119.69,
"tokens/trainable": 7492258
},
{
"epoch": 1.5972222222222223,
"grad_norm": 0.353515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.458984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.30159,
"step": 115,
"tokens/total": 15073280,
"tokens/train_per_sec_per_gpu": 122.55,
"tokens/trainable": 7562460
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.365234375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1708984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.22489,
"step": 116,
"tokens/total": 15204352,
"tokens/train_per_sec_per_gpu": 113.64,
"tokens/trainable": 7627043
},
{
"epoch": 1.625,
"grad_norm": 0.3671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.19775390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.31267,
"step": 117,
"tokens/total": 15335424,
"tokens/train_per_sec_per_gpu": 104.53,
"tokens/trainable": 7686587
},
{
"epoch": 1.6388888888888888,
"grad_norm": 0.341796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.15380859375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.17024,
"step": 118,
"tokens/total": 15466496,
"tokens/train_per_sec_per_gpu": 116.76,
"tokens/trainable": 7754068
},
{
"epoch": 1.6527777777777777,
"grad_norm": 0.365234375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2255859375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.40616,
"step": 119,
"tokens/total": 15597568,
"tokens/train_per_sec_per_gpu": 113.3,
"tokens/trainable": 7818942
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.341796875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.18701171875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.27727,
"step": 120,
"tokens/total": 15728640,
"tokens/train_per_sec_per_gpu": 110.17,
"tokens/trainable": 7881721
},
{
"epoch": 1.6805555555555556,
"grad_norm": 0.375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.32080078125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.74642,
"step": 121,
"tokens/total": 15859712,
"tokens/train_per_sec_per_gpu": 108.76,
"tokens/trainable": 7942981
},
{
"epoch": 1.6944444444444444,
"grad_norm": 0.380859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3115234375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.71182,
"step": 122,
"tokens/total": 15990784,
"tokens/train_per_sec_per_gpu": 104.29,
"tokens/trainable": 8002800
},
{
"epoch": 1.7083333333333335,
"grad_norm": 0.373046875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4306640625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.18148,
"step": 123,
"tokens/total": 16121856,
"tokens/train_per_sec_per_gpu": 116.36,
"tokens/trainable": 8069048
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.35546875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.39111328125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.01932,
"step": 124,
"tokens/total": 16252928,
"tokens/train_per_sec_per_gpu": 120.13,
"tokens/trainable": 8137735
},
{
"epoch": 1.7361111111111112,
"grad_norm": 0.39453125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.26318359375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.53666,
"step": 125,
"tokens/total": 16384000,
"tokens/train_per_sec_per_gpu": 105.98,
"tokens/trainable": 8198350
},
{
"epoch": 1.75,
"grad_norm": 0.353515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.05517578125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 2.87248,
"step": 126,
"tokens/total": 16515072,
"tokens/train_per_sec_per_gpu": 110.4,
"tokens/trainable": 8260869
},
{
"epoch": 1.7638888888888888,
"grad_norm": 0.38671875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.3173828125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.73364,
"step": 127,
"tokens/total": 16646144,
"tokens/train_per_sec_per_gpu": 125.4,
"tokens/trainable": 8332214
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.322265625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.16796875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.21545,
"step": 128,
"tokens/total": 16777216,
"tokens/train_per_sec_per_gpu": 124.15,
"tokens/trainable": 8401849
},
{
"epoch": 1.7916666666666665,
"grad_norm": 0.369140625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.95508,
"step": 129,
"tokens/total": 16908288,
"tokens/train_per_sec_per_gpu": 115.16,
"tokens/trainable": 8468072
},
{
"epoch": 1.8055555555555556,
"grad_norm": 0.37109375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.19970703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.31914,
"step": 130,
"tokens/total": 17039360,
"tokens/train_per_sec_per_gpu": 110.93,
"tokens/trainable": 8530945
},
{
"epoch": 1.8194444444444444,
"grad_norm": 0.3515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.05029296875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 2.85849,
"step": 131,
"tokens/total": 17170432,
"tokens/train_per_sec_per_gpu": 113.05,
"tokens/trainable": 8594835
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.326171875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1650390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.20605,
"step": 132,
"tokens/total": 17301504,
"tokens/train_per_sec_per_gpu": 127.28,
"tokens/trainable": 8667503
},
{
"epoch": 1.8472222222222223,
"grad_norm": 0.357421875,
"learning_rate": 9.999999747378752e-06,
"loss": 1.36572265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.91855,
"step": 133,
"tokens/total": 17432576,
"tokens/train_per_sec_per_gpu": 126.71,
"tokens/trainable": 8740299
},
{
"epoch": 1.8611111111111112,
"grad_norm": 0.380859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.216796875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.37636,
"step": 134,
"tokens/total": 17563648,
"tokens/train_per_sec_per_gpu": 105.63,
"tokens/trainable": 8799504
},
{
"epoch": 1.875,
"grad_norm": 0.365234375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.26953125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.55918,
"step": 135,
"tokens/total": 17694720,
"tokens/train_per_sec_per_gpu": 103.93,
"tokens/trainable": 8859305
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.380859375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.4208984375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 4.14084,
"step": 136,
"tokens/total": 17825792,
"tokens/train_per_sec_per_gpu": 105.82,
"tokens/trainable": 8919367
},
{
"epoch": 1.9027777777777777,
"grad_norm": 0.3359375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1484375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.15326,
"step": 137,
"tokens/total": 17956864,
"tokens/train_per_sec_per_gpu": 118.21,
"tokens/trainable": 8986910
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1552734375,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.17489,
"step": 138,
"tokens/total": 18087936,
"tokens/train_per_sec_per_gpu": 103.55,
"tokens/trainable": 9045213
},
{
"epoch": 1.9305555555555556,
"grad_norm": 0.345703125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1845703125,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.26928,
"step": 139,
"tokens/total": 18219008,
"tokens/train_per_sec_per_gpu": 126.11,
"tokens/trainable": 9116609
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.359375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1650390625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.20605,
"step": 140,
"tokens/total": 18350080,
"tokens/train_per_sec_per_gpu": 105.52,
"tokens/trainable": 9176444
},
{
"epoch": 1.9583333333333335,
"grad_norm": 0.34765625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.103515625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.01475,
"step": 141,
"tokens/total": 18481152,
"tokens/train_per_sec_per_gpu": 117.44,
"tokens/trainable": 9243347
},
{
"epoch": 1.9722222222222223,
"grad_norm": 0.353515625,
"learning_rate": 9.999999747378752e-06,
"loss": 1.1982421875,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.31429,
"step": 142,
"tokens/total": 18612224,
"tokens/train_per_sec_per_gpu": 112.37,
"tokens/trainable": 9306847
},
{
"epoch": 1.9861111111111112,
"grad_norm": 0.33984375,
"learning_rate": 9.999999747378752e-06,
"loss": 1.27197265625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.56788,
"step": 143,
"tokens/total": 18743296,
"tokens/train_per_sec_per_gpu": 121.56,
"tokens/trainable": 9375700
},
{
"epoch": 2.0,
"grad_norm": 0.330078125,
"learning_rate": 9.999999747378752e-06,
"loss": 1.2587890625,
"memory/device_reserved (GiB)": 22.47,
"memory/max_active (GiB)": 17.41,
"memory/max_allocated (GiB)": 17.41,
"ppl": 3.52116,
"step": 144,
"tokens/total": 18874368,
"tokens/train_per_sec_per_gpu": 132.32,
"tokens/trainable": 9451140
}
],
"logging_steps": 1,
"max_steps": 144,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 72,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1590154275447112e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}