SeedCoder-8b-LoRa-CP-120 / trainer_state.json
pandyamarut's picture
Upload folder using huggingface_hub
a94edca verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.27522935779816515,
"eval_steps": 500,
"global_step": 120,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022935779816513763,
"grad_norm": 0.12869106233119965,
"learning_rate": 0.0,
"loss": 0.1978,
"memory/device_reserved (GiB)": 50.77,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 1,
"tokens_per_second_per_gpu": 354.96
},
{
"epoch": 0.0045871559633027525,
"grad_norm": 0.15667210519313812,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.2353,
"memory/device_reserved (GiB)": 50.77,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 2,
"tokens_per_second_per_gpu": 406.37
},
{
"epoch": 0.006880733944954129,
"grad_norm": 0.2217973917722702,
"learning_rate": 9.523809523809523e-06,
"loss": 0.2243,
"memory/device_reserved (GiB)": 50.87,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 3,
"tokens_per_second_per_gpu": 371.18
},
{
"epoch": 0.009174311926605505,
"grad_norm": 0.15948686003684998,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.2392,
"memory/device_reserved (GiB)": 50.87,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 4,
"tokens_per_second_per_gpu": 414.48
},
{
"epoch": 0.011467889908256881,
"grad_norm": 0.153566375374794,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.2182,
"memory/device_reserved (GiB)": 50.87,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 5,
"tokens_per_second_per_gpu": 369.22
},
{
"epoch": 0.013761467889908258,
"grad_norm": 0.1521972268819809,
"learning_rate": 2.380952380952381e-05,
"loss": 0.2112,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 6,
"tokens_per_second_per_gpu": 429.31
},
{
"epoch": 0.016055045871559634,
"grad_norm": 0.168710395693779,
"learning_rate": 2.857142857142857e-05,
"loss": 0.226,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 7,
"tokens_per_second_per_gpu": 417.78
},
{
"epoch": 0.01834862385321101,
"grad_norm": 0.13864850997924805,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1884,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 8,
"tokens_per_second_per_gpu": 439.56
},
{
"epoch": 0.020642201834862386,
"grad_norm": 0.15227903425693512,
"learning_rate": 3.809523809523809e-05,
"loss": 0.1996,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 9,
"tokens_per_second_per_gpu": 411.33
},
{
"epoch": 0.022935779816513763,
"grad_norm": 0.13421630859375,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.1599,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 10,
"tokens_per_second_per_gpu": 496.3
},
{
"epoch": 0.02522935779816514,
"grad_norm": 0.14955134689807892,
"learning_rate": 4.761904761904762e-05,
"loss": 0.1735,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 11,
"tokens_per_second_per_gpu": 372.95
},
{
"epoch": 0.027522935779816515,
"grad_norm": 0.1432778388261795,
"learning_rate": 5.2380952380952384e-05,
"loss": 0.1515,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 12,
"tokens_per_second_per_gpu": 398.65
},
{
"epoch": 0.02981651376146789,
"grad_norm": 0.14163611829280853,
"learning_rate": 5.714285714285714e-05,
"loss": 0.1517,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 13,
"tokens_per_second_per_gpu": 440.5
},
{
"epoch": 0.03211009174311927,
"grad_norm": 0.15477906167507172,
"learning_rate": 6.19047619047619e-05,
"loss": 0.1444,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 14,
"tokens_per_second_per_gpu": 385.32
},
{
"epoch": 0.034403669724770644,
"grad_norm": 0.1055532768368721,
"learning_rate": 6.666666666666667e-05,
"loss": 0.1292,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 15,
"tokens_per_second_per_gpu": 453.02
},
{
"epoch": 0.03669724770642202,
"grad_norm": 0.10180933028459549,
"learning_rate": 7.142857142857143e-05,
"loss": 0.1208,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 16,
"tokens_per_second_per_gpu": 474.27
},
{
"epoch": 0.0389908256880734,
"grad_norm": 0.07999677956104279,
"learning_rate": 7.619047619047618e-05,
"loss": 0.132,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 17,
"tokens_per_second_per_gpu": 382.05
},
{
"epoch": 0.04128440366972477,
"grad_norm": 0.09194924682378769,
"learning_rate": 8.095238095238096e-05,
"loss": 0.1067,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 18,
"tokens_per_second_per_gpu": 398.61
},
{
"epoch": 0.04357798165137615,
"grad_norm": 0.0931428000330925,
"learning_rate": 8.571428571428571e-05,
"loss": 0.1088,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 19,
"tokens_per_second_per_gpu": 447.07
},
{
"epoch": 0.045871559633027525,
"grad_norm": 0.06202042102813721,
"learning_rate": 9.047619047619048e-05,
"loss": 0.0962,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 20,
"tokens_per_second_per_gpu": 382.57
},
{
"epoch": 0.0481651376146789,
"grad_norm": 0.04220607504248619,
"learning_rate": 9.523809523809524e-05,
"loss": 0.0963,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 21,
"tokens_per_second_per_gpu": 423.29
},
{
"epoch": 0.05045871559633028,
"grad_norm": 0.050066106021404266,
"learning_rate": 0.0001,
"loss": 0.1032,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 22,
"tokens_per_second_per_gpu": 381.35
},
{
"epoch": 0.052752293577981654,
"grad_norm": 0.0557384118437767,
"learning_rate": 9.999856734543933e-05,
"loss": 0.1025,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 23,
"tokens_per_second_per_gpu": 393.62
},
{
"epoch": 0.05504587155963303,
"grad_norm": 0.04612402245402336,
"learning_rate": 9.999426946385727e-05,
"loss": 0.0985,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 24,
"tokens_per_second_per_gpu": 515.46
},
{
"epoch": 0.05733944954128441,
"grad_norm": 0.09721734374761581,
"learning_rate": 9.998710660154898e-05,
"loss": 0.1062,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 25,
"tokens_per_second_per_gpu": 398.15
},
{
"epoch": 0.05963302752293578,
"grad_norm": 0.036745935678482056,
"learning_rate": 9.997707916899079e-05,
"loss": 0.1045,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 26,
"tokens_per_second_per_gpu": 422.42
},
{
"epoch": 0.06192660550458716,
"grad_norm": 0.04298936203122139,
"learning_rate": 9.996418774081658e-05,
"loss": 0.0923,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 27,
"tokens_per_second_per_gpu": 440.87
},
{
"epoch": 0.06422018348623854,
"grad_norm": 0.033536747097969055,
"learning_rate": 9.994843305578486e-05,
"loss": 0.096,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 28,
"tokens_per_second_per_gpu": 370.28
},
{
"epoch": 0.06651376146788991,
"grad_norm": 0.03256046772003174,
"learning_rate": 9.99298160167365e-05,
"loss": 0.0832,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 29,
"tokens_per_second_per_gpu": 357.19
},
{
"epoch": 0.06880733944954129,
"grad_norm": 0.042709868401288986,
"learning_rate": 9.990833769054293e-05,
"loss": 0.086,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 30,
"tokens_per_second_per_gpu": 441.89
},
{
"epoch": 0.07110091743119266,
"grad_norm": 0.04347776621580124,
"learning_rate": 9.988399930804504e-05,
"loss": 0.1,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 48.77,
"memory/max_allocated (GiB)": 48.77,
"step": 31,
"tokens_per_second_per_gpu": 348.66
},
{
"epoch": 0.07339449541284404,
"grad_norm": 0.030414681881666183,
"learning_rate": 9.985680226398261e-05,
"loss": 0.0811,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 32,
"tokens_per_second_per_gpu": 435.28
},
{
"epoch": 0.07568807339449542,
"grad_norm": 0.034023743122816086,
"learning_rate": 9.98267481169144e-05,
"loss": 0.0743,
"memory/device_reserved (GiB)": 50.93,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 33,
"tokens_per_second_per_gpu": 482.51
},
{
"epoch": 0.0779816513761468,
"grad_norm": 0.03136487305164337,
"learning_rate": 9.979383858912885e-05,
"loss": 0.0739,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.08,
"memory/max_allocated (GiB)": 49.08,
"step": 34,
"tokens_per_second_per_gpu": 496.59
},
{
"epoch": 0.08027522935779817,
"grad_norm": 0.028108298778533936,
"learning_rate": 9.975807556654537e-05,
"loss": 0.077,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 35,
"tokens_per_second_per_gpu": 349.1
},
{
"epoch": 0.08256880733944955,
"grad_norm": 0.028020795434713364,
"learning_rate": 9.971946109860626e-05,
"loss": 0.0775,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 36,
"tokens_per_second_per_gpu": 351.02
},
{
"epoch": 0.08486238532110092,
"grad_norm": 0.028756650164723396,
"learning_rate": 9.967799739815925e-05,
"loss": 0.0788,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 37,
"tokens_per_second_per_gpu": 534.52
},
{
"epoch": 0.0871559633027523,
"grad_norm": 0.02806459739804268,
"learning_rate": 9.963368684133072e-05,
"loss": 0.0809,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 38,
"tokens_per_second_per_gpu": 367.94
},
{
"epoch": 0.08944954128440367,
"grad_norm": 0.02387731708586216,
"learning_rate": 9.958653196738954e-05,
"loss": 0.0642,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 39,
"tokens_per_second_per_gpu": 466.74
},
{
"epoch": 0.09174311926605505,
"grad_norm": 0.027889851480722427,
"learning_rate": 9.953653547860151e-05,
"loss": 0.0904,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 40,
"tokens_per_second_per_gpu": 371.51
},
{
"epoch": 0.09403669724770643,
"grad_norm": 0.031659577041864395,
"learning_rate": 9.948370024007454e-05,
"loss": 0.081,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 41,
"tokens_per_second_per_gpu": 479.04
},
{
"epoch": 0.0963302752293578,
"grad_norm": 0.03186093270778656,
"learning_rate": 9.942802927959443e-05,
"loss": 0.0881,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 42,
"tokens_per_second_per_gpu": 364.73
},
{
"epoch": 0.09862385321100918,
"grad_norm": 0.0313677079975605,
"learning_rate": 9.936952578745142e-05,
"loss": 0.0808,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 43,
"tokens_per_second_per_gpu": 418.0
},
{
"epoch": 0.10091743119266056,
"grad_norm": 0.0264989472925663,
"learning_rate": 9.93081931162573e-05,
"loss": 0.0664,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 44,
"tokens_per_second_per_gpu": 439.24
},
{
"epoch": 0.10321100917431193,
"grad_norm": 0.026272334158420563,
"learning_rate": 9.92440347807533e-05,
"loss": 0.0683,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 45,
"tokens_per_second_per_gpu": 482.81
},
{
"epoch": 0.10550458715596331,
"grad_norm": 0.029066840186715126,
"learning_rate": 9.91770544576087e-05,
"loss": 0.0737,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 46,
"tokens_per_second_per_gpu": 389.87
},
{
"epoch": 0.10779816513761468,
"grad_norm": 0.024542706087231636,
"learning_rate": 9.910725598521013e-05,
"loss": 0.0737,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 47,
"tokens_per_second_per_gpu": 473.12
},
{
"epoch": 0.11009174311926606,
"grad_norm": 0.042941153049468994,
"learning_rate": 9.90346433634416e-05,
"loss": 0.0951,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 48,
"tokens_per_second_per_gpu": 325.12
},
{
"epoch": 0.11238532110091744,
"grad_norm": 0.029044413939118385,
"learning_rate": 9.89592207534552e-05,
"loss": 0.0745,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.73,
"memory/max_allocated (GiB)": 48.73,
"step": 49,
"tokens_per_second_per_gpu": 315.62
},
{
"epoch": 0.11467889908256881,
"grad_norm": 0.028920788317918777,
"learning_rate": 9.888099247743283e-05,
"loss": 0.0818,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 50,
"tokens_per_second_per_gpu": 441.3
},
{
"epoch": 0.11697247706422019,
"grad_norm": 0.026095205917954445,
"learning_rate": 9.879996301833833e-05,
"loss": 0.0688,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 51,
"tokens_per_second_per_gpu": 386.22
},
{
"epoch": 0.11926605504587157,
"grad_norm": 0.024823926389217377,
"learning_rate": 9.871613701966067e-05,
"loss": 0.0701,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 52,
"tokens_per_second_per_gpu": 511.32
},
{
"epoch": 0.12155963302752294,
"grad_norm": 0.036093298345804214,
"learning_rate": 9.862951928514782e-05,
"loss": 0.0823,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 53,
"tokens_per_second_per_gpu": 323.2
},
{
"epoch": 0.12385321100917432,
"grad_norm": 0.03257686272263527,
"learning_rate": 9.854011477853146e-05,
"loss": 0.0769,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 54,
"tokens_per_second_per_gpu": 447.62
},
{
"epoch": 0.12614678899082568,
"grad_norm": 0.03413158655166626,
"learning_rate": 9.844792862324258e-05,
"loss": 0.0728,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 55,
"tokens_per_second_per_gpu": 451.05
},
{
"epoch": 0.12844036697247707,
"grad_norm": 0.02947932481765747,
"learning_rate": 9.835296610211779e-05,
"loss": 0.0713,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 56,
"tokens_per_second_per_gpu": 457.44
},
{
"epoch": 0.13073394495412843,
"grad_norm": 0.0220651775598526,
"learning_rate": 9.825523265709666e-05,
"loss": 0.0607,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 57,
"tokens_per_second_per_gpu": 456.49
},
{
"epoch": 0.13302752293577982,
"grad_norm": 0.026394842192530632,
"learning_rate": 9.815473388890983e-05,
"loss": 0.0716,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 58,
"tokens_per_second_per_gpu": 393.95
},
{
"epoch": 0.1353211009174312,
"grad_norm": 0.027936838567256927,
"learning_rate": 9.805147555675805e-05,
"loss": 0.0738,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 59,
"tokens_per_second_per_gpu": 464.83
},
{
"epoch": 0.13761467889908258,
"grad_norm": 0.023982539772987366,
"learning_rate": 9.794546357798208e-05,
"loss": 0.0608,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 60,
"tokens_per_second_per_gpu": 450.66
},
{
"epoch": 0.13990825688073394,
"grad_norm": 0.027479754760861397,
"learning_rate": 9.783670402772379e-05,
"loss": 0.0672,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 61,
"tokens_per_second_per_gpu": 455.94
},
{
"epoch": 0.14220183486238533,
"grad_norm": 0.02617599070072174,
"learning_rate": 9.772520313857775e-05,
"loss": 0.0804,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 62,
"tokens_per_second_per_gpu": 394.85
},
{
"epoch": 0.1444954128440367,
"grad_norm": 0.030884992331266403,
"learning_rate": 9.761096730023432e-05,
"loss": 0.0768,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 63,
"tokens_per_second_per_gpu": 446.63
},
{
"epoch": 0.14678899082568808,
"grad_norm": 0.027579287067055702,
"learning_rate": 9.749400305911322e-05,
"loss": 0.0659,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 64,
"tokens_per_second_per_gpu": 484.34
},
{
"epoch": 0.14908256880733944,
"grad_norm": 0.030303625389933586,
"learning_rate": 9.737431711798864e-05,
"loss": 0.0645,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 65,
"tokens_per_second_per_gpu": 437.07
},
{
"epoch": 0.15137614678899083,
"grad_norm": 0.027446158230304718,
"learning_rate": 9.725191633560491e-05,
"loss": 0.08,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 66,
"tokens_per_second_per_gpu": 411.5
},
{
"epoch": 0.1536697247706422,
"grad_norm": 0.03177177160978317,
"learning_rate": 9.712680772628364e-05,
"loss": 0.0801,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 67,
"tokens_per_second_per_gpu": 429.18
},
{
"epoch": 0.1559633027522936,
"grad_norm": 0.0288909412920475,
"learning_rate": 9.69989984595216e-05,
"loss": 0.0707,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 68,
"tokens_per_second_per_gpu": 408.55
},
{
"epoch": 0.15825688073394495,
"grad_norm": 0.02751251310110092,
"learning_rate": 9.686849585957994e-05,
"loss": 0.0736,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 69,
"tokens_per_second_per_gpu": 420.0
},
{
"epoch": 0.16055045871559634,
"grad_norm": 0.023428168147802353,
"learning_rate": 9.673530740506447e-05,
"loss": 0.0648,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 70,
"tokens_per_second_per_gpu": 512.59
},
{
"epoch": 0.1628440366972477,
"grad_norm": 0.031534772366285324,
"learning_rate": 9.659944072849707e-05,
"loss": 0.0818,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 71,
"tokens_per_second_per_gpu": 456.9
},
{
"epoch": 0.1651376146788991,
"grad_norm": 0.027208171784877777,
"learning_rate": 9.646090361587827e-05,
"loss": 0.0709,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 72,
"tokens_per_second_per_gpu": 378.48
},
{
"epoch": 0.16743119266055045,
"grad_norm": 0.02961639314889908,
"learning_rate": 9.631970400624113e-05,
"loss": 0.0764,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 73,
"tokens_per_second_per_gpu": 316.38
},
{
"epoch": 0.16972477064220184,
"grad_norm": 0.027367761358618736,
"learning_rate": 9.617584999119625e-05,
"loss": 0.0672,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 74,
"tokens_per_second_per_gpu": 402.44
},
{
"epoch": 0.1720183486238532,
"grad_norm": 0.030167503282427788,
"learning_rate": 9.602934981446803e-05,
"loss": 0.0743,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 75,
"tokens_per_second_per_gpu": 531.29
},
{
"epoch": 0.1743119266055046,
"grad_norm": 0.0387263149023056,
"learning_rate": 9.588021187142235e-05,
"loss": 0.083,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 76,
"tokens_per_second_per_gpu": 424.59
},
{
"epoch": 0.17660550458715596,
"grad_norm": 0.027617793530225754,
"learning_rate": 9.572844470858537e-05,
"loss": 0.0769,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 77,
"tokens_per_second_per_gpu": 461.9
},
{
"epoch": 0.17889908256880735,
"grad_norm": 0.029771512374281883,
"learning_rate": 9.557405702315381e-05,
"loss": 0.0658,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 78,
"tokens_per_second_per_gpu": 475.77
},
{
"epoch": 0.1811926605504587,
"grad_norm": 0.029358675703406334,
"learning_rate": 9.541705766249655e-05,
"loss": 0.066,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 79,
"tokens_per_second_per_gpu": 489.33
},
{
"epoch": 0.1834862385321101,
"grad_norm": 0.023111771792173386,
"learning_rate": 9.525745562364756e-05,
"loss": 0.066,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 80,
"tokens_per_second_per_gpu": 382.84
},
{
"epoch": 0.18577981651376146,
"grad_norm": 0.029448291286826134,
"learning_rate": 9.509526005279044e-05,
"loss": 0.0608,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 81,
"tokens_per_second_per_gpu": 415.81
},
{
"epoch": 0.18807339449541285,
"grad_norm": 0.02794116735458374,
"learning_rate": 9.493048024473412e-05,
"loss": 0.0736,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 82,
"tokens_per_second_per_gpu": 400.02
},
{
"epoch": 0.19036697247706422,
"grad_norm": 0.04534873738884926,
"learning_rate": 9.476312564238034e-05,
"loss": 0.0673,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 83,
"tokens_per_second_per_gpu": 369.1
},
{
"epoch": 0.1926605504587156,
"grad_norm": 0.026540853083133698,
"learning_rate": 9.459320583618252e-05,
"loss": 0.0558,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 84,
"tokens_per_second_per_gpu": 611.61
},
{
"epoch": 0.19495412844036697,
"grad_norm": 0.03129403293132782,
"learning_rate": 9.442073056359604e-05,
"loss": 0.0741,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 85,
"tokens_per_second_per_gpu": 492.16
},
{
"epoch": 0.19724770642201836,
"grad_norm": 0.027526071295142174,
"learning_rate": 9.424570970852034e-05,
"loss": 0.0733,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 86,
"tokens_per_second_per_gpu": 427.76
},
{
"epoch": 0.19954128440366972,
"grad_norm": 0.025468798354268074,
"learning_rate": 9.406815330073244e-05,
"loss": 0.0613,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 87,
"tokens_per_second_per_gpu": 462.82
},
{
"epoch": 0.2018348623853211,
"grad_norm": 0.029043635353446007,
"learning_rate": 9.388807151531229e-05,
"loss": 0.0758,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 88,
"tokens_per_second_per_gpu": 353.91
},
{
"epoch": 0.20412844036697247,
"grad_norm": 0.03196391835808754,
"learning_rate": 9.37054746720595e-05,
"loss": 0.0678,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 89,
"tokens_per_second_per_gpu": 411.71
},
{
"epoch": 0.20642201834862386,
"grad_norm": 0.033272091299295425,
"learning_rate": 9.352037323490208e-05,
"loss": 0.0722,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 90,
"tokens_per_second_per_gpu": 398.81
},
{
"epoch": 0.20871559633027523,
"grad_norm": 0.03096090629696846,
"learning_rate": 9.333277781129678e-05,
"loss": 0.0809,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 91,
"tokens_per_second_per_gpu": 393.81
},
{
"epoch": 0.21100917431192662,
"grad_norm": 0.026267440989613533,
"learning_rate": 9.314269915162114e-05,
"loss": 0.0604,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 92,
"tokens_per_second_per_gpu": 453.78
},
{
"epoch": 0.21330275229357798,
"grad_norm": 0.02608361840248108,
"learning_rate": 9.295014814855753e-05,
"loss": 0.0663,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 93,
"tokens_per_second_per_gpu": 430.47
},
{
"epoch": 0.21559633027522937,
"grad_norm": 0.024829065427184105,
"learning_rate": 9.275513583646884e-05,
"loss": 0.0598,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 94,
"tokens_per_second_per_gpu": 384.01
},
{
"epoch": 0.21788990825688073,
"grad_norm": 0.03385532647371292,
"learning_rate": 9.255767339076622e-05,
"loss": 0.0719,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 95,
"tokens_per_second_per_gpu": 440.35
},
{
"epoch": 0.22018348623853212,
"grad_norm": 0.029608217999339104,
"learning_rate": 9.23577721272686e-05,
"loss": 0.094,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.04,
"memory/max_allocated (GiB)": 49.04,
"step": 96,
"tokens_per_second_per_gpu": 485.56
},
{
"epoch": 0.22247706422018348,
"grad_norm": 0.02693762816488743,
"learning_rate": 9.215544350155422e-05,
"loss": 0.0755,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 97,
"tokens_per_second_per_gpu": 432.16
},
{
"epoch": 0.22477064220183487,
"grad_norm": 0.02771424688398838,
"learning_rate": 9.195069910830427e-05,
"loss": 0.0692,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 98,
"tokens_per_second_per_gpu": 412.93
},
{
"epoch": 0.22706422018348624,
"grad_norm": 0.02276022732257843,
"learning_rate": 9.174355068063828e-05,
"loss": 0.0637,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 99,
"tokens_per_second_per_gpu": 418.24
},
{
"epoch": 0.22935779816513763,
"grad_norm": 0.026155246421694756,
"learning_rate": 9.15340100894418e-05,
"loss": 0.0698,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 100,
"tokens_per_second_per_gpu": 403.6
},
{
"epoch": 0.231651376146789,
"grad_norm": 0.022778436541557312,
"learning_rate": 9.132208934268622e-05,
"loss": 0.0654,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 101,
"tokens_per_second_per_gpu": 491.32
},
{
"epoch": 0.23394495412844038,
"grad_norm": 0.04701945558190346,
"learning_rate": 9.110780058474052e-05,
"loss": 0.0741,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 102,
"tokens_per_second_per_gpu": 444.03
},
{
"epoch": 0.23623853211009174,
"grad_norm": 0.030211661010980606,
"learning_rate": 9.08911560956753e-05,
"loss": 0.0789,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 103,
"tokens_per_second_per_gpu": 514.87
},
{
"epoch": 0.23853211009174313,
"grad_norm": 0.026159459725022316,
"learning_rate": 9.067216829055922e-05,
"loss": 0.0637,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 104,
"tokens_per_second_per_gpu": 446.47
},
{
"epoch": 0.2408256880733945,
"grad_norm": 0.02918146923184395,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0727,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 105,
"tokens_per_second_per_gpu": 425.37
},
{
"epoch": 0.24311926605504589,
"grad_norm": 0.03170175105333328,
"learning_rate": 9.022721306316222e-05,
"loss": 0.0857,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.85,
"memory/max_allocated (GiB)": 48.85,
"step": 106,
"tokens_per_second_per_gpu": 301.79
},
{
"epoch": 0.24541284403669725,
"grad_norm": 0.032674651592969894,
"learning_rate": 9.000127113956674e-05,
"loss": 0.0795,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.77,
"memory/max_allocated (GiB)": 48.77,
"step": 107,
"tokens_per_second_per_gpu": 338.41
},
{
"epoch": 0.24770642201834864,
"grad_norm": 0.026492780074477196,
"learning_rate": 8.977303689583e-05,
"loss": 0.0775,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 108,
"tokens_per_second_per_gpu": 383.35
},
{
"epoch": 0.25,
"grad_norm": 0.0290480125695467,
"learning_rate": 8.954252341118523e-05,
"loss": 0.076,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 109,
"tokens_per_second_per_gpu": 382.78
},
{
"epoch": 0.25229357798165136,
"grad_norm": 0.030473977327346802,
"learning_rate": 8.930974389548023e-05,
"loss": 0.0761,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.93,
"memory/max_allocated (GiB)": 48.93,
"step": 110,
"tokens_per_second_per_gpu": 476.56
},
{
"epoch": 0.2545871559633027,
"grad_norm": 0.02930077351629734,
"learning_rate": 8.90747116884204e-05,
"loss": 0.0691,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 111,
"tokens_per_second_per_gpu": 441.2
},
{
"epoch": 0.25688073394495414,
"grad_norm": 0.02884151227772236,
"learning_rate": 8.883744025880428e-05,
"loss": 0.0806,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 112,
"tokens_per_second_per_gpu": 406.96
},
{
"epoch": 0.2591743119266055,
"grad_norm": 0.02618175558745861,
"learning_rate": 8.859794320375168e-05,
"loss": 0.0677,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 113,
"tokens_per_second_per_gpu": 430.04
},
{
"epoch": 0.26146788990825687,
"grad_norm": 0.026963548734784126,
"learning_rate": 8.835623424792452e-05,
"loss": 0.0694,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.89,
"memory/max_allocated (GiB)": 48.89,
"step": 114,
"tokens_per_second_per_gpu": 351.9
},
{
"epoch": 0.26376146788990823,
"grad_norm": 0.021544624119997025,
"learning_rate": 8.811232724274035e-05,
"loss": 0.0613,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 115,
"tokens_per_second_per_gpu": 480.22
},
{
"epoch": 0.26605504587155965,
"grad_norm": 0.03840009495615959,
"learning_rate": 8.786623616557847e-05,
"loss": 0.0723,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 116,
"tokens_per_second_per_gpu": 433.18
},
{
"epoch": 0.268348623853211,
"grad_norm": 0.022571468725800514,
"learning_rate": 8.761797511897906e-05,
"loss": 0.065,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 49.0,
"memory/max_allocated (GiB)": 49.0,
"step": 117,
"tokens_per_second_per_gpu": 421.92
},
{
"epoch": 0.2706422018348624,
"grad_norm": 0.02688576467335224,
"learning_rate": 8.736755832983497e-05,
"loss": 0.0772,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 118,
"tokens_per_second_per_gpu": 354.3
},
{
"epoch": 0.27293577981651373,
"grad_norm": 0.025858785957098007,
"learning_rate": 8.711500014857634e-05,
"loss": 0.0745,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.81,
"memory/max_allocated (GiB)": 48.81,
"step": 119,
"tokens_per_second_per_gpu": 365.46
},
{
"epoch": 0.27522935779816515,
"grad_norm": 0.02718079835176468,
"learning_rate": 8.686031504834843e-05,
"loss": 0.0759,
"memory/device_reserved (GiB)": 50.97,
"memory/max_active (GiB)": 48.97,
"memory/max_allocated (GiB)": 48.97,
"step": 120,
"tokens_per_second_per_gpu": 426.06
}
],
"logging_steps": 1,
"max_steps": 436,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4689538053609882e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}