diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4830 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 436, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0022935779816513763, + "grad_norm": 0.12869106233119965, + "learning_rate": 0.0, + "loss": 0.1978, + "memory/device_reserved (GiB)": 50.77, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 1, + "tokens_per_second_per_gpu": 354.96 + }, + { + "epoch": 0.0045871559633027525, + "grad_norm": 0.15667210519313812, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.2353, + "memory/device_reserved (GiB)": 50.77, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 2, + "tokens_per_second_per_gpu": 406.37 + }, + { + "epoch": 0.006880733944954129, + "grad_norm": 0.2217973917722702, + "learning_rate": 9.523809523809523e-06, + "loss": 0.2243, + "memory/device_reserved (GiB)": 50.87, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 3, + "tokens_per_second_per_gpu": 371.18 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 0.15948686003684998, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.2392, + "memory/device_reserved (GiB)": 50.87, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 4, + "tokens_per_second_per_gpu": 414.48 + }, + { + "epoch": 0.011467889908256881, + "grad_norm": 0.153566375374794, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.2182, + "memory/device_reserved (GiB)": 50.87, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 5, + "tokens_per_second_per_gpu": 369.22 + }, + { + "epoch": 0.013761467889908258, + "grad_norm": 0.1521972268819809, + "learning_rate": 2.380952380952381e-05, + "loss": 0.2112, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 6, + "tokens_per_second_per_gpu": 429.31 + }, + { + "epoch": 0.016055045871559634, + "grad_norm": 0.168710395693779, + "learning_rate": 2.857142857142857e-05, + "loss": 0.226, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 7, + "tokens_per_second_per_gpu": 417.78 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 0.13864850997924805, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.1884, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 8, + "tokens_per_second_per_gpu": 439.56 + }, + { + "epoch": 0.020642201834862386, + "grad_norm": 0.15227903425693512, + "learning_rate": 3.809523809523809e-05, + "loss": 0.1996, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 9, + "tokens_per_second_per_gpu": 411.33 + }, + { + "epoch": 0.022935779816513763, + "grad_norm": 0.13421630859375, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.1599, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 10, + "tokens_per_second_per_gpu": 496.3 + }, + { + "epoch": 0.02522935779816514, + "grad_norm": 0.14955134689807892, + "learning_rate": 4.761904761904762e-05, + "loss": 0.1735, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 11, + "tokens_per_second_per_gpu": 372.95 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 0.1432778388261795, + "learning_rate": 5.2380952380952384e-05, + "loss": 0.1515, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 12, + "tokens_per_second_per_gpu": 398.65 + }, + { + "epoch": 0.02981651376146789, + "grad_norm": 0.14163611829280853, + "learning_rate": 5.714285714285714e-05, + "loss": 0.1517, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 13, + "tokens_per_second_per_gpu": 440.5 + }, + { + "epoch": 0.03211009174311927, + "grad_norm": 0.15477906167507172, + "learning_rate": 6.19047619047619e-05, + "loss": 0.1444, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 14, + "tokens_per_second_per_gpu": 385.32 + }, + { + "epoch": 0.034403669724770644, + "grad_norm": 0.1055532768368721, + "learning_rate": 6.666666666666667e-05, + "loss": 0.1292, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 15, + "tokens_per_second_per_gpu": 453.02 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 0.10180933028459549, + "learning_rate": 7.142857142857143e-05, + "loss": 0.1208, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 16, + "tokens_per_second_per_gpu": 474.27 + }, + { + "epoch": 0.0389908256880734, + "grad_norm": 0.07999677956104279, + "learning_rate": 7.619047619047618e-05, + "loss": 0.132, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 17, + "tokens_per_second_per_gpu": 382.05 + }, + { + "epoch": 0.04128440366972477, + "grad_norm": 0.09194924682378769, + "learning_rate": 8.095238095238096e-05, + "loss": 0.1067, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 18, + "tokens_per_second_per_gpu": 398.61 + }, + { + "epoch": 0.04357798165137615, + "grad_norm": 0.0931428000330925, + "learning_rate": 8.571428571428571e-05, + "loss": 0.1088, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 19, + "tokens_per_second_per_gpu": 447.07 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 0.06202042102813721, + "learning_rate": 9.047619047619048e-05, + "loss": 0.0962, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 20, + "tokens_per_second_per_gpu": 382.57 + }, + { + "epoch": 0.0481651376146789, + "grad_norm": 0.04220607504248619, + "learning_rate": 9.523809523809524e-05, + "loss": 0.0963, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 21, + "tokens_per_second_per_gpu": 423.29 + }, + { + "epoch": 0.05045871559633028, + "grad_norm": 0.050066106021404266, + "learning_rate": 0.0001, + "loss": 0.1032, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 22, + "tokens_per_second_per_gpu": 381.35 + }, + { + "epoch": 0.052752293577981654, + "grad_norm": 0.0557384118437767, + "learning_rate": 9.999856734543933e-05, + "loss": 0.1025, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 23, + "tokens_per_second_per_gpu": 393.62 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 0.04612402245402336, + "learning_rate": 9.999426946385727e-05, + "loss": 0.0985, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 24, + "tokens_per_second_per_gpu": 515.46 + }, + { + "epoch": 0.05733944954128441, + "grad_norm": 0.09721734374761581, + "learning_rate": 9.998710660154898e-05, + "loss": 0.1062, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 25, + "tokens_per_second_per_gpu": 398.15 + }, + { + "epoch": 0.05963302752293578, + "grad_norm": 0.036745935678482056, + "learning_rate": 9.997707916899079e-05, + "loss": 0.1045, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 26, + "tokens_per_second_per_gpu": 422.42 + }, + { + "epoch": 0.06192660550458716, + "grad_norm": 0.04298936203122139, + "learning_rate": 9.996418774081658e-05, + "loss": 0.0923, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 27, + "tokens_per_second_per_gpu": 440.87 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 0.033536747097969055, + "learning_rate": 9.994843305578486e-05, + "loss": 0.096, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 28, + "tokens_per_second_per_gpu": 370.28 + }, + { + "epoch": 0.06651376146788991, + "grad_norm": 0.03256046772003174, + "learning_rate": 9.99298160167365e-05, + "loss": 0.0832, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 29, + "tokens_per_second_per_gpu": 357.19 + }, + { + "epoch": 0.06880733944954129, + "grad_norm": 0.042709868401288986, + "learning_rate": 9.990833769054293e-05, + "loss": 0.086, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 30, + "tokens_per_second_per_gpu": 441.89 + }, + { + "epoch": 0.07110091743119266, + "grad_norm": 0.04347776621580124, + "learning_rate": 9.988399930804504e-05, + "loss": 0.1, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 31, + "tokens_per_second_per_gpu": 348.66 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 0.030414681881666183, + "learning_rate": 9.985680226398261e-05, + "loss": 0.0811, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 32, + "tokens_per_second_per_gpu": 435.28 + }, + { + "epoch": 0.07568807339449542, + "grad_norm": 0.034023743122816086, + "learning_rate": 9.98267481169144e-05, + "loss": 0.0743, + "memory/device_reserved (GiB)": 50.93, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 33, + "tokens_per_second_per_gpu": 482.51 + }, + { + "epoch": 0.0779816513761468, + "grad_norm": 0.03136487305164337, + "learning_rate": 9.979383858912885e-05, + "loss": 0.0739, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 34, + "tokens_per_second_per_gpu": 496.59 + }, + { + "epoch": 0.08027522935779817, + "grad_norm": 0.028108298778533936, + "learning_rate": 9.975807556654537e-05, + "loss": 0.077, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 35, + "tokens_per_second_per_gpu": 349.1 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 0.028020795434713364, + "learning_rate": 9.971946109860626e-05, + "loss": 0.0775, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 36, + "tokens_per_second_per_gpu": 351.02 + }, + { + "epoch": 0.08486238532110092, + "grad_norm": 0.028756650164723396, + "learning_rate": 9.967799739815925e-05, + "loss": 0.0788, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 37, + "tokens_per_second_per_gpu": 534.52 + }, + { + "epoch": 0.0871559633027523, + "grad_norm": 0.02806459739804268, + "learning_rate": 9.963368684133072e-05, + "loss": 0.0809, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 38, + "tokens_per_second_per_gpu": 367.94 + }, + { + "epoch": 0.08944954128440367, + "grad_norm": 0.02387731708586216, + "learning_rate": 9.958653196738954e-05, + "loss": 0.0642, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 39, + "tokens_per_second_per_gpu": 466.74 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 0.027889851480722427, + "learning_rate": 9.953653547860151e-05, + "loss": 0.0904, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 40, + "tokens_per_second_per_gpu": 371.51 + }, + { + "epoch": 0.09403669724770643, + "grad_norm": 0.031659577041864395, + "learning_rate": 9.948370024007454e-05, + "loss": 0.081, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 41, + "tokens_per_second_per_gpu": 479.04 + }, + { + "epoch": 0.0963302752293578, + "grad_norm": 0.03186093270778656, + "learning_rate": 9.942802927959443e-05, + "loss": 0.0881, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 42, + "tokens_per_second_per_gpu": 364.73 + }, + { + "epoch": 0.09862385321100918, + "grad_norm": 0.0313677079975605, + "learning_rate": 9.936952578745142e-05, + "loss": 0.0808, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 43, + "tokens_per_second_per_gpu": 418.0 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 0.0264989472925663, + "learning_rate": 9.93081931162573e-05, + "loss": 0.0664, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 44, + "tokens_per_second_per_gpu": 439.24 + }, + { + "epoch": 0.10321100917431193, + "grad_norm": 0.026272334158420563, + "learning_rate": 9.92440347807533e-05, + "loss": 0.0683, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 45, + "tokens_per_second_per_gpu": 482.81 + }, + { + "epoch": 0.10550458715596331, + "grad_norm": 0.029066840186715126, + "learning_rate": 9.91770544576087e-05, + "loss": 0.0737, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 46, + "tokens_per_second_per_gpu": 389.87 + }, + { + "epoch": 0.10779816513761468, + "grad_norm": 0.024542706087231636, + "learning_rate": 9.910725598521013e-05, + "loss": 0.0737, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 47, + "tokens_per_second_per_gpu": 473.12 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 0.042941153049468994, + "learning_rate": 9.90346433634416e-05, + "loss": 0.0951, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 48, + "tokens_per_second_per_gpu": 325.12 + }, + { + "epoch": 0.11238532110091744, + "grad_norm": 0.029044413939118385, + "learning_rate": 9.89592207534552e-05, + "loss": 0.0745, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.73, + "memory/max_allocated (GiB)": 48.73, + "step": 49, + "tokens_per_second_per_gpu": 315.62 + }, + { + "epoch": 0.11467889908256881, + "grad_norm": 0.028920788317918777, + "learning_rate": 9.888099247743283e-05, + "loss": 0.0818, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 50, + "tokens_per_second_per_gpu": 441.3 + }, + { + "epoch": 0.11697247706422019, + "grad_norm": 0.026095205917954445, + "learning_rate": 9.879996301833833e-05, + "loss": 0.0688, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 51, + "tokens_per_second_per_gpu": 386.22 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 0.024823926389217377, + "learning_rate": 9.871613701966067e-05, + "loss": 0.0701, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 52, + "tokens_per_second_per_gpu": 511.32 + }, + { + "epoch": 0.12155963302752294, + "grad_norm": 0.036093298345804214, + "learning_rate": 9.862951928514782e-05, + "loss": 0.0823, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 53, + "tokens_per_second_per_gpu": 323.2 + }, + { + "epoch": 0.12385321100917432, + "grad_norm": 0.03257686272263527, + "learning_rate": 9.854011477853146e-05, + "loss": 0.0769, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 54, + "tokens_per_second_per_gpu": 447.62 + }, + { + "epoch": 0.12614678899082568, + "grad_norm": 0.03413158655166626, + "learning_rate": 9.844792862324258e-05, + "loss": 0.0728, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 55, + "tokens_per_second_per_gpu": 451.05 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 0.02947932481765747, + "learning_rate": 9.835296610211779e-05, + "loss": 0.0713, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 56, + "tokens_per_second_per_gpu": 457.44 + }, + { + "epoch": 0.13073394495412843, + "grad_norm": 0.0220651775598526, + "learning_rate": 9.825523265709666e-05, + "loss": 0.0607, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 57, + "tokens_per_second_per_gpu": 456.49 + }, + { + "epoch": 0.13302752293577982, + "grad_norm": 0.026394842192530632, + "learning_rate": 9.815473388890983e-05, + "loss": 0.0716, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 58, + "tokens_per_second_per_gpu": 393.95 + }, + { + "epoch": 0.1353211009174312, + "grad_norm": 0.027936838567256927, + "learning_rate": 9.805147555675805e-05, + "loss": 0.0738, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 59, + "tokens_per_second_per_gpu": 464.83 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 0.023982539772987366, + "learning_rate": 9.794546357798208e-05, + "loss": 0.0608, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 60, + "tokens_per_second_per_gpu": 450.66 + }, + { + "epoch": 0.13990825688073394, + "grad_norm": 0.027479754760861397, + "learning_rate": 9.783670402772379e-05, + "loss": 0.0672, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 61, + "tokens_per_second_per_gpu": 455.94 + }, + { + "epoch": 0.14220183486238533, + "grad_norm": 0.02617599070072174, + "learning_rate": 9.772520313857775e-05, + "loss": 0.0804, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 62, + "tokens_per_second_per_gpu": 394.85 + }, + { + "epoch": 0.1444954128440367, + "grad_norm": 0.030884992331266403, + "learning_rate": 9.761096730023432e-05, + "loss": 0.0768, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 63, + "tokens_per_second_per_gpu": 446.63 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 0.027579287067055702, + "learning_rate": 9.749400305911322e-05, + "loss": 0.0659, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 64, + "tokens_per_second_per_gpu": 484.34 + }, + { + "epoch": 0.14908256880733944, + "grad_norm": 0.030303625389933586, + "learning_rate": 9.737431711798864e-05, + "loss": 0.0645, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 65, + "tokens_per_second_per_gpu": 437.07 + }, + { + "epoch": 0.15137614678899083, + "grad_norm": 0.027446158230304718, + "learning_rate": 9.725191633560491e-05, + "loss": 0.08, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 66, + "tokens_per_second_per_gpu": 411.5 + }, + { + "epoch": 0.1536697247706422, + "grad_norm": 0.03177177160978317, + "learning_rate": 9.712680772628364e-05, + "loss": 0.0801, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 67, + "tokens_per_second_per_gpu": 429.18 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 0.0288909412920475, + "learning_rate": 9.69989984595216e-05, + "loss": 0.0707, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 68, + "tokens_per_second_per_gpu": 408.55 + }, + { + "epoch": 0.15825688073394495, + "grad_norm": 0.02751251310110092, + "learning_rate": 9.686849585957994e-05, + "loss": 0.0736, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 69, + "tokens_per_second_per_gpu": 420.0 + }, + { + "epoch": 0.16055045871559634, + "grad_norm": 0.023428168147802353, + "learning_rate": 9.673530740506447e-05, + "loss": 0.0648, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 70, + "tokens_per_second_per_gpu": 512.59 + }, + { + "epoch": 0.1628440366972477, + "grad_norm": 0.031534772366285324, + "learning_rate": 9.659944072849707e-05, + "loss": 0.0818, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 71, + "tokens_per_second_per_gpu": 456.9 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 0.027208171784877777, + "learning_rate": 9.646090361587827e-05, + "loss": 0.0709, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 72, + "tokens_per_second_per_gpu": 378.48 + }, + { + "epoch": 0.16743119266055045, + "grad_norm": 0.02961639314889908, + "learning_rate": 9.631970400624113e-05, + "loss": 0.0764, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 73, + "tokens_per_second_per_gpu": 316.38 + }, + { + "epoch": 0.16972477064220184, + "grad_norm": 0.027367761358618736, + "learning_rate": 9.617584999119625e-05, + "loss": 0.0672, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 74, + "tokens_per_second_per_gpu": 402.44 + }, + { + "epoch": 0.1720183486238532, + "grad_norm": 0.030167503282427788, + "learning_rate": 9.602934981446803e-05, + "loss": 0.0743, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 75, + "tokens_per_second_per_gpu": 531.29 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 0.0387263149023056, + "learning_rate": 9.588021187142235e-05, + "loss": 0.083, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 76, + "tokens_per_second_per_gpu": 424.59 + }, + { + "epoch": 0.17660550458715596, + "grad_norm": 0.027617793530225754, + "learning_rate": 9.572844470858537e-05, + "loss": 0.0769, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 77, + "tokens_per_second_per_gpu": 461.9 + }, + { + "epoch": 0.17889908256880735, + "grad_norm": 0.029771512374281883, + "learning_rate": 9.557405702315381e-05, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 78, + "tokens_per_second_per_gpu": 475.77 + }, + { + "epoch": 0.1811926605504587, + "grad_norm": 0.029358675703406334, + "learning_rate": 9.541705766249655e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 79, + "tokens_per_second_per_gpu": 489.33 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 0.023111771792173386, + "learning_rate": 9.525745562364756e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 80, + "tokens_per_second_per_gpu": 382.84 + }, + { + "epoch": 0.18577981651376146, + "grad_norm": 0.029448291286826134, + "learning_rate": 9.509526005279044e-05, + "loss": 0.0608, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 81, + "tokens_per_second_per_gpu": 415.81 + }, + { + "epoch": 0.18807339449541285, + "grad_norm": 0.02794116735458374, + "learning_rate": 9.493048024473412e-05, + "loss": 0.0736, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 82, + "tokens_per_second_per_gpu": 400.02 + }, + { + "epoch": 0.19036697247706422, + "grad_norm": 0.04534873738884926, + "learning_rate": 9.476312564238034e-05, + "loss": 0.0673, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 83, + "tokens_per_second_per_gpu": 369.1 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 0.026540853083133698, + "learning_rate": 9.459320583618252e-05, + "loss": 0.0558, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 84, + "tokens_per_second_per_gpu": 611.61 + }, + { + "epoch": 0.19495412844036697, + "grad_norm": 0.03129403293132782, + "learning_rate": 9.442073056359604e-05, + "loss": 0.0741, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 85, + "tokens_per_second_per_gpu": 492.16 + }, + { + "epoch": 0.19724770642201836, + "grad_norm": 0.027526071295142174, + "learning_rate": 9.424570970852034e-05, + "loss": 0.0733, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 86, + "tokens_per_second_per_gpu": 427.76 + }, + { + "epoch": 0.19954128440366972, + "grad_norm": 0.025468798354268074, + "learning_rate": 9.406815330073244e-05, + "loss": 0.0613, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 87, + "tokens_per_second_per_gpu": 462.82 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 0.029043635353446007, + "learning_rate": 9.388807151531229e-05, + "loss": 0.0758, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 88, + "tokens_per_second_per_gpu": 353.91 + }, + { + "epoch": 0.20412844036697247, + "grad_norm": 0.03196391835808754, + "learning_rate": 9.37054746720595e-05, + "loss": 0.0678, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 89, + "tokens_per_second_per_gpu": 411.71 + }, + { + "epoch": 0.20642201834862386, + "grad_norm": 0.033272091299295425, + "learning_rate": 9.352037323490208e-05, + "loss": 0.0722, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 90, + "tokens_per_second_per_gpu": 398.81 + }, + { + "epoch": 0.20871559633027523, + "grad_norm": 0.03096090629696846, + "learning_rate": 9.333277781129678e-05, + "loss": 0.0809, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 91, + "tokens_per_second_per_gpu": 393.81 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 0.026267440989613533, + "learning_rate": 9.314269915162114e-05, + "loss": 0.0604, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 92, + "tokens_per_second_per_gpu": 453.78 + }, + { + "epoch": 0.21330275229357798, + "grad_norm": 0.02608361840248108, + "learning_rate": 9.295014814855753e-05, + "loss": 0.0663, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 93, + "tokens_per_second_per_gpu": 430.47 + }, + { + "epoch": 0.21559633027522937, + "grad_norm": 0.024829065427184105, + "learning_rate": 9.275513583646884e-05, + "loss": 0.0598, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 94, + "tokens_per_second_per_gpu": 384.01 + }, + { + "epoch": 0.21788990825688073, + "grad_norm": 0.03385532647371292, + "learning_rate": 9.255767339076622e-05, + "loss": 0.0719, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 95, + "tokens_per_second_per_gpu": 440.35 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 0.029608217999339104, + "learning_rate": 9.23577721272686e-05, + "loss": 0.094, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 96, + "tokens_per_second_per_gpu": 485.56 + }, + { + "epoch": 0.22247706422018348, + "grad_norm": 0.02693762816488743, + "learning_rate": 9.215544350155422e-05, + "loss": 0.0755, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 97, + "tokens_per_second_per_gpu": 432.16 + }, + { + "epoch": 0.22477064220183487, + "grad_norm": 0.02771424688398838, + "learning_rate": 9.195069910830427e-05, + "loss": 0.0692, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 98, + "tokens_per_second_per_gpu": 412.93 + }, + { + "epoch": 0.22706422018348624, + "grad_norm": 0.02276022732257843, + "learning_rate": 9.174355068063828e-05, + "loss": 0.0637, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 99, + "tokens_per_second_per_gpu": 418.24 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 0.026155246421694756, + "learning_rate": 9.15340100894418e-05, + "loss": 0.0698, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 100, + "tokens_per_second_per_gpu": 403.6 + }, + { + "epoch": 0.231651376146789, + "grad_norm": 0.022778436541557312, + "learning_rate": 9.132208934268622e-05, + "loss": 0.0654, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 101, + "tokens_per_second_per_gpu": 491.32 + }, + { + "epoch": 0.23394495412844038, + "grad_norm": 0.04701945558190346, + "learning_rate": 9.110780058474052e-05, + "loss": 0.0741, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 102, + "tokens_per_second_per_gpu": 444.03 + }, + { + "epoch": 0.23623853211009174, + "grad_norm": 0.030211661010980606, + "learning_rate": 9.08911560956753e-05, + "loss": 0.0789, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 103, + "tokens_per_second_per_gpu": 514.87 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 0.026159459725022316, + "learning_rate": 9.067216829055922e-05, + "loss": 0.0637, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 104, + "tokens_per_second_per_gpu": 446.47 + }, + { + "epoch": 0.2408256880733945, + "grad_norm": 0.02918146923184395, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0727, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 105, + "tokens_per_second_per_gpu": 425.37 + }, + { + "epoch": 0.24311926605504589, + "grad_norm": 0.03170175105333328, + "learning_rate": 9.022721306316222e-05, + "loss": 0.0857, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 106, + "tokens_per_second_per_gpu": 301.79 + }, + { + "epoch": 0.24541284403669725, + "grad_norm": 0.032674651592969894, + "learning_rate": 9.000127113956674e-05, + "loss": 0.0795, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 107, + "tokens_per_second_per_gpu": 338.41 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 0.026492780074477196, + "learning_rate": 8.977303689583e-05, + "loss": 0.0775, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 108, + "tokens_per_second_per_gpu": 383.35 + }, + { + "epoch": 0.25, + "grad_norm": 0.0290480125695467, + "learning_rate": 8.954252341118523e-05, + "loss": 0.076, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 109, + "tokens_per_second_per_gpu": 382.78 + }, + { + "epoch": 0.25229357798165136, + "grad_norm": 0.030473977327346802, + "learning_rate": 8.930974389548023e-05, + "loss": 0.0761, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 110, + "tokens_per_second_per_gpu": 476.56 + }, + { + "epoch": 0.2545871559633027, + "grad_norm": 0.02930077351629734, + "learning_rate": 8.90747116884204e-05, + "loss": 0.0691, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 111, + "tokens_per_second_per_gpu": 441.2 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 0.02884151227772236, + "learning_rate": 8.883744025880428e-05, + "loss": 0.0806, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 112, + "tokens_per_second_per_gpu": 406.96 + }, + { + "epoch": 0.2591743119266055, + "grad_norm": 0.02618175558745861, + "learning_rate": 8.859794320375168e-05, + "loss": 0.0677, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 113, + "tokens_per_second_per_gpu": 430.04 + }, + { + "epoch": 0.26146788990825687, + "grad_norm": 0.026963548734784126, + "learning_rate": 8.835623424792452e-05, + "loss": 0.0694, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 114, + "tokens_per_second_per_gpu": 351.9 + }, + { + "epoch": 0.26376146788990823, + "grad_norm": 0.021544624119997025, + "learning_rate": 8.811232724274035e-05, + "loss": 0.0613, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 115, + "tokens_per_second_per_gpu": 480.22 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 0.03840009495615959, + "learning_rate": 8.786623616557847e-05, + "loss": 0.0723, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 116, + "tokens_per_second_per_gpu": 433.18 + }, + { + "epoch": 0.268348623853211, + "grad_norm": 0.022571468725800514, + "learning_rate": 8.761797511897906e-05, + "loss": 0.065, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 117, + "tokens_per_second_per_gpu": 421.92 + }, + { + "epoch": 0.2706422018348624, + "grad_norm": 0.02688576467335224, + "learning_rate": 8.736755832983497e-05, + "loss": 0.0772, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 118, + "tokens_per_second_per_gpu": 354.3 + }, + { + "epoch": 0.27293577981651373, + "grad_norm": 0.025858785957098007, + "learning_rate": 8.711500014857634e-05, + "loss": 0.0745, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 119, + "tokens_per_second_per_gpu": 365.46 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 0.02718079835176468, + "learning_rate": 8.686031504834843e-05, + "loss": 0.0759, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 120, + "tokens_per_second_per_gpu": 426.06 + }, + { + "epoch": 0.2775229357798165, + "grad_norm": 0.028197383508086205, + "learning_rate": 8.660351762418203e-05, + "loss": 0.0753, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 121, + "tokens_per_second_per_gpu": 483.89 + }, + { + "epoch": 0.2798165137614679, + "grad_norm": 0.02615584433078766, + "learning_rate": 8.634462259215719e-05, + "loss": 0.0692, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 122, + "tokens_per_second_per_gpu": 347.59 + }, + { + "epoch": 0.28211009174311924, + "grad_norm": 0.028645118698477745, + "learning_rate": 8.608364478855983e-05, + "loss": 0.0784, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 123, + "tokens_per_second_per_gpu": 472.02 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 0.03761473670601845, + "learning_rate": 8.58205991690316e-05, + "loss": 0.0663, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 124, + "tokens_per_second_per_gpu": 439.34 + }, + { + "epoch": 0.286697247706422, + "grad_norm": 0.024080324918031693, + "learning_rate": 8.555550080771273e-05, + "loss": 0.0685, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 125, + "tokens_per_second_per_gpu": 413.4 + }, + { + "epoch": 0.2889908256880734, + "grad_norm": 0.03224342688918114, + "learning_rate": 8.528836489637828e-05, + "loss": 0.0813, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 126, + "tokens_per_second_per_gpu": 299.66 + }, + { + "epoch": 0.29128440366972475, + "grad_norm": 0.02632022649049759, + "learning_rate": 8.501920674356754e-05, + "loss": 0.0649, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 127, + "tokens_per_second_per_gpu": 424.46 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 0.025439690798521042, + "learning_rate": 8.47480417737067e-05, + "loss": 0.0692, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 128, + "tokens_per_second_per_gpu": 443.94 + }, + { + "epoch": 0.2958715596330275, + "grad_norm": 0.028366245329380035, + "learning_rate": 8.447488552622498e-05, + "loss": 0.0743, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 129, + "tokens_per_second_per_gpu": 392.47 + }, + { + "epoch": 0.2981651376146789, + "grad_norm": 0.028246046975255013, + "learning_rate": 8.419975365466415e-05, + "loss": 0.0693, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 130, + "tokens_per_second_per_gpu": 385.79 + }, + { + "epoch": 0.30045871559633025, + "grad_norm": 0.029451027512550354, + "learning_rate": 8.392266192578143e-05, + "loss": 0.0731, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 131, + "tokens_per_second_per_gpu": 401.98 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 0.03156789019703865, + "learning_rate": 8.364362621864595e-05, + "loss": 0.0733, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 132, + "tokens_per_second_per_gpu": 406.2 + }, + { + "epoch": 0.30504587155963303, + "grad_norm": 0.0247171763330698, + "learning_rate": 8.336266252372889e-05, + "loss": 0.0723, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 133, + "tokens_per_second_per_gpu": 467.27 + }, + { + "epoch": 0.3073394495412844, + "grad_norm": 0.024775700643658638, + "learning_rate": 8.307978694198699e-05, + "loss": 0.0644, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 134, + "tokens_per_second_per_gpu": 377.14 + }, + { + "epoch": 0.30963302752293576, + "grad_norm": 0.025003118440508842, + "learning_rate": 8.279501568393994e-05, + "loss": 0.0684, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 135, + "tokens_per_second_per_gpu": 368.87 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 0.028482772409915924, + "learning_rate": 8.250836506874142e-05, + "loss": 0.0705, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 136, + "tokens_per_second_per_gpu": 439.79 + }, + { + "epoch": 0.31422018348623854, + "grad_norm": 0.02605322189629078, + "learning_rate": 8.221985152324385e-05, + "loss": 0.0638, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 137, + "tokens_per_second_per_gpu": 438.9 + }, + { + "epoch": 0.3165137614678899, + "grad_norm": 0.030314577743411064, + "learning_rate": 8.192949158105713e-05, + "loss": 0.0682, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 138, + "tokens_per_second_per_gpu": 355.22 + }, + { + "epoch": 0.31880733944954126, + "grad_norm": 0.02862844057381153, + "learning_rate": 8.163730188160105e-05, + "loss": 0.0764, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 139, + "tokens_per_second_per_gpu": 430.33 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 0.030885115265846252, + "learning_rate": 8.134329916915184e-05, + "loss": 0.0774, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 140, + "tokens_per_second_per_gpu": 369.87 + }, + { + "epoch": 0.32339449541284404, + "grad_norm": 0.025037452578544617, + "learning_rate": 8.104750029188257e-05, + "loss": 0.0695, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 141, + "tokens_per_second_per_gpu": 538.21 + }, + { + "epoch": 0.3256880733944954, + "grad_norm": 0.02607853338122368, + "learning_rate": 8.074992220089769e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 142, + "tokens_per_second_per_gpu": 443.91 + }, + { + "epoch": 0.32798165137614677, + "grad_norm": 0.028251491487026215, + "learning_rate": 8.045058194926153e-05, + "loss": 0.0691, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 143, + "tokens_per_second_per_gpu": 403.07 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 0.02848455123603344, + "learning_rate": 8.014949669102117e-05, + "loss": 0.0712, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 144, + "tokens_per_second_per_gpu": 421.87 + }, + { + "epoch": 0.33256880733944955, + "grad_norm": 0.027499854564666748, + "learning_rate": 7.984668368022335e-05, + "loss": 0.071, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 145, + "tokens_per_second_per_gpu": 310.07 + }, + { + "epoch": 0.3348623853211009, + "grad_norm": 0.05668507516384125, + "learning_rate": 7.954216026992571e-05, + "loss": 0.072, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 146, + "tokens_per_second_per_gpu": 430.94 + }, + { + "epoch": 0.33715596330275227, + "grad_norm": 0.023797793313860893, + "learning_rate": 7.923594391120236e-05, + "loss": 0.0724, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 147, + "tokens_per_second_per_gpu": 506.38 + }, + { + "epoch": 0.3394495412844037, + "grad_norm": 0.03140917047858238, + "learning_rate": 7.892805215214381e-05, + "loss": 0.0707, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 148, + "tokens_per_second_per_gpu": 392.49 + }, + { + "epoch": 0.34174311926605505, + "grad_norm": 0.023651011288166046, + "learning_rate": 7.861850263685134e-05, + "loss": 0.0675, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 149, + "tokens_per_second_per_gpu": 468.39 + }, + { + "epoch": 0.3440366972477064, + "grad_norm": 0.028501421213150024, + "learning_rate": 7.830731310442599e-05, + "loss": 0.0677, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 150, + "tokens_per_second_per_gpu": 377.79 + }, + { + "epoch": 0.3463302752293578, + "grad_norm": 0.028334010392427444, + "learning_rate": 7.799450138795185e-05, + "loss": 0.0749, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 151, + "tokens_per_second_per_gpu": 370.82 + }, + { + "epoch": 0.3486238532110092, + "grad_norm": 0.029713135212659836, + "learning_rate": 7.768008541347423e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 152, + "tokens_per_second_per_gpu": 403.75 + }, + { + "epoch": 0.35091743119266056, + "grad_norm": 0.030461538583040237, + "learning_rate": 7.73640831989723e-05, + "loss": 0.0667, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 153, + "tokens_per_second_per_gpu": 473.97 + }, + { + "epoch": 0.3532110091743119, + "grad_norm": 0.02694588340818882, + "learning_rate": 7.704651285332663e-05, + "loss": 0.0642, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 154, + "tokens_per_second_per_gpu": 421.0 + }, + { + "epoch": 0.3555045871559633, + "grad_norm": 0.025780972093343735, + "learning_rate": 7.672739257528134e-05, + "loss": 0.0727, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 155, + "tokens_per_second_per_gpu": 507.84 + }, + { + "epoch": 0.3577981651376147, + "grad_norm": 0.027480922639369965, + "learning_rate": 7.640674065240136e-05, + "loss": 0.078, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 156, + "tokens_per_second_per_gpu": 334.0 + }, + { + "epoch": 0.36009174311926606, + "grad_norm": 0.032992683351039886, + "learning_rate": 7.608457546002424e-05, + "loss": 0.0728, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 157, + "tokens_per_second_per_gpu": 315.95 + }, + { + "epoch": 0.3623853211009174, + "grad_norm": 0.029259737581014633, + "learning_rate": 7.576091546020725e-05, + "loss": 0.0721, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 158, + "tokens_per_second_per_gpu": 390.2 + }, + { + "epoch": 0.3646788990825688, + "grad_norm": 0.027205413207411766, + "learning_rate": 7.543577920066944e-05, + "loss": 0.0726, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 159, + "tokens_per_second_per_gpu": 459.84 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 0.028103800490498543, + "learning_rate": 7.510918531372857e-05, + "loss": 0.0723, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 160, + "tokens_per_second_per_gpu": 362.42 + }, + { + "epoch": 0.36926605504587157, + "grad_norm": 0.025422796607017517, + "learning_rate": 7.478115251523352e-05, + "loss": 0.0651, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 161, + "tokens_per_second_per_gpu": 409.68 + }, + { + "epoch": 0.37155963302752293, + "grad_norm": 0.0247375275939703, + "learning_rate": 7.445169960349167e-05, + "loss": 0.0648, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 162, + "tokens_per_second_per_gpu": 443.52 + }, + { + "epoch": 0.3738532110091743, + "grad_norm": 0.024430420249700546, + "learning_rate": 7.412084545819168e-05, + "loss": 0.0654, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 163, + "tokens_per_second_per_gpu": 439.86 + }, + { + "epoch": 0.3761467889908257, + "grad_norm": 0.02779349498450756, + "learning_rate": 7.378860903932159e-05, + "loss": 0.07, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 164, + "tokens_per_second_per_gpu": 387.88 + }, + { + "epoch": 0.37844036697247707, + "grad_norm": 0.028585737571120262, + "learning_rate": 7.34550093860822e-05, + "loss": 0.0794, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 165, + "tokens_per_second_per_gpu": 469.25 + }, + { + "epoch": 0.38073394495412843, + "grad_norm": 0.028040310367941856, + "learning_rate": 7.31200656157961e-05, + "loss": 0.0702, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 166, + "tokens_per_second_per_gpu": 340.82 + }, + { + "epoch": 0.3830275229357798, + "grad_norm": 0.030313577502965927, + "learning_rate": 7.278379692281208e-05, + "loss": 0.0694, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 167, + "tokens_per_second_per_gpu": 414.21 + }, + { + "epoch": 0.3853211009174312, + "grad_norm": 0.032695479691028595, + "learning_rate": 7.244622257740523e-05, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 168, + "tokens_per_second_per_gpu": 435.84 + }, + { + "epoch": 0.3876146788990826, + "grad_norm": 0.02221628651022911, + "learning_rate": 7.210736192467256e-05, + "loss": 0.0596, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 169, + "tokens_per_second_per_gpu": 451.04 + }, + { + "epoch": 0.38990825688073394, + "grad_norm": 0.02417284995317459, + "learning_rate": 7.176723438342446e-05, + "loss": 0.0714, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 170, + "tokens_per_second_per_gpu": 444.02 + }, + { + "epoch": 0.3922018348623853, + "grad_norm": 0.027553344145417213, + "learning_rate": 7.142585944507185e-05, + "loss": 0.0613, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 171, + "tokens_per_second_per_gpu": 436.33 + }, + { + "epoch": 0.3944954128440367, + "grad_norm": 0.028384285047650337, + "learning_rate": 7.10832566725092e-05, + "loss": 0.0634, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 172, + "tokens_per_second_per_gpu": 389.66 + }, + { + "epoch": 0.3967889908256881, + "grad_norm": 0.024850716814398766, + "learning_rate": 7.073944569899354e-05, + "loss": 0.0717, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 173, + "tokens_per_second_per_gpu": 475.5 + }, + { + "epoch": 0.39908256880733944, + "grad_norm": 0.025330083444714546, + "learning_rate": 7.039444622701922e-05, + "loss": 0.0724, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 174, + "tokens_per_second_per_gpu": 383.81 + }, + { + "epoch": 0.4013761467889908, + "grad_norm": 0.025969544425606728, + "learning_rate": 7.00482780271889e-05, + "loss": 0.0712, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 175, + "tokens_per_second_per_gpu": 385.6 + }, + { + "epoch": 0.4036697247706422, + "grad_norm": 0.02731173112988472, + "learning_rate": 6.97009609370806e-05, + "loss": 0.0678, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 176, + "tokens_per_second_per_gpu": 430.61 + }, + { + "epoch": 0.4059633027522936, + "grad_norm": 0.028133299201726913, + "learning_rate": 6.935251486011087e-05, + "loss": 0.061, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 177, + "tokens_per_second_per_gpu": 379.64 + }, + { + "epoch": 0.40825688073394495, + "grad_norm": 0.02273411862552166, + "learning_rate": 6.900295976439413e-05, + "loss": 0.0604, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 178, + "tokens_per_second_per_gpu": 393.24 + }, + { + "epoch": 0.4105504587155963, + "grad_norm": 0.025121403858065605, + "learning_rate": 6.865231568159846e-05, + "loss": 0.0697, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 179, + "tokens_per_second_per_gpu": 453.6 + }, + { + "epoch": 0.41284403669724773, + "grad_norm": 0.029893774539232254, + "learning_rate": 6.830060270579768e-05, + "loss": 0.0743, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 180, + "tokens_per_second_per_gpu": 402.38 + }, + { + "epoch": 0.4151376146788991, + "grad_norm": 0.026196127757430077, + "learning_rate": 6.794784099231972e-05, + "loss": 0.0653, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 181, + "tokens_per_second_per_gpu": 369.19 + }, + { + "epoch": 0.41743119266055045, + "grad_norm": 0.03042738139629364, + "learning_rate": 6.759405075659166e-05, + "loss": 0.0654, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 182, + "tokens_per_second_per_gpu": 389.74 + }, + { + "epoch": 0.4197247706422018, + "grad_norm": 0.02454569563269615, + "learning_rate": 6.723925227298132e-05, + "loss": 0.0648, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 183, + "tokens_per_second_per_gpu": 383.9 + }, + { + "epoch": 0.42201834862385323, + "grad_norm": 0.03029336780309677, + "learning_rate": 6.688346587363533e-05, + "loss": 0.0711, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 184, + "tokens_per_second_per_gpu": 436.44 + }, + { + "epoch": 0.4243119266055046, + "grad_norm": 0.02716301940381527, + "learning_rate": 6.652671194731396e-05, + "loss": 0.0638, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 185, + "tokens_per_second_per_gpu": 405.73 + }, + { + "epoch": 0.42660550458715596, + "grad_norm": 0.030476156622171402, + "learning_rate": 6.616901093822283e-05, + "loss": 0.0742, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 186, + "tokens_per_second_per_gpu": 417.15 + }, + { + "epoch": 0.4288990825688073, + "grad_norm": 0.024246055632829666, + "learning_rate": 6.58103833448412e-05, + "loss": 0.0606, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 187, + "tokens_per_second_per_gpu": 418.65 + }, + { + "epoch": 0.43119266055045874, + "grad_norm": 0.025659549981355667, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0643, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 188, + "tokens_per_second_per_gpu": 524.48 + }, + { + "epoch": 0.4334862385321101, + "grad_norm": 0.02851368486881256, + "learning_rate": 6.509043066344092e-05, + "loss": 0.0728, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 189, + "tokens_per_second_per_gpu": 470.95 + }, + { + "epoch": 0.43577981651376146, + "grad_norm": 0.03035641275346279, + "learning_rate": 6.472914683316195e-05, + "loss": 0.0797, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 190, + "tokens_per_second_per_gpu": 409.73 + }, + { + "epoch": 0.4380733944954128, + "grad_norm": 0.026916082948446274, + "learning_rate": 6.436701893170756e-05, + "loss": 0.06, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 191, + "tokens_per_second_per_gpu": 424.58 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 0.035412922501564026, + "learning_rate": 6.400406771124536e-05, + "loss": 0.0699, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 192, + "tokens_per_second_per_gpu": 372.44 + }, + { + "epoch": 0.4426605504587156, + "grad_norm": 0.02869465760886669, + "learning_rate": 6.364031397112416e-05, + "loss": 0.0709, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 193, + "tokens_per_second_per_gpu": 411.07 + }, + { + "epoch": 0.44495412844036697, + "grad_norm": 0.02998914197087288, + "learning_rate": 6.327577855668216e-05, + "loss": 0.0693, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 194, + "tokens_per_second_per_gpu": 473.81 + }, + { + "epoch": 0.44724770642201833, + "grad_norm": 0.029111091047525406, + "learning_rate": 6.291048235805234e-05, + "loss": 0.0789, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 195, + "tokens_per_second_per_gpu": 393.48 + }, + { + "epoch": 0.44954128440366975, + "grad_norm": 0.028819169849157333, + "learning_rate": 6.254444630896529e-05, + "loss": 0.0738, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 196, + "tokens_per_second_per_gpu": 339.21 + }, + { + "epoch": 0.4518348623853211, + "grad_norm": 0.027091829106211662, + "learning_rate": 6.21776913855496e-05, + "loss": 0.0606, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 197, + "tokens_per_second_per_gpu": 490.05 + }, + { + "epoch": 0.4541284403669725, + "grad_norm": 0.023907724767923355, + "learning_rate": 6.181023860512984e-05, + "loss": 0.0664, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 198, + "tokens_per_second_per_gpu": 437.98 + }, + { + "epoch": 0.45642201834862384, + "grad_norm": 0.026607749983668327, + "learning_rate": 6.144210902502207e-05, + "loss": 0.0686, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 199, + "tokens_per_second_per_gpu": 518.9 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 0.028734847903251648, + "learning_rate": 6.107332374132715e-05, + "loss": 0.0709, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 200, + "tokens_per_second_per_gpu": 448.6 + }, + { + "epoch": 0.4610091743119266, + "grad_norm": 0.027956590056419373, + "learning_rate": 6.0703903887721837e-05, + "loss": 0.0645, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 201, + "tokens_per_second_per_gpu": 450.75 + }, + { + "epoch": 0.463302752293578, + "grad_norm": 0.02955472283065319, + "learning_rate": 6.0333870634247645e-05, + "loss": 0.0749, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 202, + "tokens_per_second_per_gpu": 366.38 + }, + { + "epoch": 0.46559633027522934, + "grad_norm": 0.033545345067977905, + "learning_rate": 5.9963245186097725e-05, + "loss": 0.0714, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 203, + "tokens_per_second_per_gpu": 409.9 + }, + { + "epoch": 0.46788990825688076, + "grad_norm": 0.027358222752809525, + "learning_rate": 5.95920487824016e-05, + "loss": 0.0632, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 204, + "tokens_per_second_per_gpu": 409.18 + }, + { + "epoch": 0.4701834862385321, + "grad_norm": 0.026303566992282867, + "learning_rate": 5.922030269500809e-05, + "loss": 0.0621, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 205, + "tokens_per_second_per_gpu": 344.97 + }, + { + "epoch": 0.4724770642201835, + "grad_norm": 0.023472387343645096, + "learning_rate": 5.8848028227266325e-05, + "loss": 0.0642, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 206, + "tokens_per_second_per_gpu": 458.3 + }, + { + "epoch": 0.47477064220183485, + "grad_norm": 0.02930634468793869, + "learning_rate": 5.847524671280484e-05, + "loss": 0.07, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 207, + "tokens_per_second_per_gpu": 386.88 + }, + { + "epoch": 0.47706422018348627, + "grad_norm": 0.02035793662071228, + "learning_rate": 5.810197951430911e-05, + "loss": 0.0558, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 208, + "tokens_per_second_per_gpu": 479.37 + }, + { + "epoch": 0.4793577981651376, + "grad_norm": 0.027948010712862015, + "learning_rate": 5.772824802229733e-05, + "loss": 0.07, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 209, + "tokens_per_second_per_gpu": 352.97 + }, + { + "epoch": 0.481651376146789, + "grad_norm": 0.027743425220251083, + "learning_rate": 5.735407365389453e-05, + "loss": 0.0686, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 210, + "tokens_per_second_per_gpu": 419.65 + }, + { + "epoch": 0.48394495412844035, + "grad_norm": 0.03574339672923088, + "learning_rate": 5.697947785160532e-05, + "loss": 0.0593, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 211, + "tokens_per_second_per_gpu": 391.99 + }, + { + "epoch": 0.48623853211009177, + "grad_norm": 0.03303733468055725, + "learning_rate": 5.660448208208513e-05, + "loss": 0.0615, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 212, + "tokens_per_second_per_gpu": 420.47 + }, + { + "epoch": 0.48853211009174313, + "grad_norm": 0.030316850170493126, + "learning_rate": 5.622910783490988e-05, + "loss": 0.0745, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 213, + "tokens_per_second_per_gpu": 379.16 + }, + { + "epoch": 0.4908256880733945, + "grad_norm": 0.031506236642599106, + "learning_rate": 5.585337662134471e-05, + "loss": 0.0724, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 214, + "tokens_per_second_per_gpu": 376.6 + }, + { + "epoch": 0.49311926605504586, + "grad_norm": 0.025807412341237068, + "learning_rate": 5.5477309973111046e-05, + "loss": 0.0628, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 215, + "tokens_per_second_per_gpu": 386.77 + }, + { + "epoch": 0.4954128440366973, + "grad_norm": 0.02294624038040638, + "learning_rate": 5.510092944115286e-05, + "loss": 0.0629, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 216, + "tokens_per_second_per_gpu": 473.64 + }, + { + "epoch": 0.49770642201834864, + "grad_norm": 0.027048619464039803, + "learning_rate": 5.472425659440157e-05, + "loss": 0.0675, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 217, + "tokens_per_second_per_gpu": 374.21 + }, + { + "epoch": 0.5, + "grad_norm": 0.026564767584204674, + "learning_rate": 5.4347313018540056e-05, + "loss": 0.0697, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 218, + "tokens_per_second_per_gpu": 442.12 + }, + { + "epoch": 0.5022935779816514, + "grad_norm": 0.03516434505581856, + "learning_rate": 5.397012031476562e-05, + "loss": 0.082, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 219, + "tokens_per_second_per_gpu": 380.84 + }, + { + "epoch": 0.5045871559633027, + "grad_norm": 0.021558105945587158, + "learning_rate": 5.359270009855216e-05, + "loss": 0.0585, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 220, + "tokens_per_second_per_gpu": 509.31 + }, + { + "epoch": 0.5068807339449541, + "grad_norm": 0.024724913761019707, + "learning_rate": 5.321507399841148e-05, + "loss": 0.0632, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 221, + "tokens_per_second_per_gpu": 438.7 + }, + { + "epoch": 0.5091743119266054, + "grad_norm": 0.02698579616844654, + "learning_rate": 5.2837263654653715e-05, + "loss": 0.0715, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 222, + "tokens_per_second_per_gpu": 337.92 + }, + { + "epoch": 0.5114678899082569, + "grad_norm": 0.03043169341981411, + "learning_rate": 5.2459290718147344e-05, + "loss": 0.0755, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 223, + "tokens_per_second_per_gpu": 485.96 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 0.026405537500977516, + "learning_rate": 5.2081176849078464e-05, + "loss": 0.0641, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 224, + "tokens_per_second_per_gpu": 434.97 + }, + { + "epoch": 0.5160550458715596, + "grad_norm": 0.024269182235002518, + "learning_rate": 5.170294371570939e-05, + "loss": 0.0666, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 225, + "tokens_per_second_per_gpu": 399.27 + }, + { + "epoch": 0.518348623853211, + "grad_norm": 0.03496242314577103, + "learning_rate": 5.132461299313709e-05, + "loss": 0.073, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 226, + "tokens_per_second_per_gpu": 422.84 + }, + { + "epoch": 0.5206422018348624, + "grad_norm": 0.029179584234952927, + "learning_rate": 5.094620636205095e-05, + "loss": 0.0697, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 227, + "tokens_per_second_per_gpu": 357.38 + }, + { + "epoch": 0.5229357798165137, + "grad_norm": 0.027006233111023903, + "learning_rate": 5.056774550749043e-05, + "loss": 0.0614, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 228, + "tokens_per_second_per_gpu": 316.93 + }, + { + "epoch": 0.5252293577981652, + "grad_norm": 0.028260482475161552, + "learning_rate": 5.018925211760227e-05, + "loss": 0.0634, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 229, + "tokens_per_second_per_gpu": 417.85 + }, + { + "epoch": 0.5275229357798165, + "grad_norm": 0.025130394846200943, + "learning_rate": 4.981074788239773e-05, + "loss": 0.0588, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 230, + "tokens_per_second_per_gpu": 413.46 + }, + { + "epoch": 0.5298165137614679, + "grad_norm": 0.025551561266183853, + "learning_rate": 4.943225449250958e-05, + "loss": 0.0688, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 231, + "tokens_per_second_per_gpu": 445.27 + }, + { + "epoch": 0.5321100917431193, + "grad_norm": 0.028664810582995415, + "learning_rate": 4.9053793637949067e-05, + "loss": 0.0689, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 232, + "tokens_per_second_per_gpu": 395.88 + }, + { + "epoch": 0.5344036697247706, + "grad_norm": 0.02686873823404312, + "learning_rate": 4.8675387006862914e-05, + "loss": 0.0656, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 233, + "tokens_per_second_per_gpu": 544.1 + }, + { + "epoch": 0.536697247706422, + "grad_norm": 0.03144492581486702, + "learning_rate": 4.829705628429061e-05, + "loss": 0.0795, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 234, + "tokens_per_second_per_gpu": 356.41 + }, + { + "epoch": 0.5389908256880734, + "grad_norm": 0.02188139036297798, + "learning_rate": 4.7918823150921555e-05, + "loss": 0.0611, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 235, + "tokens_per_second_per_gpu": 368.54 + }, + { + "epoch": 0.5412844036697247, + "grad_norm": 0.02784140035510063, + "learning_rate": 4.754070928185266e-05, + "loss": 0.0604, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 236, + "tokens_per_second_per_gpu": 445.84 + }, + { + "epoch": 0.5435779816513762, + "grad_norm": 0.02372545376420021, + "learning_rate": 4.7162736345346303e-05, + "loss": 0.0604, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 237, + "tokens_per_second_per_gpu": 467.85 + }, + { + "epoch": 0.5458715596330275, + "grad_norm": 0.03274843469262123, + "learning_rate": 4.6784926001588544e-05, + "loss": 0.0817, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 238, + "tokens_per_second_per_gpu": 438.54 + }, + { + "epoch": 0.5481651376146789, + "grad_norm": 0.02551015093922615, + "learning_rate": 4.640729990144784e-05, + "loss": 0.0631, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 239, + "tokens_per_second_per_gpu": 486.44 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 0.04315930977463722, + "learning_rate": 4.6029879685234395e-05, + "loss": 0.0661, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 240, + "tokens_per_second_per_gpu": 450.6 + }, + { + "epoch": 0.5527522935779816, + "grad_norm": 0.024066558107733727, + "learning_rate": 4.565268698145997e-05, + "loss": 0.0612, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 241, + "tokens_per_second_per_gpu": 462.93 + }, + { + "epoch": 0.555045871559633, + "grad_norm": 0.026846949011087418, + "learning_rate": 4.527574340559844e-05, + "loss": 0.0754, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 242, + "tokens_per_second_per_gpu": 392.01 + }, + { + "epoch": 0.5573394495412844, + "grad_norm": 0.02346811629831791, + "learning_rate": 4.4899070558847154e-05, + "loss": 0.0675, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 243, + "tokens_per_second_per_gpu": 468.19 + }, + { + "epoch": 0.5596330275229358, + "grad_norm": 0.02288683131337166, + "learning_rate": 4.452269002688897e-05, + "loss": 0.064, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 244, + "tokens_per_second_per_gpu": 306.21 + }, + { + "epoch": 0.5619266055045872, + "grad_norm": 0.0288680586963892, + "learning_rate": 4.4146623378655296e-05, + "loss": 0.0677, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.73, + "memory/max_allocated (GiB)": 48.73, + "step": 245, + "tokens_per_second_per_gpu": 325.4 + }, + { + "epoch": 0.5642201834862385, + "grad_norm": 0.02450747601687908, + "learning_rate": 4.3770892165090126e-05, + "loss": 0.0638, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 246, + "tokens_per_second_per_gpu": 401.5 + }, + { + "epoch": 0.5665137614678899, + "grad_norm": 0.028074199333786964, + "learning_rate": 4.3395517917914895e-05, + "loss": 0.0615, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 247, + "tokens_per_second_per_gpu": 537.03 + }, + { + "epoch": 0.5688073394495413, + "grad_norm": 0.02514073997735977, + "learning_rate": 4.3020522148394676e-05, + "loss": 0.0669, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 248, + "tokens_per_second_per_gpu": 409.93 + }, + { + "epoch": 0.5711009174311926, + "grad_norm": 0.029449012130498886, + "learning_rate": 4.2645926346105484e-05, + "loss": 0.0711, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 249, + "tokens_per_second_per_gpu": 344.6 + }, + { + "epoch": 0.573394495412844, + "grad_norm": 0.024152036756277084, + "learning_rate": 4.22717519777027e-05, + "loss": 0.0652, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 250, + "tokens_per_second_per_gpu": 417.48 + }, + { + "epoch": 0.5756880733944955, + "grad_norm": 0.02781221643090248, + "learning_rate": 4.189802048569089e-05, + "loss": 0.0598, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 251, + "tokens_per_second_per_gpu": 477.01 + }, + { + "epoch": 0.5779816513761468, + "grad_norm": 0.02137266844511032, + "learning_rate": 4.1524753287195165e-05, + "loss": 0.0584, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 252, + "tokens_per_second_per_gpu": 475.28 + }, + { + "epoch": 0.5802752293577982, + "grad_norm": 0.03145367652177811, + "learning_rate": 4.1151971772733686e-05, + "loss": 0.0742, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 253, + "tokens_per_second_per_gpu": 416.81 + }, + { + "epoch": 0.5825688073394495, + "grad_norm": 0.026259735226631165, + "learning_rate": 4.07796973049919e-05, + "loss": 0.0704, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 254, + "tokens_per_second_per_gpu": 432.14 + }, + { + "epoch": 0.5848623853211009, + "grad_norm": 0.029704980552196503, + "learning_rate": 4.04079512175984e-05, + "loss": 0.0751, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 255, + "tokens_per_second_per_gpu": 368.81 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 0.037060242146253586, + "learning_rate": 4.003675481390228e-05, + "loss": 0.081, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 256, + "tokens_per_second_per_gpu": 400.19 + }, + { + "epoch": 0.5894495412844036, + "grad_norm": 0.027513017877936363, + "learning_rate": 3.966612936575235e-05, + "loss": 0.0597, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 257, + "tokens_per_second_per_gpu": 381.21 + }, + { + "epoch": 0.591743119266055, + "grad_norm": 0.037167515605688095, + "learning_rate": 3.929609611227817e-05, + "loss": 0.0639, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 258, + "tokens_per_second_per_gpu": 357.36 + }, + { + "epoch": 0.5940366972477065, + "grad_norm": 0.0229306872934103, + "learning_rate": 3.8926676258672866e-05, + "loss": 0.0626, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 259, + "tokens_per_second_per_gpu": 387.68 + }, + { + "epoch": 0.5963302752293578, + "grad_norm": 0.027137834578752518, + "learning_rate": 3.855789097497794e-05, + "loss": 0.0711, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 260, + "tokens_per_second_per_gpu": 377.29 + }, + { + "epoch": 0.5986238532110092, + "grad_norm": 0.027339540421962738, + "learning_rate": 3.818976139487017e-05, + "loss": 0.0644, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 261, + "tokens_per_second_per_gpu": 476.61 + }, + { + "epoch": 0.6009174311926605, + "grad_norm": 0.02739766612648964, + "learning_rate": 3.7822308614450406e-05, + "loss": 0.0711, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 262, + "tokens_per_second_per_gpu": 426.96 + }, + { + "epoch": 0.6032110091743119, + "grad_norm": 0.02805398218333721, + "learning_rate": 3.745555369103471e-05, + "loss": 0.0669, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 263, + "tokens_per_second_per_gpu": 363.5 + }, + { + "epoch": 0.6055045871559633, + "grad_norm": 0.03466130048036575, + "learning_rate": 3.708951764194767e-05, + "loss": 0.0771, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 264, + "tokens_per_second_per_gpu": 383.75 + }, + { + "epoch": 0.6077981651376146, + "grad_norm": 0.02684733085334301, + "learning_rate": 3.6724221443317855e-05, + "loss": 0.0613, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 265, + "tokens_per_second_per_gpu": 545.2 + }, + { + "epoch": 0.6100917431192661, + "grad_norm": 0.025042880326509476, + "learning_rate": 3.635968602887585e-05, + "loss": 0.0706, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 266, + "tokens_per_second_per_gpu": 420.58 + }, + { + "epoch": 0.6123853211009175, + "grad_norm": 0.02610246278345585, + "learning_rate": 3.599593228875465e-05, + "loss": 0.0749, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 267, + "tokens_per_second_per_gpu": 443.84 + }, + { + "epoch": 0.6146788990825688, + "grad_norm": 0.02343624271452427, + "learning_rate": 3.563298106829244e-05, + "loss": 0.0676, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 268, + "tokens_per_second_per_gpu": 409.89 + }, + { + "epoch": 0.6169724770642202, + "grad_norm": 0.02438695915043354, + "learning_rate": 3.527085316683805e-05, + "loss": 0.0648, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 269, + "tokens_per_second_per_gpu": 475.38 + }, + { + "epoch": 0.6192660550458715, + "grad_norm": 0.02070113644003868, + "learning_rate": 3.490956933655909e-05, + "loss": 0.0605, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 270, + "tokens_per_second_per_gpu": 341.3 + }, + { + "epoch": 0.6215596330275229, + "grad_norm": 0.03797437623143196, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 271, + "tokens_per_second_per_gpu": 391.97 + }, + { + "epoch": 0.6238532110091743, + "grad_norm": 0.02536945417523384, + "learning_rate": 3.41896166551588e-05, + "loss": 0.0649, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 272, + "tokens_per_second_per_gpu": 461.4 + }, + { + "epoch": 0.6261467889908257, + "grad_norm": 0.032918062061071396, + "learning_rate": 3.383098906177719e-05, + "loss": 0.0769, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 273, + "tokens_per_second_per_gpu": 495.12 + }, + { + "epoch": 0.6284403669724771, + "grad_norm": 0.03230955824255943, + "learning_rate": 3.347328805268605e-05, + "loss": 0.0687, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 274, + "tokens_per_second_per_gpu": 355.59 + }, + { + "epoch": 0.6307339449541285, + "grad_norm": 0.045344047248363495, + "learning_rate": 3.3116534126364685e-05, + "loss": 0.0748, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 275, + "tokens_per_second_per_gpu": 339.05 + }, + { + "epoch": 0.6330275229357798, + "grad_norm": 0.021811284124851227, + "learning_rate": 3.2760747727018694e-05, + "loss": 0.0646, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 276, + "tokens_per_second_per_gpu": 334.53 + }, + { + "epoch": 0.6353211009174312, + "grad_norm": 0.02648971416056156, + "learning_rate": 3.240594924340835e-05, + "loss": 0.068, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 277, + "tokens_per_second_per_gpu": 375.76 + }, + { + "epoch": 0.6376146788990825, + "grad_norm": 0.022893795743584633, + "learning_rate": 3.205215900768029e-05, + "loss": 0.0627, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 278, + "tokens_per_second_per_gpu": 412.8 + }, + { + "epoch": 0.6399082568807339, + "grad_norm": 0.027191977947950363, + "learning_rate": 3.169939729420233e-05, + "loss": 0.0632, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 279, + "tokens_per_second_per_gpu": 408.91 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 0.023182721808552742, + "learning_rate": 3.1347684318401536e-05, + "loss": 0.0631, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 280, + "tokens_per_second_per_gpu": 434.99 + }, + { + "epoch": 0.6444954128440367, + "grad_norm": 0.03368153050541878, + "learning_rate": 3.099704023560587e-05, + "loss": 0.0762, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 281, + "tokens_per_second_per_gpu": 393.93 + }, + { + "epoch": 0.6467889908256881, + "grad_norm": 0.023287048563361168, + "learning_rate": 3.0647485139889145e-05, + "loss": 0.0629, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 282, + "tokens_per_second_per_gpu": 335.84 + }, + { + "epoch": 0.6490825688073395, + "grad_norm": 0.027626749128103256, + "learning_rate": 3.0299039062919416e-05, + "loss": 0.0631, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 283, + "tokens_per_second_per_gpu": 446.72 + }, + { + "epoch": 0.6513761467889908, + "grad_norm": 0.02671007066965103, + "learning_rate": 2.995172197281113e-05, + "loss": 0.0684, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 284, + "tokens_per_second_per_gpu": 419.33 + }, + { + "epoch": 0.6536697247706422, + "grad_norm": 0.026775743812322617, + "learning_rate": 2.96055537729808e-05, + "loss": 0.063, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 285, + "tokens_per_second_per_gpu": 456.33 + }, + { + "epoch": 0.6559633027522935, + "grad_norm": 0.024690093472599983, + "learning_rate": 2.926055430100647e-05, + "loss": 0.0601, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 286, + "tokens_per_second_per_gpu": 363.46 + }, + { + "epoch": 0.658256880733945, + "grad_norm": 0.021927161142230034, + "learning_rate": 2.8916743327490803e-05, + "loss": 0.0598, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 287, + "tokens_per_second_per_gpu": 395.18 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 0.029110578820109367, + "learning_rate": 2.8574140554928175e-05, + "loss": 0.0732, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 288, + "tokens_per_second_per_gpu": 395.82 + }, + { + "epoch": 0.6628440366972477, + "grad_norm": 0.025474051013588905, + "learning_rate": 2.8232765616575563e-05, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 289, + "tokens_per_second_per_gpu": 435.52 + }, + { + "epoch": 0.6651376146788991, + "grad_norm": 0.02178235538303852, + "learning_rate": 2.789263807532746e-05, + "loss": 0.0616, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 290, + "tokens_per_second_per_gpu": 442.39 + }, + { + "epoch": 0.6674311926605505, + "grad_norm": 0.023412682116031647, + "learning_rate": 2.7553777422594774e-05, + "loss": 0.0673, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 291, + "tokens_per_second_per_gpu": 412.59 + }, + { + "epoch": 0.6697247706422018, + "grad_norm": 0.023469222709536552, + "learning_rate": 2.721620307718793e-05, + "loss": 0.0682, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 292, + "tokens_per_second_per_gpu": 276.54 + }, + { + "epoch": 0.6720183486238532, + "grad_norm": 0.03131282329559326, + "learning_rate": 2.687993438420392e-05, + "loss": 0.0647, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 293, + "tokens_per_second_per_gpu": 392.4 + }, + { + "epoch": 0.6743119266055045, + "grad_norm": 0.02991569973528385, + "learning_rate": 2.65449906139178e-05, + "loss": 0.0681, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 294, + "tokens_per_second_per_gpu": 377.34 + }, + { + "epoch": 0.676605504587156, + "grad_norm": 0.02651585452258587, + "learning_rate": 2.6211390960678413e-05, + "loss": 0.0802, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 295, + "tokens_per_second_per_gpu": 358.61 + }, + { + "epoch": 0.6788990825688074, + "grad_norm": 0.022964881733059883, + "learning_rate": 2.5879154541808337e-05, + "loss": 0.0643, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 296, + "tokens_per_second_per_gpu": 484.52 + }, + { + "epoch": 0.6811926605504587, + "grad_norm": 0.028967639431357384, + "learning_rate": 2.554830039650834e-05, + "loss": 0.0632, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 297, + "tokens_per_second_per_gpu": 440.4 + }, + { + "epoch": 0.6834862385321101, + "grad_norm": 0.02948296256363392, + "learning_rate": 2.5218847484766495e-05, + "loss": 0.0752, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 298, + "tokens_per_second_per_gpu": 288.6 + }, + { + "epoch": 0.6857798165137615, + "grad_norm": 0.03220253810286522, + "learning_rate": 2.4890814686271448e-05, + "loss": 0.0634, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 299, + "tokens_per_second_per_gpu": 447.81 + }, + { + "epoch": 0.6880733944954128, + "grad_norm": 0.028979238122701645, + "learning_rate": 2.456422079933056e-05, + "loss": 0.0689, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 300, + "tokens_per_second_per_gpu": 458.5 + }, + { + "epoch": 0.6903669724770642, + "grad_norm": 0.024549167603254318, + "learning_rate": 2.4239084539792745e-05, + "loss": 0.0593, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 301, + "tokens_per_second_per_gpu": 419.65 + }, + { + "epoch": 0.6926605504587156, + "grad_norm": 0.02671237848699093, + "learning_rate": 2.391542453997578e-05, + "loss": 0.0657, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 302, + "tokens_per_second_per_gpu": 368.27 + }, + { + "epoch": 0.694954128440367, + "grad_norm": 0.03672722727060318, + "learning_rate": 2.3593259347598657e-05, + "loss": 0.0535, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 303, + "tokens_per_second_per_gpu": 474.85 + }, + { + "epoch": 0.6972477064220184, + "grad_norm": 0.03666655346751213, + "learning_rate": 2.3272607424718675e-05, + "loss": 0.0646, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 304, + "tokens_per_second_per_gpu": 393.88 + }, + { + "epoch": 0.6995412844036697, + "grad_norm": 0.025117024779319763, + "learning_rate": 2.29534871466734e-05, + "loss": 0.0699, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 305, + "tokens_per_second_per_gpu": 449.5 + }, + { + "epoch": 0.7018348623853211, + "grad_norm": 0.035403817892074585, + "learning_rate": 2.2635916801027706e-05, + "loss": 0.0769, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 306, + "tokens_per_second_per_gpu": 420.33 + }, + { + "epoch": 0.7041284403669725, + "grad_norm": 0.026707297191023827, + "learning_rate": 2.2319914586525777e-05, + "loss": 0.0633, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 307, + "tokens_per_second_per_gpu": 451.77 + }, + { + "epoch": 0.7064220183486238, + "grad_norm": 0.02504413016140461, + "learning_rate": 2.2005498612048155e-05, + "loss": 0.0597, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 308, + "tokens_per_second_per_gpu": 357.06 + }, + { + "epoch": 0.7087155963302753, + "grad_norm": 0.02307130955159664, + "learning_rate": 2.1692686895574005e-05, + "loss": 0.064, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 309, + "tokens_per_second_per_gpu": 474.84 + }, + { + "epoch": 0.7110091743119266, + "grad_norm": 0.026173440739512444, + "learning_rate": 2.1381497363148673e-05, + "loss": 0.063, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 310, + "tokens_per_second_per_gpu": 403.88 + }, + { + "epoch": 0.713302752293578, + "grad_norm": 0.027350088581442833, + "learning_rate": 2.1071947847856222e-05, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 311, + "tokens_per_second_per_gpu": 409.62 + }, + { + "epoch": 0.7155963302752294, + "grad_norm": 0.02530243620276451, + "learning_rate": 2.0764056088797645e-05, + "loss": 0.063, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 312, + "tokens_per_second_per_gpu": 385.83 + }, + { + "epoch": 0.7178899082568807, + "grad_norm": 0.028018414974212646, + "learning_rate": 2.045783973007429e-05, + "loss": 0.0634, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 313, + "tokens_per_second_per_gpu": 395.65 + }, + { + "epoch": 0.7201834862385321, + "grad_norm": 0.02613895572721958, + "learning_rate": 2.0153316319776662e-05, + "loss": 0.0653, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 314, + "tokens_per_second_per_gpu": 357.1 + }, + { + "epoch": 0.7224770642201835, + "grad_norm": 0.026048416271805763, + "learning_rate": 1.985050330897883e-05, + "loss": 0.0644, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 315, + "tokens_per_second_per_gpu": 395.0 + }, + { + "epoch": 0.7247706422018348, + "grad_norm": 0.030031291767954826, + "learning_rate": 1.954941805073848e-05, + "loss": 0.078, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 316, + "tokens_per_second_per_gpu": 372.79 + }, + { + "epoch": 0.7270642201834863, + "grad_norm": 0.029979195445775986, + "learning_rate": 1.9250077799102322e-05, + "loss": 0.0651, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 317, + "tokens_per_second_per_gpu": 438.54 + }, + { + "epoch": 0.7293577981651376, + "grad_norm": 0.025628041476011276, + "learning_rate": 1.8952499708117432e-05, + "loss": 0.0669, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 318, + "tokens_per_second_per_gpu": 474.63 + }, + { + "epoch": 0.731651376146789, + "grad_norm": 0.024868648499250412, + "learning_rate": 1.8656700830848174e-05, + "loss": 0.0656, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 319, + "tokens_per_second_per_gpu": 445.15 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 0.024810567498207092, + "learning_rate": 1.8362698118398967e-05, + "loss": 0.064, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 320, + "tokens_per_second_per_gpu": 383.63 + }, + { + "epoch": 0.7362385321100917, + "grad_norm": 0.02743346616625786, + "learning_rate": 1.8070508418942876e-05, + "loss": 0.0758, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 321, + "tokens_per_second_per_gpu": 386.17 + }, + { + "epoch": 0.7385321100917431, + "grad_norm": 0.028884073719382286, + "learning_rate": 1.7780148476756147e-05, + "loss": 0.0675, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 322, + "tokens_per_second_per_gpu": 498.58 + }, + { + "epoch": 0.7408256880733946, + "grad_norm": 0.028301537036895752, + "learning_rate": 1.7491634931258587e-05, + "loss": 0.0734, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 323, + "tokens_per_second_per_gpu": 392.47 + }, + { + "epoch": 0.7431192660550459, + "grad_norm": 0.02405114285647869, + "learning_rate": 1.7204984316060063e-05, + "loss": 0.0538, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 324, + "tokens_per_second_per_gpu": 409.24 + }, + { + "epoch": 0.7454128440366973, + "grad_norm": 0.029399245977401733, + "learning_rate": 1.6920213058013022e-05, + "loss": 0.0693, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 325, + "tokens_per_second_per_gpu": 461.77 + }, + { + "epoch": 0.7477064220183486, + "grad_norm": 0.02802177332341671, + "learning_rate": 1.6637337476271124e-05, + "loss": 0.0647, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 326, + "tokens_per_second_per_gpu": 389.28 + }, + { + "epoch": 0.75, + "grad_norm": 0.024391207844018936, + "learning_rate": 1.6356373781354058e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 327, + "tokens_per_second_per_gpu": 376.51 + }, + { + "epoch": 0.7522935779816514, + "grad_norm": 0.02589585818350315, + "learning_rate": 1.6077338074218596e-05, + "loss": 0.0676, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 328, + "tokens_per_second_per_gpu": 422.1 + }, + { + "epoch": 0.7545871559633027, + "grad_norm": 0.022877002134919167, + "learning_rate": 1.580024634533587e-05, + "loss": 0.0653, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 329, + "tokens_per_second_per_gpu": 440.68 + }, + { + "epoch": 0.7568807339449541, + "grad_norm": 0.029319310560822487, + "learning_rate": 1.5525114473775014e-05, + "loss": 0.0871, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 330, + "tokens_per_second_per_gpu": 435.63 + }, + { + "epoch": 0.7591743119266054, + "grad_norm": 0.03219328075647354, + "learning_rate": 1.5251958226293306e-05, + "loss": 0.0801, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 331, + "tokens_per_second_per_gpu": 363.82 + }, + { + "epoch": 0.7614678899082569, + "grad_norm": 0.024657782167196274, + "learning_rate": 1.4980793256432474e-05, + "loss": 0.0622, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 332, + "tokens_per_second_per_gpu": 342.94 + }, + { + "epoch": 0.7637614678899083, + "grad_norm": 0.03142733871936798, + "learning_rate": 1.4711635103621719e-05, + "loss": 0.0681, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 333, + "tokens_per_second_per_gpu": 404.61 + }, + { + "epoch": 0.7660550458715596, + "grad_norm": 0.026000676676630974, + "learning_rate": 1.4444499192287275e-05, + "loss": 0.065, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 334, + "tokens_per_second_per_gpu": 367.91 + }, + { + "epoch": 0.768348623853211, + "grad_norm": 0.03227536380290985, + "learning_rate": 1.4179400830968415e-05, + "loss": 0.0767, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 335, + "tokens_per_second_per_gpu": 314.19 + }, + { + "epoch": 0.7706422018348624, + "grad_norm": 0.025221284478902817, + "learning_rate": 1.3916355211440164e-05, + "loss": 0.0645, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 336, + "tokens_per_second_per_gpu": 362.08 + }, + { + "epoch": 0.7729357798165137, + "grad_norm": 0.030213654041290283, + "learning_rate": 1.3655377407842812e-05, + "loss": 0.066, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 337, + "tokens_per_second_per_gpu": 466.08 + }, + { + "epoch": 0.7752293577981652, + "grad_norm": 0.026164716109633446, + "learning_rate": 1.3396482375817975e-05, + "loss": 0.0656, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 338, + "tokens_per_second_per_gpu": 458.34 + }, + { + "epoch": 0.7775229357798165, + "grad_norm": 0.0265730619430542, + "learning_rate": 1.3139684951651588e-05, + "loss": 0.0636, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 339, + "tokens_per_second_per_gpu": 399.93 + }, + { + "epoch": 0.7798165137614679, + "grad_norm": 0.026285763829946518, + "learning_rate": 1.2884999851423673e-05, + "loss": 0.0682, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 340, + "tokens_per_second_per_gpu": 421.4 + }, + { + "epoch": 0.7821100917431193, + "grad_norm": 0.023802319541573524, + "learning_rate": 1.2632441670165056e-05, + "loss": 0.0641, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 341, + "tokens_per_second_per_gpu": 439.55 + }, + { + "epoch": 0.7844036697247706, + "grad_norm": 0.024973031133413315, + "learning_rate": 1.2382024881020937e-05, + "loss": 0.0615, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 342, + "tokens_per_second_per_gpu": 492.26 + }, + { + "epoch": 0.786697247706422, + "grad_norm": 0.029818380251526833, + "learning_rate": 1.213376383442153e-05, + "loss": 0.0746, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 343, + "tokens_per_second_per_gpu": 394.15 + }, + { + "epoch": 0.7889908256880734, + "grad_norm": 0.028851691633462906, + "learning_rate": 1.188767275725966e-05, + "loss": 0.0744, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 344, + "tokens_per_second_per_gpu": 422.33 + }, + { + "epoch": 0.7912844036697247, + "grad_norm": 0.03523954004049301, + "learning_rate": 1.164376575207547e-05, + "loss": 0.077, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 345, + "tokens_per_second_per_gpu": 286.3 + }, + { + "epoch": 0.7935779816513762, + "grad_norm": 0.023627813905477524, + "learning_rate": 1.140205679624834e-05, + "loss": 0.0641, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 346, + "tokens_per_second_per_gpu": 351.44 + }, + { + "epoch": 0.7958715596330275, + "grad_norm": 0.026164906099438667, + "learning_rate": 1.1162559741195733e-05, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 347, + "tokens_per_second_per_gpu": 389.38 + }, + { + "epoch": 0.7981651376146789, + "grad_norm": 0.023336883634328842, + "learning_rate": 1.092528831157959e-05, + "loss": 0.062, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 348, + "tokens_per_second_per_gpu": 472.15 + }, + { + "epoch": 0.8004587155963303, + "grad_norm": 0.02306864783167839, + "learning_rate": 1.0690256104519764e-05, + "loss": 0.0629, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 349, + "tokens_per_second_per_gpu": 422.0 + }, + { + "epoch": 0.8027522935779816, + "grad_norm": 0.026163572445511818, + "learning_rate": 1.0457476588814774e-05, + "loss": 0.0667, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 350, + "tokens_per_second_per_gpu": 389.59 + }, + { + "epoch": 0.805045871559633, + "grad_norm": 0.024867909029126167, + "learning_rate": 1.0226963104170002e-05, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 351, + "tokens_per_second_per_gpu": 429.91 + }, + { + "epoch": 0.8073394495412844, + "grad_norm": 0.023188138380646706, + "learning_rate": 9.998728860433276e-06, + "loss": 0.0645, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 352, + "tokens_per_second_per_gpu": 388.46 + }, + { + "epoch": 0.8096330275229358, + "grad_norm": 0.03035775013267994, + "learning_rate": 9.772786936837785e-06, + "loss": 0.0707, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 353, + "tokens_per_second_per_gpu": 397.77 + }, + { + "epoch": 0.8119266055045872, + "grad_norm": 0.04821021109819412, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0646, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 354, + "tokens_per_second_per_gpu": 445.4 + }, + { + "epoch": 0.8142201834862385, + "grad_norm": 0.030557144433259964, + "learning_rate": 9.327831709440792e-06, + "loss": 0.0659, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 355, + "tokens_per_second_per_gpu": 382.05 + }, + { + "epoch": 0.8165137614678899, + "grad_norm": 0.02662436105310917, + "learning_rate": 9.108843904324715e-06, + "loss": 0.0626, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 356, + "tokens_per_second_per_gpu": 412.62 + }, + { + "epoch": 0.8188073394495413, + "grad_norm": 0.027914568781852722, + "learning_rate": 8.8921994152595e-06, + "loss": 0.0681, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 357, + "tokens_per_second_per_gpu": 297.66 + }, + { + "epoch": 0.8211009174311926, + "grad_norm": 0.027242561802268028, + "learning_rate": 8.677910657313782e-06, + "loss": 0.067, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 358, + "tokens_per_second_per_gpu": 457.91 + }, + { + "epoch": 0.823394495412844, + "grad_norm": 0.030475802719593048, + "learning_rate": 8.465989910558209e-06, + "loss": 0.0689, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 359, + "tokens_per_second_per_gpu": 368.2 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 0.028360676020383835, + "learning_rate": 8.256449319361748e-06, + "loss": 0.0687, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 360, + "tokens_per_second_per_gpu": 384.64 + }, + { + "epoch": 0.8279816513761468, + "grad_norm": 0.031053343787789345, + "learning_rate": 8.049300891695744e-06, + "loss": 0.0754, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 361, + "tokens_per_second_per_gpu": 320.92 + }, + { + "epoch": 0.8302752293577982, + "grad_norm": 0.030271202325820923, + "learning_rate": 7.844556498445788e-06, + "loss": 0.072, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 362, + "tokens_per_second_per_gpu": 437.12 + }, + { + "epoch": 0.8325688073394495, + "grad_norm": 0.027202172204852104, + "learning_rate": 7.642227872731417e-06, + "loss": 0.0696, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 363, + "tokens_per_second_per_gpu": 332.31 + }, + { + "epoch": 0.8348623853211009, + "grad_norm": 0.02677847445011139, + "learning_rate": 7.4423266092337855e-06, + "loss": 0.0703, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 364, + "tokens_per_second_per_gpu": 364.39 + }, + { + "epoch": 0.8371559633027523, + "grad_norm": 0.0259072408080101, + "learning_rate": 7.244864163531162e-06, + "loss": 0.0678, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 365, + "tokens_per_second_per_gpu": 367.02 + }, + { + "epoch": 0.8394495412844036, + "grad_norm": 0.02673807553946972, + "learning_rate": 7.049851851442468e-06, + "loss": 0.0661, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 366, + "tokens_per_second_per_gpu": 475.35 + }, + { + "epoch": 0.841743119266055, + "grad_norm": 0.027974814176559448, + "learning_rate": 6.857300848378856e-06, + "loss": 0.0747, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 367, + "tokens_per_second_per_gpu": 409.23 + }, + { + "epoch": 0.8440366972477065, + "grad_norm": 0.022259563207626343, + "learning_rate": 6.667222188703226e-06, + "loss": 0.064, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 368, + "tokens_per_second_per_gpu": 440.59 + }, + { + "epoch": 0.8463302752293578, + "grad_norm": 0.02939799055457115, + "learning_rate": 6.479626765097918e-06, + "loss": 0.0693, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 369, + "tokens_per_second_per_gpu": 455.83 + }, + { + "epoch": 0.8486238532110092, + "grad_norm": 0.029195845127105713, + "learning_rate": 6.294525327940515e-06, + "loss": 0.0711, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 370, + "tokens_per_second_per_gpu": 394.89 + }, + { + "epoch": 0.8509174311926605, + "grad_norm": 0.0236493106931448, + "learning_rate": 6.111928484687723e-06, + "loss": 0.0643, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 371, + "tokens_per_second_per_gpu": 408.68 + }, + { + "epoch": 0.8532110091743119, + "grad_norm": 0.02727104350924492, + "learning_rate": 5.931846699267557e-06, + "loss": 0.067, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 372, + "tokens_per_second_per_gpu": 509.27 + }, + { + "epoch": 0.8555045871559633, + "grad_norm": 0.034410908818244934, + "learning_rate": 5.7542902914796745e-06, + "loss": 0.0624, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 373, + "tokens_per_second_per_gpu": 556.2 + }, + { + "epoch": 0.8577981651376146, + "grad_norm": 0.0287538543343544, + "learning_rate": 5.579269436403967e-06, + "loss": 0.0651, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 374, + "tokens_per_second_per_gpu": 381.61 + }, + { + "epoch": 0.8600917431192661, + "grad_norm": 0.02870243228971958, + "learning_rate": 5.4067941638174806e-06, + "loss": 0.0731, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 375, + "tokens_per_second_per_gpu": 361.13 + }, + { + "epoch": 0.8623853211009175, + "grad_norm": 0.026416806504130363, + "learning_rate": 5.2368743576196536e-06, + "loss": 0.064, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 376, + "tokens_per_second_per_gpu": 326.97 + }, + { + "epoch": 0.8646788990825688, + "grad_norm": 0.023003704845905304, + "learning_rate": 5.0695197552659e-06, + "loss": 0.0625, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 377, + "tokens_per_second_per_gpu": 438.76 + }, + { + "epoch": 0.8669724770642202, + "grad_norm": 0.037476420402526855, + "learning_rate": 4.9047399472095746e-06, + "loss": 0.0697, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 378, + "tokens_per_second_per_gpu": 345.81 + }, + { + "epoch": 0.8692660550458715, + "grad_norm": 0.02971925400197506, + "learning_rate": 4.742544376352443e-06, + "loss": 0.0663, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 379, + "tokens_per_second_per_gpu": 436.62 + }, + { + "epoch": 0.8715596330275229, + "grad_norm": 0.023713113740086555, + "learning_rate": 4.582942337503465e-06, + "loss": 0.0602, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 380, + "tokens_per_second_per_gpu": 448.46 + }, + { + "epoch": 0.8738532110091743, + "grad_norm": 0.02941006049513817, + "learning_rate": 4.425942976846187e-06, + "loss": 0.0725, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 381, + "tokens_per_second_per_gpu": 329.17 + }, + { + "epoch": 0.8761467889908257, + "grad_norm": 0.028299743309617043, + "learning_rate": 4.271555291414636e-06, + "loss": 0.072, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 382, + "tokens_per_second_per_gpu": 340.56 + }, + { + "epoch": 0.8784403669724771, + "grad_norm": 0.03180241584777832, + "learning_rate": 4.119788128577667e-06, + "loss": 0.0766, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 383, + "tokens_per_second_per_gpu": 446.49 + }, + { + "epoch": 0.8807339449541285, + "grad_norm": 0.026926379650831223, + "learning_rate": 3.9706501855319765e-06, + "loss": 0.0683, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 384, + "tokens_per_second_per_gpu": 440.01 + }, + { + "epoch": 0.8830275229357798, + "grad_norm": 0.03347824513912201, + "learning_rate": 3.824150008803767e-06, + "loss": 0.0751, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 385, + "tokens_per_second_per_gpu": 343.0 + }, + { + "epoch": 0.8853211009174312, + "grad_norm": 0.030953101813793182, + "learning_rate": 3.680295993758881e-06, + "loss": 0.0689, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 386, + "tokens_per_second_per_gpu": 393.96 + }, + { + "epoch": 0.8876146788990825, + "grad_norm": 0.032475098967552185, + "learning_rate": 3.539096384121743e-06, + "loss": 0.0828, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 387, + "tokens_per_second_per_gpu": 378.9 + }, + { + "epoch": 0.8899082568807339, + "grad_norm": 0.02490062825381756, + "learning_rate": 3.40055927150294e-06, + "loss": 0.0623, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 388, + "tokens_per_second_per_gpu": 408.26 + }, + { + "epoch": 0.8922018348623854, + "grad_norm": 0.02600006014108658, + "learning_rate": 3.2646925949355312e-06, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 389, + "tokens_per_second_per_gpu": 446.25 + }, + { + "epoch": 0.8944954128440367, + "grad_norm": 0.024244820699095726, + "learning_rate": 3.1315041404200663e-06, + "loss": 0.0655, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 390, + "tokens_per_second_per_gpu": 420.39 + }, + { + "epoch": 0.8967889908256881, + "grad_norm": 0.0253219585865736, + "learning_rate": 3.00100154047841e-06, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 391, + "tokens_per_second_per_gpu": 463.53 + }, + { + "epoch": 0.8990825688073395, + "grad_norm": 0.027757421135902405, + "learning_rate": 2.8731922737163685e-06, + "loss": 0.0681, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 392, + "tokens_per_second_per_gpu": 472.68 + }, + { + "epoch": 0.9013761467889908, + "grad_norm": 0.02381259575486183, + "learning_rate": 2.7480836643950956e-06, + "loss": 0.0596, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 393, + "tokens_per_second_per_gpu": 452.37 + }, + { + "epoch": 0.9036697247706422, + "grad_norm": 0.024906722828745842, + "learning_rate": 2.6256828820113766e-06, + "loss": 0.0669, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 394, + "tokens_per_second_per_gpu": 327.71 + }, + { + "epoch": 0.9059633027522935, + "grad_norm": 0.025515113025903702, + "learning_rate": 2.5059969408867843e-06, + "loss": 0.0636, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 395, + "tokens_per_second_per_gpu": 409.3 + }, + { + "epoch": 0.908256880733945, + "grad_norm": 0.026188403367996216, + "learning_rate": 2.3890326997656975e-06, + "loss": 0.0688, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 396, + "tokens_per_second_per_gpu": 371.81 + }, + { + "epoch": 0.9105504587155964, + "grad_norm": 0.027840575203299522, + "learning_rate": 2.274796861422246e-06, + "loss": 0.0737, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 397, + "tokens_per_second_per_gpu": 447.44 + }, + { + "epoch": 0.9128440366972477, + "grad_norm": 0.0268483255058527, + "learning_rate": 2.163295972276219e-06, + "loss": 0.0583, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 398, + "tokens_per_second_per_gpu": 383.02 + }, + { + "epoch": 0.9151376146788991, + "grad_norm": 0.027824856340885162, + "learning_rate": 2.054536422017922e-06, + "loss": 0.0767, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 399, + "tokens_per_second_per_gpu": 331.96 + }, + { + "epoch": 0.9174311926605505, + "grad_norm": 0.024313461035490036, + "learning_rate": 1.9485244432419667e-06, + "loss": 0.0694, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 400, + "tokens_per_second_per_gpu": 371.17 + }, + { + "epoch": 0.9197247706422018, + "grad_norm": 0.02038564346730709, + "learning_rate": 1.8452661110901715e-06, + "loss": 0.0563, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 401, + "tokens_per_second_per_gpu": 474.9 + }, + { + "epoch": 0.9220183486238532, + "grad_norm": 0.030249858275055885, + "learning_rate": 1.7447673429033362e-06, + "loss": 0.0685, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 402, + "tokens_per_second_per_gpu": 324.48 + }, + { + "epoch": 0.9243119266055045, + "grad_norm": 0.027523530647158623, + "learning_rate": 1.6470338978822108e-06, + "loss": 0.0666, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 403, + "tokens_per_second_per_gpu": 405.82 + }, + { + "epoch": 0.926605504587156, + "grad_norm": 0.026385333389043808, + "learning_rate": 1.5520713767574246e-06, + "loss": 0.0768, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 404, + "tokens_per_second_per_gpu": 380.7 + }, + { + "epoch": 0.9288990825688074, + "grad_norm": 0.02548050880432129, + "learning_rate": 1.4598852214685488e-06, + "loss": 0.0649, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 405, + "tokens_per_second_per_gpu": 421.77 + }, + { + "epoch": 0.9311926605504587, + "grad_norm": 0.0276033915579319, + "learning_rate": 1.3704807148521903e-06, + "loss": 0.0722, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 406, + "tokens_per_second_per_gpu": 391.2 + }, + { + "epoch": 0.9334862385321101, + "grad_norm": 0.025824090465903282, + "learning_rate": 1.2838629803393342e-06, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 407, + "tokens_per_second_per_gpu": 363.71 + }, + { + "epoch": 0.9357798165137615, + "grad_norm": 0.032180044800043106, + "learning_rate": 1.2000369816616674e-06, + "loss": 0.0677, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 408, + "tokens_per_second_per_gpu": 490.46 + }, + { + "epoch": 0.9380733944954128, + "grad_norm": 0.03195993974804878, + "learning_rate": 1.119007522567167e-06, + "loss": 0.08, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 409, + "tokens_per_second_per_gpu": 443.35 + }, + { + "epoch": 0.9403669724770642, + "grad_norm": 0.024462653324007988, + "learning_rate": 1.0407792465447986e-06, + "loss": 0.0589, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 410, + "tokens_per_second_per_gpu": 511.33 + }, + { + "epoch": 0.9426605504587156, + "grad_norm": 0.02783488854765892, + "learning_rate": 9.653566365584176e-07, + "loss": 0.0705, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 411, + "tokens_per_second_per_gpu": 407.62 + }, + { + "epoch": 0.944954128440367, + "grad_norm": 0.03449428081512451, + "learning_rate": 8.927440147898702e-07, + "loss": 0.0801, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 412, + "tokens_per_second_per_gpu": 306.83 + }, + { + "epoch": 0.9472477064220184, + "grad_norm": 0.027761735022068024, + "learning_rate": 8.229455423913013e-07, + "loss": 0.0749, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.77, + "memory/max_allocated (GiB)": 48.77, + "step": 413, + "tokens_per_second_per_gpu": 327.64 + }, + { + "epoch": 0.9495412844036697, + "grad_norm": 0.029755057767033577, + "learning_rate": 7.559652192467126e-07, + "loss": 0.0778, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 414, + "tokens_per_second_per_gpu": 384.79 + }, + { + "epoch": 0.9518348623853211, + "grad_norm": 0.028378870338201523, + "learning_rate": 6.918068837427128e-07, + "loss": 0.0672, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 415, + "tokens_per_second_per_gpu": 406.2 + }, + { + "epoch": 0.9541284403669725, + "grad_norm": 0.02773345075547695, + "learning_rate": 6.304742125485874e-07, + "loss": 0.06, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 416, + "tokens_per_second_per_gpu": 387.18 + }, + { + "epoch": 0.9564220183486238, + "grad_norm": 0.0268245879560709, + "learning_rate": 5.719707204055735e-07, + "loss": 0.0621, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 417, + "tokens_per_second_per_gpu": 411.81 + }, + { + "epoch": 0.9587155963302753, + "grad_norm": 0.033236313611269, + "learning_rate": 5.162997599254704e-07, + "loss": 0.0578, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.08, + "memory/max_allocated (GiB)": 49.08, + "step": 418, + "tokens_per_second_per_gpu": 471.32 + }, + { + "epoch": 0.9610091743119266, + "grad_norm": 0.022961758077144623, + "learning_rate": 4.634645213984934e-07, + "loss": 0.0643, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 419, + "tokens_per_second_per_gpu": 436.57 + }, + { + "epoch": 0.963302752293578, + "grad_norm": 0.028307458385825157, + "learning_rate": 4.134680326104645e-07, + "loss": 0.0691, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 420, + "tokens_per_second_per_gpu": 492.18 + }, + { + "epoch": 0.9655963302752294, + "grad_norm": 0.026976363733410835, + "learning_rate": 3.663131586692792e-07, + "loss": 0.0655, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 421, + "tokens_per_second_per_gpu": 327.53 + }, + { + "epoch": 0.9678899082568807, + "grad_norm": 0.024504244327545166, + "learning_rate": 3.2200260184075406e-07, + "loss": 0.0658, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 422, + "tokens_per_second_per_gpu": 419.83 + }, + { + "epoch": 0.9701834862385321, + "grad_norm": 0.023533035069704056, + "learning_rate": 2.805389013937454e-07, + "loss": 0.0556, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 423, + "tokens_per_second_per_gpu": 401.57 + }, + { + "epoch": 0.9724770642201835, + "grad_norm": 0.022774042561650276, + "learning_rate": 2.419244334546267e-07, + "loss": 0.0581, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.73, + "memory/max_allocated (GiB)": 48.73, + "step": 424, + "tokens_per_second_per_gpu": 329.45 + }, + { + "epoch": 0.9747706422018348, + "grad_norm": 0.03273961320519447, + "learning_rate": 2.061614108711474e-07, + "loss": 0.0824, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 425, + "tokens_per_second_per_gpu": 373.34 + }, + { + "epoch": 0.9770642201834863, + "grad_norm": 0.02143704518675804, + "learning_rate": 1.732518830856067e-07, + "loss": 0.0588, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 426, + "tokens_per_second_per_gpu": 431.87 + }, + { + "epoch": 0.9793577981651376, + "grad_norm": 0.026173925027251244, + "learning_rate": 1.431977360173975e-07, + "loss": 0.0678, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 427, + "tokens_per_second_per_gpu": 439.15 + }, + { + "epoch": 0.981651376146789, + "grad_norm": 0.026415711268782616, + "learning_rate": 1.16000691954965e-07, + "loss": 0.067, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.0, + "memory/max_allocated (GiB)": 49.0, + "step": 428, + "tokens_per_second_per_gpu": 480.36 + }, + { + "epoch": 0.9839449541284404, + "grad_norm": 0.025120330974459648, + "learning_rate": 9.1662309457069e-08, + "loss": 0.0644, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.97, + "memory/max_allocated (GiB)": 48.97, + "step": 429, + "tokens_per_second_per_gpu": 414.88 + }, + { + "epoch": 0.9862385321100917, + "grad_norm": 0.023429665714502335, + "learning_rate": 7.018398326350539e-08, + "loss": 0.0645, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.85, + "memory/max_allocated (GiB)": 48.85, + "step": 430, + "tokens_per_second_per_gpu": 330.04 + }, + { + "epoch": 0.9885321100917431, + "grad_norm": 0.03130911663174629, + "learning_rate": 5.15669442151423e-08, + "loss": 0.0723, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.81, + "memory/max_allocated (GiB)": 48.81, + "step": 431, + "tokens_per_second_per_gpu": 269.97 + }, + { + "epoch": 0.9908256880733946, + "grad_norm": 0.026494460180401802, + "learning_rate": 3.581225918342646e-08, + "loss": 0.0685, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.93, + "memory/max_allocated (GiB)": 48.93, + "step": 432, + "tokens_per_second_per_gpu": 387.46 + }, + { + "epoch": 0.9931192660550459, + "grad_norm": 0.032140735536813736, + "learning_rate": 2.292083100920994e-08, + "loss": 0.0631, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 433, + "tokens_per_second_per_gpu": 427.4 + }, + { + "epoch": 0.9954128440366973, + "grad_norm": 0.025799578055739403, + "learning_rate": 1.2893398451024886e-08, + "loss": 0.0695, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 49.04, + "memory/max_allocated (GiB)": 49.04, + "step": 434, + "tokens_per_second_per_gpu": 461.05 + }, + { + "epoch": 0.9977064220183486, + "grad_norm": 0.031855881214141846, + "learning_rate": 5.730536142745102e-09, + "loss": 0.0818, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 435, + "tokens_per_second_per_gpu": 415.34 + }, + { + "epoch": 1.0, + "grad_norm": 0.026343608275055885, + "learning_rate": 1.432654560679092e-09, + "loss": 0.0674, + "memory/device_reserved (GiB)": 50.97, + "memory/max_active (GiB)": 48.89, + "memory/max_allocated (GiB)": 48.89, + "step": 436, + "tokens_per_second_per_gpu": 342.89 + } + ], + "logging_steps": 1, + "max_steps": 436, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 60, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.337198826144924e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}