{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022935779816513763, "grad_norm": 0.12869106233119965, "learning_rate": 0.0, "loss": 0.1978, "memory/device_reserved (GiB)": 50.77, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 1, "tokens_per_second_per_gpu": 354.96 }, { "epoch": 0.0045871559633027525, "grad_norm": 0.15667210519313812, "learning_rate": 4.7619047619047615e-06, "loss": 0.2353, "memory/device_reserved (GiB)": 50.77, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 2, "tokens_per_second_per_gpu": 406.37 }, { "epoch": 0.006880733944954129, "grad_norm": 0.2217973917722702, "learning_rate": 9.523809523809523e-06, "loss": 0.2243, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 3, "tokens_per_second_per_gpu": 371.18 }, { "epoch": 0.009174311926605505, "grad_norm": 0.15948686003684998, "learning_rate": 1.4285714285714285e-05, "loss": 0.2392, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 4, "tokens_per_second_per_gpu": 414.48 }, { "epoch": 0.011467889908256881, "grad_norm": 0.153566375374794, "learning_rate": 1.9047619047619046e-05, "loss": 0.2182, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 5, "tokens_per_second_per_gpu": 369.22 }, { "epoch": 0.013761467889908258, "grad_norm": 0.1521972268819809, "learning_rate": 2.380952380952381e-05, "loss": 0.2112, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 6, "tokens_per_second_per_gpu": 429.31 }, { "epoch": 0.016055045871559634, "grad_norm": 0.168710395693779, "learning_rate": 2.857142857142857e-05, "loss": 0.226, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 7, "tokens_per_second_per_gpu": 417.78 }, { "epoch": 0.01834862385321101, "grad_norm": 0.13864850997924805, "learning_rate": 3.3333333333333335e-05, "loss": 0.1884, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 8, "tokens_per_second_per_gpu": 439.56 }, { "epoch": 0.020642201834862386, "grad_norm": 0.15227903425693512, "learning_rate": 3.809523809523809e-05, "loss": 0.1996, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 9, "tokens_per_second_per_gpu": 411.33 }, { "epoch": 0.022935779816513763, "grad_norm": 0.13421630859375, "learning_rate": 4.2857142857142856e-05, "loss": 0.1599, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 10, "tokens_per_second_per_gpu": 496.3 }, { "epoch": 0.02522935779816514, "grad_norm": 0.14955134689807892, "learning_rate": 4.761904761904762e-05, "loss": 0.1735, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 11, "tokens_per_second_per_gpu": 372.95 }, { "epoch": 0.027522935779816515, "grad_norm": 0.1432778388261795, "learning_rate": 5.2380952380952384e-05, "loss": 0.1515, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 12, "tokens_per_second_per_gpu": 398.65 }, { "epoch": 0.02981651376146789, "grad_norm": 0.14163611829280853, "learning_rate": 5.714285714285714e-05, "loss": 0.1517, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 13, "tokens_per_second_per_gpu": 440.5 }, { "epoch": 0.03211009174311927, "grad_norm": 0.15477906167507172, "learning_rate": 6.19047619047619e-05, "loss": 0.1444, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 14, "tokens_per_second_per_gpu": 385.32 }, { "epoch": 0.034403669724770644, "grad_norm": 0.1055532768368721, "learning_rate": 6.666666666666667e-05, "loss": 0.1292, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 15, "tokens_per_second_per_gpu": 453.02 }, { "epoch": 0.03669724770642202, "grad_norm": 0.10180933028459549, "learning_rate": 7.142857142857143e-05, "loss": 0.1208, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 16, "tokens_per_second_per_gpu": 474.27 }, { "epoch": 0.0389908256880734, "grad_norm": 0.07999677956104279, "learning_rate": 7.619047619047618e-05, "loss": 0.132, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 17, "tokens_per_second_per_gpu": 382.05 }, { "epoch": 0.04128440366972477, "grad_norm": 0.09194924682378769, "learning_rate": 8.095238095238096e-05, "loss": 0.1067, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 18, "tokens_per_second_per_gpu": 398.61 }, { "epoch": 0.04357798165137615, "grad_norm": 0.0931428000330925, "learning_rate": 8.571428571428571e-05, "loss": 0.1088, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 19, "tokens_per_second_per_gpu": 447.07 }, { "epoch": 0.045871559633027525, "grad_norm": 0.06202042102813721, "learning_rate": 9.047619047619048e-05, "loss": 0.0962, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 20, "tokens_per_second_per_gpu": 382.57 }, { "epoch": 0.0481651376146789, "grad_norm": 0.04220607504248619, "learning_rate": 9.523809523809524e-05, "loss": 0.0963, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 21, "tokens_per_second_per_gpu": 423.29 }, { "epoch": 0.05045871559633028, "grad_norm": 0.050066106021404266, "learning_rate": 0.0001, "loss": 0.1032, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 22, "tokens_per_second_per_gpu": 381.35 }, { "epoch": 0.052752293577981654, "grad_norm": 0.0557384118437767, "learning_rate": 9.999856734543933e-05, "loss": 0.1025, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 23, "tokens_per_second_per_gpu": 393.62 }, { "epoch": 0.05504587155963303, "grad_norm": 0.04612402245402336, "learning_rate": 9.999426946385727e-05, "loss": 0.0985, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 24, "tokens_per_second_per_gpu": 515.46 }, { "epoch": 0.05733944954128441, "grad_norm": 0.09721734374761581, "learning_rate": 9.998710660154898e-05, "loss": 0.1062, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 25, "tokens_per_second_per_gpu": 398.15 }, { "epoch": 0.05963302752293578, "grad_norm": 0.036745935678482056, "learning_rate": 9.997707916899079e-05, "loss": 0.1045, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 26, "tokens_per_second_per_gpu": 422.42 }, { "epoch": 0.06192660550458716, "grad_norm": 0.04298936203122139, "learning_rate": 9.996418774081658e-05, "loss": 0.0923, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 27, "tokens_per_second_per_gpu": 440.87 }, { "epoch": 0.06422018348623854, "grad_norm": 0.033536747097969055, "learning_rate": 9.994843305578486e-05, "loss": 0.096, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 28, "tokens_per_second_per_gpu": 370.28 }, { "epoch": 0.06651376146788991, "grad_norm": 0.03256046772003174, "learning_rate": 9.99298160167365e-05, "loss": 0.0832, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 29, "tokens_per_second_per_gpu": 357.19 }, { "epoch": 0.06880733944954129, "grad_norm": 0.042709868401288986, "learning_rate": 9.990833769054293e-05, "loss": 0.086, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 30, "tokens_per_second_per_gpu": 441.89 }, { "epoch": 0.07110091743119266, "grad_norm": 0.04347776621580124, "learning_rate": 9.988399930804504e-05, "loss": 0.1, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 31, "tokens_per_second_per_gpu": 348.66 }, { "epoch": 0.07339449541284404, "grad_norm": 0.030414681881666183, "learning_rate": 9.985680226398261e-05, "loss": 0.0811, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 32, "tokens_per_second_per_gpu": 435.28 }, { "epoch": 0.07568807339449542, "grad_norm": 0.034023743122816086, "learning_rate": 9.98267481169144e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 33, "tokens_per_second_per_gpu": 482.51 }, { "epoch": 0.0779816513761468, "grad_norm": 0.03136487305164337, "learning_rate": 9.979383858912885e-05, "loss": 0.0739, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 34, "tokens_per_second_per_gpu": 496.59 }, { "epoch": 0.08027522935779817, "grad_norm": 0.028108298778533936, "learning_rate": 9.975807556654537e-05, "loss": 0.077, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 35, "tokens_per_second_per_gpu": 349.1 }, { "epoch": 0.08256880733944955, "grad_norm": 0.028020795434713364, "learning_rate": 9.971946109860626e-05, "loss": 0.0775, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 36, "tokens_per_second_per_gpu": 351.02 }, { "epoch": 0.08486238532110092, "grad_norm": 0.028756650164723396, "learning_rate": 9.967799739815925e-05, "loss": 0.0788, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 37, "tokens_per_second_per_gpu": 534.52 }, { "epoch": 0.0871559633027523, "grad_norm": 0.02806459739804268, "learning_rate": 9.963368684133072e-05, "loss": 0.0809, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 38, "tokens_per_second_per_gpu": 367.94 }, { "epoch": 0.08944954128440367, "grad_norm": 0.02387731708586216, "learning_rate": 9.958653196738954e-05, "loss": 0.0642, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 39, "tokens_per_second_per_gpu": 466.74 }, { "epoch": 0.09174311926605505, "grad_norm": 0.027889851480722427, "learning_rate": 9.953653547860151e-05, "loss": 0.0904, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 40, "tokens_per_second_per_gpu": 371.51 }, { "epoch": 0.09403669724770643, "grad_norm": 0.031659577041864395, "learning_rate": 9.948370024007454e-05, "loss": 0.081, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 41, "tokens_per_second_per_gpu": 479.04 }, { "epoch": 0.0963302752293578, "grad_norm": 0.03186093270778656, "learning_rate": 9.942802927959443e-05, "loss": 0.0881, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 42, "tokens_per_second_per_gpu": 364.73 }, { "epoch": 0.09862385321100918, "grad_norm": 0.0313677079975605, "learning_rate": 9.936952578745142e-05, "loss": 0.0808, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 43, "tokens_per_second_per_gpu": 418.0 }, { "epoch": 0.10091743119266056, "grad_norm": 0.0264989472925663, "learning_rate": 9.93081931162573e-05, "loss": 0.0664, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 44, "tokens_per_second_per_gpu": 439.24 }, { "epoch": 0.10321100917431193, "grad_norm": 0.026272334158420563, "learning_rate": 9.92440347807533e-05, "loss": 0.0683, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 45, "tokens_per_second_per_gpu": 482.81 }, { "epoch": 0.10550458715596331, "grad_norm": 0.029066840186715126, "learning_rate": 9.91770544576087e-05, "loss": 0.0737, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 46, "tokens_per_second_per_gpu": 389.87 }, { "epoch": 0.10779816513761468, "grad_norm": 0.024542706087231636, "learning_rate": 9.910725598521013e-05, "loss": 0.0737, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 47, "tokens_per_second_per_gpu": 473.12 }, { "epoch": 0.11009174311926606, "grad_norm": 0.042941153049468994, "learning_rate": 9.90346433634416e-05, "loss": 0.0951, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 48, "tokens_per_second_per_gpu": 325.12 }, { "epoch": 0.11238532110091744, "grad_norm": 0.029044413939118385, "learning_rate": 9.89592207534552e-05, "loss": 0.0745, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.73, "memory/max_allocated (GiB)": 48.73, "step": 49, "tokens_per_second_per_gpu": 315.62 }, { "epoch": 0.11467889908256881, "grad_norm": 0.028920788317918777, "learning_rate": 9.888099247743283e-05, "loss": 0.0818, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 50, "tokens_per_second_per_gpu": 441.3 }, { "epoch": 0.11697247706422019, "grad_norm": 0.026095205917954445, "learning_rate": 9.879996301833833e-05, "loss": 0.0688, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 51, "tokens_per_second_per_gpu": 386.22 }, { "epoch": 0.11926605504587157, "grad_norm": 0.024823926389217377, "learning_rate": 9.871613701966067e-05, "loss": 0.0701, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 52, "tokens_per_second_per_gpu": 511.32 }, { "epoch": 0.12155963302752294, "grad_norm": 0.036093298345804214, "learning_rate": 9.862951928514782e-05, "loss": 0.0823, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 53, "tokens_per_second_per_gpu": 323.2 }, { "epoch": 0.12385321100917432, "grad_norm": 0.03257686272263527, "learning_rate": 9.854011477853146e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 54, "tokens_per_second_per_gpu": 447.62 }, { "epoch": 0.12614678899082568, "grad_norm": 0.03413158655166626, "learning_rate": 9.844792862324258e-05, "loss": 0.0728, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 55, "tokens_per_second_per_gpu": 451.05 }, { "epoch": 0.12844036697247707, "grad_norm": 0.02947932481765747, "learning_rate": 9.835296610211779e-05, "loss": 0.0713, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 56, "tokens_per_second_per_gpu": 457.44 }, { "epoch": 0.13073394495412843, "grad_norm": 0.0220651775598526, "learning_rate": 9.825523265709666e-05, "loss": 0.0607, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 57, "tokens_per_second_per_gpu": 456.49 }, { "epoch": 0.13302752293577982, "grad_norm": 0.026394842192530632, "learning_rate": 9.815473388890983e-05, "loss": 0.0716, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 58, "tokens_per_second_per_gpu": 393.95 }, { "epoch": 0.1353211009174312, "grad_norm": 0.027936838567256927, "learning_rate": 9.805147555675805e-05, "loss": 0.0738, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 59, "tokens_per_second_per_gpu": 464.83 }, { "epoch": 0.13761467889908258, "grad_norm": 0.023982539772987366, "learning_rate": 9.794546357798208e-05, "loss": 0.0608, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 60, "tokens_per_second_per_gpu": 450.66 }, { "epoch": 0.13990825688073394, "grad_norm": 0.027479754760861397, "learning_rate": 9.783670402772379e-05, "loss": 0.0672, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 61, "tokens_per_second_per_gpu": 455.94 }, { "epoch": 0.14220183486238533, "grad_norm": 0.02617599070072174, "learning_rate": 9.772520313857775e-05, "loss": 0.0804, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 62, "tokens_per_second_per_gpu": 394.85 }, { "epoch": 0.1444954128440367, "grad_norm": 0.030884992331266403, "learning_rate": 9.761096730023432e-05, "loss": 0.0768, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 63, "tokens_per_second_per_gpu": 446.63 }, { "epoch": 0.14678899082568808, "grad_norm": 0.027579287067055702, "learning_rate": 9.749400305911322e-05, "loss": 0.0659, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 64, "tokens_per_second_per_gpu": 484.34 }, { "epoch": 0.14908256880733944, "grad_norm": 0.030303625389933586, "learning_rate": 9.737431711798864e-05, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 65, "tokens_per_second_per_gpu": 437.07 }, { "epoch": 0.15137614678899083, "grad_norm": 0.027446158230304718, "learning_rate": 9.725191633560491e-05, "loss": 0.08, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 66, "tokens_per_second_per_gpu": 411.5 }, { "epoch": 0.1536697247706422, "grad_norm": 0.03177177160978317, "learning_rate": 9.712680772628364e-05, "loss": 0.0801, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 67, "tokens_per_second_per_gpu": 429.18 }, { "epoch": 0.1559633027522936, "grad_norm": 0.0288909412920475, "learning_rate": 9.69989984595216e-05, "loss": 0.0707, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 68, "tokens_per_second_per_gpu": 408.55 }, { "epoch": 0.15825688073394495, "grad_norm": 0.02751251310110092, "learning_rate": 9.686849585957994e-05, "loss": 0.0736, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 69, "tokens_per_second_per_gpu": 420.0 }, { "epoch": 0.16055045871559634, "grad_norm": 0.023428168147802353, "learning_rate": 9.673530740506447e-05, "loss": 0.0648, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 70, "tokens_per_second_per_gpu": 512.59 }, { "epoch": 0.1628440366972477, "grad_norm": 0.031534772366285324, "learning_rate": 9.659944072849707e-05, "loss": 0.0818, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 71, "tokens_per_second_per_gpu": 456.9 }, { "epoch": 0.1651376146788991, "grad_norm": 0.027208171784877777, "learning_rate": 9.646090361587827e-05, "loss": 0.0709, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 72, "tokens_per_second_per_gpu": 378.48 }, { "epoch": 0.16743119266055045, "grad_norm": 0.02961639314889908, "learning_rate": 9.631970400624113e-05, "loss": 0.0764, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 73, "tokens_per_second_per_gpu": 316.38 }, { "epoch": 0.16972477064220184, "grad_norm": 0.027367761358618736, "learning_rate": 9.617584999119625e-05, "loss": 0.0672, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 74, "tokens_per_second_per_gpu": 402.44 }, { "epoch": 0.1720183486238532, "grad_norm": 0.030167503282427788, "learning_rate": 9.602934981446803e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 75, "tokens_per_second_per_gpu": 531.29 }, { "epoch": 0.1743119266055046, "grad_norm": 0.0387263149023056, "learning_rate": 9.588021187142235e-05, "loss": 0.083, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 76, "tokens_per_second_per_gpu": 424.59 }, { "epoch": 0.17660550458715596, "grad_norm": 0.027617793530225754, "learning_rate": 9.572844470858537e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 77, "tokens_per_second_per_gpu": 461.9 }, { "epoch": 0.17889908256880735, "grad_norm": 0.029771512374281883, "learning_rate": 9.557405702315381e-05, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 78, "tokens_per_second_per_gpu": 475.77 }, { "epoch": 0.1811926605504587, "grad_norm": 0.029358675703406334, "learning_rate": 9.541705766249655e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 79, "tokens_per_second_per_gpu": 489.33 }, { "epoch": 0.1834862385321101, "grad_norm": 0.023111771792173386, "learning_rate": 9.525745562364756e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 80, "tokens_per_second_per_gpu": 382.84 }, { "epoch": 0.18577981651376146, "grad_norm": 0.029448291286826134, "learning_rate": 9.509526005279044e-05, "loss": 0.0608, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 81, "tokens_per_second_per_gpu": 415.81 }, { "epoch": 0.18807339449541285, "grad_norm": 0.02794116735458374, "learning_rate": 9.493048024473412e-05, "loss": 0.0736, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 82, "tokens_per_second_per_gpu": 400.02 }, { "epoch": 0.19036697247706422, "grad_norm": 0.04534873738884926, "learning_rate": 9.476312564238034e-05, "loss": 0.0673, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 83, "tokens_per_second_per_gpu": 369.1 }, { "epoch": 0.1926605504587156, "grad_norm": 0.026540853083133698, "learning_rate": 9.459320583618252e-05, "loss": 0.0558, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 84, "tokens_per_second_per_gpu": 611.61 }, { "epoch": 0.19495412844036697, "grad_norm": 0.03129403293132782, "learning_rate": 9.442073056359604e-05, "loss": 0.0741, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 85, "tokens_per_second_per_gpu": 492.16 }, { "epoch": 0.19724770642201836, "grad_norm": 0.027526071295142174, "learning_rate": 9.424570970852034e-05, "loss": 0.0733, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 86, "tokens_per_second_per_gpu": 427.76 }, { "epoch": 0.19954128440366972, "grad_norm": 0.025468798354268074, "learning_rate": 9.406815330073244e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 87, "tokens_per_second_per_gpu": 462.82 }, { "epoch": 0.2018348623853211, "grad_norm": 0.029043635353446007, "learning_rate": 9.388807151531229e-05, "loss": 0.0758, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 88, "tokens_per_second_per_gpu": 353.91 }, { "epoch": 0.20412844036697247, "grad_norm": 0.03196391835808754, "learning_rate": 9.37054746720595e-05, "loss": 0.0678, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 89, "tokens_per_second_per_gpu": 411.71 }, { "epoch": 0.20642201834862386, "grad_norm": 0.033272091299295425, "learning_rate": 9.352037323490208e-05, "loss": 0.0722, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 90, "tokens_per_second_per_gpu": 398.81 }, { "epoch": 0.20871559633027523, "grad_norm": 0.03096090629696846, "learning_rate": 9.333277781129678e-05, "loss": 0.0809, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 91, "tokens_per_second_per_gpu": 393.81 }, { "epoch": 0.21100917431192662, "grad_norm": 0.026267440989613533, "learning_rate": 9.314269915162114e-05, "loss": 0.0604, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 92, "tokens_per_second_per_gpu": 453.78 }, { "epoch": 0.21330275229357798, "grad_norm": 0.02608361840248108, "learning_rate": 9.295014814855753e-05, "loss": 0.0663, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 93, "tokens_per_second_per_gpu": 430.47 }, { "epoch": 0.21559633027522937, "grad_norm": 0.024829065427184105, "learning_rate": 9.275513583646884e-05, "loss": 0.0598, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 94, "tokens_per_second_per_gpu": 384.01 }, { "epoch": 0.21788990825688073, "grad_norm": 0.03385532647371292, "learning_rate": 9.255767339076622e-05, "loss": 0.0719, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 95, "tokens_per_second_per_gpu": 440.35 }, { "epoch": 0.22018348623853212, "grad_norm": 0.029608217999339104, "learning_rate": 9.23577721272686e-05, "loss": 0.094, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 96, "tokens_per_second_per_gpu": 485.56 }, { "epoch": 0.22247706422018348, "grad_norm": 0.02693762816488743, "learning_rate": 9.215544350155422e-05, "loss": 0.0755, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 97, "tokens_per_second_per_gpu": 432.16 }, { "epoch": 0.22477064220183487, "grad_norm": 0.02771424688398838, "learning_rate": 9.195069910830427e-05, "loss": 0.0692, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 98, "tokens_per_second_per_gpu": 412.93 }, { "epoch": 0.22706422018348624, "grad_norm": 0.02276022732257843, "learning_rate": 9.174355068063828e-05, "loss": 0.0637, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 99, "tokens_per_second_per_gpu": 418.24 }, { "epoch": 0.22935779816513763, "grad_norm": 0.026155246421694756, "learning_rate": 9.15340100894418e-05, "loss": 0.0698, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 100, "tokens_per_second_per_gpu": 403.6 }, { "epoch": 0.231651376146789, "grad_norm": 0.022778436541557312, "learning_rate": 9.132208934268622e-05, "loss": 0.0654, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 101, "tokens_per_second_per_gpu": 491.32 }, { "epoch": 0.23394495412844038, "grad_norm": 0.04701945558190346, "learning_rate": 9.110780058474052e-05, "loss": 0.0741, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 102, "tokens_per_second_per_gpu": 444.03 }, { "epoch": 0.23623853211009174, "grad_norm": 0.030211661010980606, "learning_rate": 9.08911560956753e-05, "loss": 0.0789, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 103, "tokens_per_second_per_gpu": 514.87 }, { "epoch": 0.23853211009174313, "grad_norm": 0.026159459725022316, "learning_rate": 9.067216829055922e-05, "loss": 0.0637, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 104, "tokens_per_second_per_gpu": 446.47 }, { "epoch": 0.2408256880733945, "grad_norm": 0.02918146923184395, "learning_rate": 9.045084971874738e-05, "loss": 0.0727, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 105, "tokens_per_second_per_gpu": 425.37 }, { "epoch": 0.24311926605504589, "grad_norm": 0.03170175105333328, "learning_rate": 9.022721306316222e-05, "loss": 0.0857, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 106, "tokens_per_second_per_gpu": 301.79 }, { "epoch": 0.24541284403669725, "grad_norm": 0.032674651592969894, "learning_rate": 9.000127113956674e-05, "loss": 0.0795, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 107, "tokens_per_second_per_gpu": 338.41 }, { "epoch": 0.24770642201834864, "grad_norm": 0.026492780074477196, "learning_rate": 8.977303689583e-05, "loss": 0.0775, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 108, "tokens_per_second_per_gpu": 383.35 }, { "epoch": 0.25, "grad_norm": 0.0290480125695467, "learning_rate": 8.954252341118523e-05, "loss": 0.076, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 109, "tokens_per_second_per_gpu": 382.78 }, { "epoch": 0.25229357798165136, "grad_norm": 0.030473977327346802, "learning_rate": 8.930974389548023e-05, "loss": 0.0761, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 110, "tokens_per_second_per_gpu": 476.56 }, { "epoch": 0.2545871559633027, "grad_norm": 0.02930077351629734, "learning_rate": 8.90747116884204e-05, "loss": 0.0691, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 111, "tokens_per_second_per_gpu": 441.2 }, { "epoch": 0.25688073394495414, "grad_norm": 0.02884151227772236, "learning_rate": 8.883744025880428e-05, "loss": 0.0806, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 112, "tokens_per_second_per_gpu": 406.96 }, { "epoch": 0.2591743119266055, "grad_norm": 0.02618175558745861, "learning_rate": 8.859794320375168e-05, "loss": 0.0677, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 113, "tokens_per_second_per_gpu": 430.04 }, { "epoch": 0.26146788990825687, "grad_norm": 0.026963548734784126, "learning_rate": 8.835623424792452e-05, "loss": 0.0694, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 114, "tokens_per_second_per_gpu": 351.9 }, { "epoch": 0.26376146788990823, "grad_norm": 0.021544624119997025, "learning_rate": 8.811232724274035e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 115, "tokens_per_second_per_gpu": 480.22 }, { "epoch": 0.26605504587155965, "grad_norm": 0.03840009495615959, "learning_rate": 8.786623616557847e-05, "loss": 0.0723, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 116, "tokens_per_second_per_gpu": 433.18 }, { "epoch": 0.268348623853211, "grad_norm": 0.022571468725800514, "learning_rate": 8.761797511897906e-05, "loss": 0.065, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 117, "tokens_per_second_per_gpu": 421.92 }, { "epoch": 0.2706422018348624, "grad_norm": 0.02688576467335224, "learning_rate": 8.736755832983497e-05, "loss": 0.0772, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 118, "tokens_per_second_per_gpu": 354.3 }, { "epoch": 0.27293577981651373, "grad_norm": 0.025858785957098007, "learning_rate": 8.711500014857634e-05, "loss": 0.0745, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 119, "tokens_per_second_per_gpu": 365.46 }, { "epoch": 0.27522935779816515, "grad_norm": 0.02718079835176468, "learning_rate": 8.686031504834843e-05, "loss": 0.0759, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 120, "tokens_per_second_per_gpu": 426.06 }, { "epoch": 0.2775229357798165, "grad_norm": 0.028197383508086205, "learning_rate": 8.660351762418203e-05, "loss": 0.0753, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 121, "tokens_per_second_per_gpu": 483.89 }, { "epoch": 0.2798165137614679, "grad_norm": 0.02615584433078766, "learning_rate": 8.634462259215719e-05, "loss": 0.0692, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 122, "tokens_per_second_per_gpu": 347.59 }, { "epoch": 0.28211009174311924, "grad_norm": 0.028645118698477745, "learning_rate": 8.608364478855983e-05, "loss": 0.0784, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 123, "tokens_per_second_per_gpu": 472.02 }, { "epoch": 0.28440366972477066, "grad_norm": 0.03761473670601845, "learning_rate": 8.58205991690316e-05, "loss": 0.0663, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 124, "tokens_per_second_per_gpu": 439.34 }, { "epoch": 0.286697247706422, "grad_norm": 0.024080324918031693, "learning_rate": 8.555550080771273e-05, "loss": 0.0685, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 125, "tokens_per_second_per_gpu": 413.4 }, { "epoch": 0.2889908256880734, "grad_norm": 0.03224342688918114, "learning_rate": 8.528836489637828e-05, "loss": 0.0813, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 126, "tokens_per_second_per_gpu": 299.66 }, { "epoch": 0.29128440366972475, "grad_norm": 0.02632022649049759, "learning_rate": 8.501920674356754e-05, "loss": 0.0649, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 127, "tokens_per_second_per_gpu": 424.46 }, { "epoch": 0.29357798165137616, "grad_norm": 0.025439690798521042, "learning_rate": 8.47480417737067e-05, "loss": 0.0692, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 128, "tokens_per_second_per_gpu": 443.94 }, { "epoch": 0.2958715596330275, "grad_norm": 0.028366245329380035, "learning_rate": 8.447488552622498e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 129, "tokens_per_second_per_gpu": 392.47 }, { "epoch": 0.2981651376146789, "grad_norm": 0.028246046975255013, "learning_rate": 8.419975365466415e-05, "loss": 0.0693, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 130, "tokens_per_second_per_gpu": 385.79 }, { "epoch": 0.30045871559633025, "grad_norm": 0.029451027512550354, "learning_rate": 8.392266192578143e-05, "loss": 0.0731, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 131, "tokens_per_second_per_gpu": 401.98 }, { "epoch": 0.30275229357798167, "grad_norm": 0.03156789019703865, "learning_rate": 8.364362621864595e-05, "loss": 0.0733, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 132, "tokens_per_second_per_gpu": 406.2 }, { "epoch": 0.30504587155963303, "grad_norm": 0.0247171763330698, "learning_rate": 8.336266252372889e-05, "loss": 0.0723, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 133, "tokens_per_second_per_gpu": 467.27 }, { "epoch": 0.3073394495412844, "grad_norm": 0.024775700643658638, "learning_rate": 8.307978694198699e-05, "loss": 0.0644, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 134, "tokens_per_second_per_gpu": 377.14 }, { "epoch": 0.30963302752293576, "grad_norm": 0.025003118440508842, "learning_rate": 8.279501568393994e-05, "loss": 0.0684, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 135, "tokens_per_second_per_gpu": 368.87 }, { "epoch": 0.3119266055045872, "grad_norm": 0.028482772409915924, "learning_rate": 8.250836506874142e-05, "loss": 0.0705, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 136, "tokens_per_second_per_gpu": 439.79 }, { "epoch": 0.31422018348623854, "grad_norm": 0.02605322189629078, "learning_rate": 8.221985152324385e-05, "loss": 0.0638, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 137, "tokens_per_second_per_gpu": 438.9 }, { "epoch": 0.3165137614678899, "grad_norm": 0.030314577743411064, "learning_rate": 8.192949158105713e-05, "loss": 0.0682, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 138, "tokens_per_second_per_gpu": 355.22 }, { "epoch": 0.31880733944954126, "grad_norm": 0.02862844057381153, "learning_rate": 8.163730188160105e-05, "loss": 0.0764, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 139, "tokens_per_second_per_gpu": 430.33 }, { "epoch": 0.3211009174311927, "grad_norm": 0.030885115265846252, "learning_rate": 8.134329916915184e-05, "loss": 0.0774, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 140, "tokens_per_second_per_gpu": 369.87 }, { "epoch": 0.32339449541284404, "grad_norm": 0.025037452578544617, "learning_rate": 8.104750029188257e-05, "loss": 0.0695, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 141, "tokens_per_second_per_gpu": 538.21 }, { "epoch": 0.3256880733944954, "grad_norm": 0.02607853338122368, "learning_rate": 8.074992220089769e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 142, "tokens_per_second_per_gpu": 443.91 }, { "epoch": 0.32798165137614677, "grad_norm": 0.028251491487026215, "learning_rate": 8.045058194926153e-05, "loss": 0.0691, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 143, "tokens_per_second_per_gpu": 403.07 }, { "epoch": 0.3302752293577982, "grad_norm": 0.02848455123603344, "learning_rate": 8.014949669102117e-05, "loss": 0.0712, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 144, "tokens_per_second_per_gpu": 421.87 }, { "epoch": 0.33256880733944955, "grad_norm": 0.027499854564666748, "learning_rate": 7.984668368022335e-05, "loss": 0.071, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 145, "tokens_per_second_per_gpu": 310.07 }, { "epoch": 0.3348623853211009, "grad_norm": 0.05668507516384125, "learning_rate": 7.954216026992571e-05, "loss": 0.072, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 146, "tokens_per_second_per_gpu": 430.94 }, { "epoch": 0.33715596330275227, "grad_norm": 0.023797793313860893, "learning_rate": 7.923594391120236e-05, "loss": 0.0724, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 147, "tokens_per_second_per_gpu": 506.38 }, { "epoch": 0.3394495412844037, "grad_norm": 0.03140917047858238, "learning_rate": 7.892805215214381e-05, "loss": 0.0707, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 148, "tokens_per_second_per_gpu": 392.49 }, { "epoch": 0.34174311926605505, "grad_norm": 0.023651011288166046, "learning_rate": 7.861850263685134e-05, "loss": 0.0675, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 149, "tokens_per_second_per_gpu": 468.39 }, { "epoch": 0.3440366972477064, "grad_norm": 0.028501421213150024, "learning_rate": 7.830731310442599e-05, "loss": 0.0677, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 150, "tokens_per_second_per_gpu": 377.79 }, { "epoch": 0.3463302752293578, "grad_norm": 0.028334010392427444, "learning_rate": 7.799450138795185e-05, "loss": 0.0749, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 151, "tokens_per_second_per_gpu": 370.82 }, { "epoch": 0.3486238532110092, "grad_norm": 0.029713135212659836, "learning_rate": 7.768008541347423e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 152, "tokens_per_second_per_gpu": 403.75 }, { "epoch": 0.35091743119266056, "grad_norm": 0.030461538583040237, "learning_rate": 7.73640831989723e-05, "loss": 0.0667, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 153, "tokens_per_second_per_gpu": 473.97 }, { "epoch": 0.3532110091743119, "grad_norm": 0.02694588340818882, "learning_rate": 7.704651285332663e-05, "loss": 0.0642, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 154, "tokens_per_second_per_gpu": 421.0 }, { "epoch": 0.3555045871559633, "grad_norm": 0.025780972093343735, "learning_rate": 7.672739257528134e-05, "loss": 0.0727, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 155, "tokens_per_second_per_gpu": 507.84 }, { "epoch": 0.3577981651376147, "grad_norm": 0.027480922639369965, "learning_rate": 7.640674065240136e-05, "loss": 0.078, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 156, "tokens_per_second_per_gpu": 334.0 }, { "epoch": 0.36009174311926606, "grad_norm": 0.032992683351039886, "learning_rate": 7.608457546002424e-05, "loss": 0.0728, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 157, "tokens_per_second_per_gpu": 315.95 }, { "epoch": 0.3623853211009174, "grad_norm": 0.029259737581014633, "learning_rate": 7.576091546020725e-05, "loss": 0.0721, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 158, "tokens_per_second_per_gpu": 390.2 }, { "epoch": 0.3646788990825688, "grad_norm": 0.027205413207411766, "learning_rate": 7.543577920066944e-05, "loss": 0.0726, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 159, "tokens_per_second_per_gpu": 459.84 }, { "epoch": 0.3669724770642202, "grad_norm": 0.028103800490498543, "learning_rate": 7.510918531372857e-05, "loss": 0.0723, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 160, "tokens_per_second_per_gpu": 362.42 }, { "epoch": 0.36926605504587157, "grad_norm": 0.025422796607017517, "learning_rate": 7.478115251523352e-05, "loss": 0.0651, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 161, "tokens_per_second_per_gpu": 409.68 }, { "epoch": 0.37155963302752293, "grad_norm": 0.0247375275939703, "learning_rate": 7.445169960349167e-05, "loss": 0.0648, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 162, "tokens_per_second_per_gpu": 443.52 }, { "epoch": 0.3738532110091743, "grad_norm": 0.024430420249700546, "learning_rate": 7.412084545819168e-05, "loss": 0.0654, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 163, "tokens_per_second_per_gpu": 439.86 }, { "epoch": 0.3761467889908257, "grad_norm": 0.02779349498450756, "learning_rate": 7.378860903932159e-05, "loss": 0.07, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 164, "tokens_per_second_per_gpu": 387.88 }, { "epoch": 0.37844036697247707, "grad_norm": 0.028585737571120262, "learning_rate": 7.34550093860822e-05, "loss": 0.0794, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 165, "tokens_per_second_per_gpu": 469.25 }, { "epoch": 0.38073394495412843, "grad_norm": 0.028040310367941856, "learning_rate": 7.31200656157961e-05, "loss": 0.0702, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 166, "tokens_per_second_per_gpu": 340.82 }, { "epoch": 0.3830275229357798, "grad_norm": 0.030313577502965927, "learning_rate": 7.278379692281208e-05, "loss": 0.0694, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 167, "tokens_per_second_per_gpu": 414.21 }, { "epoch": 0.3853211009174312, "grad_norm": 0.032695479691028595, "learning_rate": 7.244622257740523e-05, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 168, "tokens_per_second_per_gpu": 435.84 }, { "epoch": 0.3876146788990826, "grad_norm": 0.02221628651022911, "learning_rate": 7.210736192467256e-05, "loss": 0.0596, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 169, "tokens_per_second_per_gpu": 451.04 }, { "epoch": 0.38990825688073394, "grad_norm": 0.02417284995317459, "learning_rate": 7.176723438342446e-05, "loss": 0.0714, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 170, "tokens_per_second_per_gpu": 444.02 }, { "epoch": 0.3922018348623853, "grad_norm": 0.027553344145417213, "learning_rate": 7.142585944507185e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 171, "tokens_per_second_per_gpu": 436.33 }, { "epoch": 0.3944954128440367, "grad_norm": 0.028384285047650337, "learning_rate": 7.10832566725092e-05, "loss": 0.0634, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 172, "tokens_per_second_per_gpu": 389.66 }, { "epoch": 0.3967889908256881, "grad_norm": 0.024850716814398766, "learning_rate": 7.073944569899354e-05, "loss": 0.0717, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 173, "tokens_per_second_per_gpu": 475.5 }, { "epoch": 0.39908256880733944, "grad_norm": 0.025330083444714546, "learning_rate": 7.039444622701922e-05, "loss": 0.0724, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 174, "tokens_per_second_per_gpu": 383.81 }, { "epoch": 0.4013761467889908, "grad_norm": 0.025969544425606728, "learning_rate": 7.00482780271889e-05, "loss": 0.0712, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 175, "tokens_per_second_per_gpu": 385.6 }, { "epoch": 0.4036697247706422, "grad_norm": 0.02731173112988472, "learning_rate": 6.97009609370806e-05, "loss": 0.0678, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 176, "tokens_per_second_per_gpu": 430.61 }, { "epoch": 0.4059633027522936, "grad_norm": 0.028133299201726913, "learning_rate": 6.935251486011087e-05, "loss": 0.061, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 177, "tokens_per_second_per_gpu": 379.64 }, { "epoch": 0.40825688073394495, "grad_norm": 0.02273411862552166, "learning_rate": 6.900295976439413e-05, "loss": 0.0604, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 178, "tokens_per_second_per_gpu": 393.24 }, { "epoch": 0.4105504587155963, "grad_norm": 0.025121403858065605, "learning_rate": 6.865231568159846e-05, "loss": 0.0697, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 179, "tokens_per_second_per_gpu": 453.6 }, { "epoch": 0.41284403669724773, "grad_norm": 0.029893774539232254, "learning_rate": 6.830060270579768e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 180, "tokens_per_second_per_gpu": 402.38 }, { "epoch": 0.4151376146788991, "grad_norm": 0.026196127757430077, "learning_rate": 6.794784099231972e-05, "loss": 0.0653, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 181, "tokens_per_second_per_gpu": 369.19 }, { "epoch": 0.41743119266055045, "grad_norm": 0.03042738139629364, "learning_rate": 6.759405075659166e-05, "loss": 0.0654, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 182, "tokens_per_second_per_gpu": 389.74 }, { "epoch": 0.4197247706422018, "grad_norm": 0.02454569563269615, "learning_rate": 6.723925227298132e-05, "loss": 0.0648, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 183, "tokens_per_second_per_gpu": 383.9 }, { "epoch": 0.42201834862385323, "grad_norm": 0.03029336780309677, "learning_rate": 6.688346587363533e-05, "loss": 0.0711, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 184, "tokens_per_second_per_gpu": 436.44 }, { "epoch": 0.4243119266055046, "grad_norm": 0.02716301940381527, "learning_rate": 6.652671194731396e-05, "loss": 0.0638, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 185, "tokens_per_second_per_gpu": 405.73 }, { "epoch": 0.42660550458715596, "grad_norm": 0.030476156622171402, "learning_rate": 6.616901093822283e-05, "loss": 0.0742, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 186, "tokens_per_second_per_gpu": 417.15 }, { "epoch": 0.4288990825688073, "grad_norm": 0.024246055632829666, "learning_rate": 6.58103833448412e-05, "loss": 0.0606, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 187, "tokens_per_second_per_gpu": 418.65 }, { "epoch": 0.43119266055045874, "grad_norm": 0.025659549981355667, "learning_rate": 6.545084971874738e-05, "loss": 0.0643, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 188, "tokens_per_second_per_gpu": 524.48 }, { "epoch": 0.4334862385321101, "grad_norm": 0.02851368486881256, "learning_rate": 6.509043066344092e-05, "loss": 0.0728, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 189, "tokens_per_second_per_gpu": 470.95 }, { "epoch": 0.43577981651376146, "grad_norm": 0.03035641275346279, "learning_rate": 6.472914683316195e-05, "loss": 0.0797, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 190, "tokens_per_second_per_gpu": 409.73 }, { "epoch": 0.4380733944954128, "grad_norm": 0.026916082948446274, "learning_rate": 6.436701893170756e-05, "loss": 0.06, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 191, "tokens_per_second_per_gpu": 424.58 }, { "epoch": 0.44036697247706424, "grad_norm": 0.035412922501564026, "learning_rate": 6.400406771124536e-05, "loss": 0.0699, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 192, "tokens_per_second_per_gpu": 372.44 }, { "epoch": 0.4426605504587156, "grad_norm": 0.02869465760886669, "learning_rate": 6.364031397112416e-05, "loss": 0.0709, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 193, "tokens_per_second_per_gpu": 411.07 }, { "epoch": 0.44495412844036697, "grad_norm": 0.02998914197087288, "learning_rate": 6.327577855668216e-05, "loss": 0.0693, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 194, "tokens_per_second_per_gpu": 473.81 }, { "epoch": 0.44724770642201833, "grad_norm": 0.029111091047525406, "learning_rate": 6.291048235805234e-05, "loss": 0.0789, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 195, "tokens_per_second_per_gpu": 393.48 }, { "epoch": 0.44954128440366975, "grad_norm": 0.028819169849157333, "learning_rate": 6.254444630896529e-05, "loss": 0.0738, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 196, "tokens_per_second_per_gpu": 339.21 }, { "epoch": 0.4518348623853211, "grad_norm": 0.027091829106211662, "learning_rate": 6.21776913855496e-05, "loss": 0.0606, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 197, "tokens_per_second_per_gpu": 490.05 }, { "epoch": 0.4541284403669725, "grad_norm": 0.023907724767923355, "learning_rate": 6.181023860512984e-05, "loss": 0.0664, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 198, "tokens_per_second_per_gpu": 437.98 }, { "epoch": 0.45642201834862384, "grad_norm": 0.026607749983668327, "learning_rate": 6.144210902502207e-05, "loss": 0.0686, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 199, "tokens_per_second_per_gpu": 518.9 }, { "epoch": 0.45871559633027525, "grad_norm": 0.028734847903251648, "learning_rate": 6.107332374132715e-05, "loss": 0.0709, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 200, "tokens_per_second_per_gpu": 448.6 }, { "epoch": 0.4610091743119266, "grad_norm": 0.027956590056419373, "learning_rate": 6.0703903887721837e-05, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 201, "tokens_per_second_per_gpu": 450.75 }, { "epoch": 0.463302752293578, "grad_norm": 0.02955472283065319, "learning_rate": 6.0333870634247645e-05, "loss": 0.0749, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 202, "tokens_per_second_per_gpu": 366.38 }, { "epoch": 0.46559633027522934, "grad_norm": 0.033545345067977905, "learning_rate": 5.9963245186097725e-05, "loss": 0.0714, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 203, "tokens_per_second_per_gpu": 409.9 }, { "epoch": 0.46788990825688076, "grad_norm": 0.027358222752809525, "learning_rate": 5.95920487824016e-05, "loss": 0.0632, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 204, "tokens_per_second_per_gpu": 409.18 }, { "epoch": 0.4701834862385321, "grad_norm": 0.026303566992282867, "learning_rate": 5.922030269500809e-05, "loss": 0.0621, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 205, "tokens_per_second_per_gpu": 344.97 }, { "epoch": 0.4724770642201835, "grad_norm": 0.023472387343645096, "learning_rate": 5.8848028227266325e-05, "loss": 0.0642, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 206, "tokens_per_second_per_gpu": 458.3 }, { "epoch": 0.47477064220183485, "grad_norm": 0.02930634468793869, "learning_rate": 5.847524671280484e-05, "loss": 0.07, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 207, "tokens_per_second_per_gpu": 386.88 }, { "epoch": 0.47706422018348627, "grad_norm": 0.02035793662071228, "learning_rate": 5.810197951430911e-05, "loss": 0.0558, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 208, "tokens_per_second_per_gpu": 479.37 }, { "epoch": 0.4793577981651376, "grad_norm": 0.027948010712862015, "learning_rate": 5.772824802229733e-05, "loss": 0.07, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 209, "tokens_per_second_per_gpu": 352.97 }, { "epoch": 0.481651376146789, "grad_norm": 0.027743425220251083, "learning_rate": 5.735407365389453e-05, "loss": 0.0686, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 210, "tokens_per_second_per_gpu": 419.65 }, { "epoch": 0.48394495412844035, "grad_norm": 0.03574339672923088, "learning_rate": 5.697947785160532e-05, "loss": 0.0593, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 211, "tokens_per_second_per_gpu": 391.99 }, { "epoch": 0.48623853211009177, "grad_norm": 0.03303733468055725, "learning_rate": 5.660448208208513e-05, "loss": 0.0615, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 212, "tokens_per_second_per_gpu": 420.47 }, { "epoch": 0.48853211009174313, "grad_norm": 0.030316850170493126, "learning_rate": 5.622910783490988e-05, "loss": 0.0745, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 213, "tokens_per_second_per_gpu": 379.16 }, { "epoch": 0.4908256880733945, "grad_norm": 0.031506236642599106, "learning_rate": 5.585337662134471e-05, "loss": 0.0724, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 214, "tokens_per_second_per_gpu": 376.6 }, { "epoch": 0.49311926605504586, "grad_norm": 0.025807412341237068, "learning_rate": 5.5477309973111046e-05, "loss": 0.0628, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 215, "tokens_per_second_per_gpu": 386.77 }, { "epoch": 0.4954128440366973, "grad_norm": 0.02294624038040638, "learning_rate": 5.510092944115286e-05, "loss": 0.0629, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 216, "tokens_per_second_per_gpu": 473.64 }, { "epoch": 0.49770642201834864, "grad_norm": 0.027048619464039803, "learning_rate": 5.472425659440157e-05, "loss": 0.0675, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 217, "tokens_per_second_per_gpu": 374.21 }, { "epoch": 0.5, "grad_norm": 0.026564767584204674, "learning_rate": 5.4347313018540056e-05, "loss": 0.0697, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 218, "tokens_per_second_per_gpu": 442.12 }, { "epoch": 0.5022935779816514, "grad_norm": 0.03516434505581856, "learning_rate": 5.397012031476562e-05, "loss": 0.082, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 219, "tokens_per_second_per_gpu": 380.84 }, { "epoch": 0.5045871559633027, "grad_norm": 0.021558105945587158, "learning_rate": 5.359270009855216e-05, "loss": 0.0585, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 220, "tokens_per_second_per_gpu": 509.31 }, { "epoch": 0.5068807339449541, "grad_norm": 0.024724913761019707, "learning_rate": 5.321507399841148e-05, "loss": 0.0632, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 221, "tokens_per_second_per_gpu": 438.7 }, { "epoch": 0.5091743119266054, "grad_norm": 0.02698579616844654, "learning_rate": 5.2837263654653715e-05, "loss": 0.0715, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 222, "tokens_per_second_per_gpu": 337.92 }, { "epoch": 0.5114678899082569, "grad_norm": 0.03043169341981411, "learning_rate": 5.2459290718147344e-05, "loss": 0.0755, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 223, "tokens_per_second_per_gpu": 485.96 }, { "epoch": 0.5137614678899083, "grad_norm": 0.026405537500977516, "learning_rate": 5.2081176849078464e-05, "loss": 0.0641, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 224, "tokens_per_second_per_gpu": 434.97 }, { "epoch": 0.5160550458715596, "grad_norm": 0.024269182235002518, "learning_rate": 5.170294371570939e-05, "loss": 0.0666, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 225, "tokens_per_second_per_gpu": 399.27 }, { "epoch": 0.518348623853211, "grad_norm": 0.03496242314577103, "learning_rate": 5.132461299313709e-05, "loss": 0.073, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 226, "tokens_per_second_per_gpu": 422.84 }, { "epoch": 0.5206422018348624, "grad_norm": 0.029179584234952927, "learning_rate": 5.094620636205095e-05, "loss": 0.0697, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 227, "tokens_per_second_per_gpu": 357.38 }, { "epoch": 0.5229357798165137, "grad_norm": 0.027006233111023903, "learning_rate": 5.056774550749043e-05, "loss": 0.0614, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 228, "tokens_per_second_per_gpu": 316.93 }, { "epoch": 0.5252293577981652, "grad_norm": 0.028260482475161552, "learning_rate": 5.018925211760227e-05, "loss": 0.0634, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 229, "tokens_per_second_per_gpu": 417.85 }, { "epoch": 0.5275229357798165, "grad_norm": 0.025130394846200943, "learning_rate": 4.981074788239773e-05, "loss": 0.0588, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 230, "tokens_per_second_per_gpu": 413.46 }, { "epoch": 0.5298165137614679, "grad_norm": 0.025551561266183853, "learning_rate": 4.943225449250958e-05, "loss": 0.0688, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 231, "tokens_per_second_per_gpu": 445.27 }, { "epoch": 0.5321100917431193, "grad_norm": 0.028664810582995415, "learning_rate": 4.9053793637949067e-05, "loss": 0.0689, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 232, "tokens_per_second_per_gpu": 395.88 }, { "epoch": 0.5344036697247706, "grad_norm": 0.02686873823404312, "learning_rate": 4.8675387006862914e-05, "loss": 0.0656, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 233, "tokens_per_second_per_gpu": 544.1 }, { "epoch": 0.536697247706422, "grad_norm": 0.03144492581486702, "learning_rate": 4.829705628429061e-05, "loss": 0.0795, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 234, "tokens_per_second_per_gpu": 356.41 }, { "epoch": 0.5389908256880734, "grad_norm": 0.02188139036297798, "learning_rate": 4.7918823150921555e-05, "loss": 0.0611, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 235, "tokens_per_second_per_gpu": 368.54 }, { "epoch": 0.5412844036697247, "grad_norm": 0.02784140035510063, "learning_rate": 4.754070928185266e-05, "loss": 0.0604, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 236, "tokens_per_second_per_gpu": 445.84 }, { "epoch": 0.5435779816513762, "grad_norm": 0.02372545376420021, "learning_rate": 4.7162736345346303e-05, "loss": 0.0604, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 237, "tokens_per_second_per_gpu": 467.85 }, { "epoch": 0.5458715596330275, "grad_norm": 0.03274843469262123, "learning_rate": 4.6784926001588544e-05, "loss": 0.0817, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 238, "tokens_per_second_per_gpu": 438.54 }, { "epoch": 0.5481651376146789, "grad_norm": 0.02551015093922615, "learning_rate": 4.640729990144784e-05, "loss": 0.0631, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 239, "tokens_per_second_per_gpu": 486.44 }, { "epoch": 0.5504587155963303, "grad_norm": 0.04315930977463722, "learning_rate": 4.6029879685234395e-05, "loss": 0.0661, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 240, "tokens_per_second_per_gpu": 450.6 }, { "epoch": 0.5527522935779816, "grad_norm": 0.024066558107733727, "learning_rate": 4.565268698145997e-05, "loss": 0.0612, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 241, "tokens_per_second_per_gpu": 462.93 }, { "epoch": 0.555045871559633, "grad_norm": 0.026846949011087418, "learning_rate": 4.527574340559844e-05, "loss": 0.0754, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 242, "tokens_per_second_per_gpu": 392.01 }, { "epoch": 0.5573394495412844, "grad_norm": 0.02346811629831791, "learning_rate": 4.4899070558847154e-05, "loss": 0.0675, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 243, "tokens_per_second_per_gpu": 468.19 }, { "epoch": 0.5596330275229358, "grad_norm": 0.02288683131337166, "learning_rate": 4.452269002688897e-05, "loss": 0.064, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 244, "tokens_per_second_per_gpu": 306.21 }, { "epoch": 0.5619266055045872, "grad_norm": 0.0288680586963892, "learning_rate": 4.4146623378655296e-05, "loss": 0.0677, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.73, "memory/max_allocated (GiB)": 48.73, "step": 245, "tokens_per_second_per_gpu": 325.4 }, { "epoch": 0.5642201834862385, "grad_norm": 0.02450747601687908, "learning_rate": 4.3770892165090126e-05, "loss": 0.0638, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 246, "tokens_per_second_per_gpu": 401.5 }, { "epoch": 0.5665137614678899, "grad_norm": 0.028074199333786964, "learning_rate": 4.3395517917914895e-05, "loss": 0.0615, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 247, "tokens_per_second_per_gpu": 537.03 }, { "epoch": 0.5688073394495413, "grad_norm": 0.02514073997735977, "learning_rate": 4.3020522148394676e-05, "loss": 0.0669, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 248, "tokens_per_second_per_gpu": 409.93 }, { "epoch": 0.5711009174311926, "grad_norm": 0.029449012130498886, "learning_rate": 4.2645926346105484e-05, "loss": 0.0711, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 249, "tokens_per_second_per_gpu": 344.6 }, { "epoch": 0.573394495412844, "grad_norm": 0.024152036756277084, "learning_rate": 4.22717519777027e-05, "loss": 0.0652, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 250, "tokens_per_second_per_gpu": 417.48 }, { "epoch": 0.5756880733944955, "grad_norm": 0.02781221643090248, "learning_rate": 4.189802048569089e-05, "loss": 0.0598, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 251, "tokens_per_second_per_gpu": 477.01 }, { "epoch": 0.5779816513761468, "grad_norm": 0.02137266844511032, "learning_rate": 4.1524753287195165e-05, "loss": 0.0584, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 252, "tokens_per_second_per_gpu": 475.28 }, { "epoch": 0.5802752293577982, "grad_norm": 0.03145367652177811, "learning_rate": 4.1151971772733686e-05, "loss": 0.0742, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 253, "tokens_per_second_per_gpu": 416.81 }, { "epoch": 0.5825688073394495, "grad_norm": 0.026259735226631165, "learning_rate": 4.07796973049919e-05, "loss": 0.0704, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 254, "tokens_per_second_per_gpu": 432.14 }, { "epoch": 0.5848623853211009, "grad_norm": 0.029704980552196503, "learning_rate": 4.04079512175984e-05, "loss": 0.0751, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 255, "tokens_per_second_per_gpu": 368.81 }, { "epoch": 0.5871559633027523, "grad_norm": 0.037060242146253586, "learning_rate": 4.003675481390228e-05, "loss": 0.081, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 256, "tokens_per_second_per_gpu": 400.19 }, { "epoch": 0.5894495412844036, "grad_norm": 0.027513017877936363, "learning_rate": 3.966612936575235e-05, "loss": 0.0597, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 257, "tokens_per_second_per_gpu": 381.21 }, { "epoch": 0.591743119266055, "grad_norm": 0.037167515605688095, "learning_rate": 3.929609611227817e-05, "loss": 0.0639, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 258, "tokens_per_second_per_gpu": 357.36 }, { "epoch": 0.5940366972477065, "grad_norm": 0.0229306872934103, "learning_rate": 3.8926676258672866e-05, "loss": 0.0626, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 259, "tokens_per_second_per_gpu": 387.68 }, { "epoch": 0.5963302752293578, "grad_norm": 0.027137834578752518, "learning_rate": 3.855789097497794e-05, "loss": 0.0711, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 260, "tokens_per_second_per_gpu": 377.29 }, { "epoch": 0.5986238532110092, "grad_norm": 0.027339540421962738, "learning_rate": 3.818976139487017e-05, "loss": 0.0644, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 261, "tokens_per_second_per_gpu": 476.61 }, { "epoch": 0.6009174311926605, "grad_norm": 0.02739766612648964, "learning_rate": 3.7822308614450406e-05, "loss": 0.0711, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 262, "tokens_per_second_per_gpu": 426.96 }, { "epoch": 0.6032110091743119, "grad_norm": 0.02805398218333721, "learning_rate": 3.745555369103471e-05, "loss": 0.0669, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 263, "tokens_per_second_per_gpu": 363.5 }, { "epoch": 0.6055045871559633, "grad_norm": 0.03466130048036575, "learning_rate": 3.708951764194767e-05, "loss": 0.0771, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 264, "tokens_per_second_per_gpu": 383.75 }, { "epoch": 0.6077981651376146, "grad_norm": 0.02684733085334301, "learning_rate": 3.6724221443317855e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 265, "tokens_per_second_per_gpu": 545.2 }, { "epoch": 0.6100917431192661, "grad_norm": 0.025042880326509476, "learning_rate": 3.635968602887585e-05, "loss": 0.0706, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 266, "tokens_per_second_per_gpu": 420.58 }, { "epoch": 0.6123853211009175, "grad_norm": 0.02610246278345585, "learning_rate": 3.599593228875465e-05, "loss": 0.0749, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 267, "tokens_per_second_per_gpu": 443.84 }, { "epoch": 0.6146788990825688, "grad_norm": 0.02343624271452427, "learning_rate": 3.563298106829244e-05, "loss": 0.0676, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 268, "tokens_per_second_per_gpu": 409.89 }, { "epoch": 0.6169724770642202, "grad_norm": 0.02438695915043354, "learning_rate": 3.527085316683805e-05, "loss": 0.0648, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 269, "tokens_per_second_per_gpu": 475.38 }, { "epoch": 0.6192660550458715, "grad_norm": 0.02070113644003868, "learning_rate": 3.490956933655909e-05, "loss": 0.0605, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 270, "tokens_per_second_per_gpu": 341.3 }, { "epoch": 0.6215596330275229, "grad_norm": 0.03797437623143196, "learning_rate": 3.4549150281252636e-05, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 271, "tokens_per_second_per_gpu": 391.97 }, { "epoch": 0.6238532110091743, "grad_norm": 0.02536945417523384, "learning_rate": 3.41896166551588e-05, "loss": 0.0649, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 272, "tokens_per_second_per_gpu": 461.4 }, { "epoch": 0.6261467889908257, "grad_norm": 0.032918062061071396, "learning_rate": 3.383098906177719e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 273, "tokens_per_second_per_gpu": 495.12 }, { "epoch": 0.6284403669724771, "grad_norm": 0.03230955824255943, "learning_rate": 3.347328805268605e-05, "loss": 0.0687, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 274, "tokens_per_second_per_gpu": 355.59 }, { "epoch": 0.6307339449541285, "grad_norm": 0.045344047248363495, "learning_rate": 3.3116534126364685e-05, "loss": 0.0748, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 275, "tokens_per_second_per_gpu": 339.05 }, { "epoch": 0.6330275229357798, "grad_norm": 0.021811284124851227, "learning_rate": 3.2760747727018694e-05, "loss": 0.0646, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 276, "tokens_per_second_per_gpu": 334.53 }, { "epoch": 0.6353211009174312, "grad_norm": 0.02648971416056156, "learning_rate": 3.240594924340835e-05, "loss": 0.068, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 277, "tokens_per_second_per_gpu": 375.76 }, { "epoch": 0.6376146788990825, "grad_norm": 0.022893795743584633, "learning_rate": 3.205215900768029e-05, "loss": 0.0627, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 278, "tokens_per_second_per_gpu": 412.8 }, { "epoch": 0.6399082568807339, "grad_norm": 0.027191977947950363, "learning_rate": 3.169939729420233e-05, "loss": 0.0632, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 279, "tokens_per_second_per_gpu": 408.91 }, { "epoch": 0.6422018348623854, "grad_norm": 0.023182721808552742, "learning_rate": 3.1347684318401536e-05, "loss": 0.0631, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 280, "tokens_per_second_per_gpu": 434.99 }, { "epoch": 0.6444954128440367, "grad_norm": 0.03368153050541878, "learning_rate": 3.099704023560587e-05, "loss": 0.0762, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 281, "tokens_per_second_per_gpu": 393.93 }, { "epoch": 0.6467889908256881, "grad_norm": 0.023287048563361168, "learning_rate": 3.0647485139889145e-05, "loss": 0.0629, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 282, "tokens_per_second_per_gpu": 335.84 }, { "epoch": 0.6490825688073395, "grad_norm": 0.027626749128103256, "learning_rate": 3.0299039062919416e-05, "loss": 0.0631, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 283, "tokens_per_second_per_gpu": 446.72 }, { "epoch": 0.6513761467889908, "grad_norm": 0.02671007066965103, "learning_rate": 2.995172197281113e-05, "loss": 0.0684, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 284, "tokens_per_second_per_gpu": 419.33 }, { "epoch": 0.6536697247706422, "grad_norm": 0.026775743812322617, "learning_rate": 2.96055537729808e-05, "loss": 0.063, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 285, "tokens_per_second_per_gpu": 456.33 }, { "epoch": 0.6559633027522935, "grad_norm": 0.024690093472599983, "learning_rate": 2.926055430100647e-05, "loss": 0.0601, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 286, "tokens_per_second_per_gpu": 363.46 }, { "epoch": 0.658256880733945, "grad_norm": 0.021927161142230034, "learning_rate": 2.8916743327490803e-05, "loss": 0.0598, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 287, "tokens_per_second_per_gpu": 395.18 }, { "epoch": 0.6605504587155964, "grad_norm": 0.029110578820109367, "learning_rate": 2.8574140554928175e-05, "loss": 0.0732, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 288, "tokens_per_second_per_gpu": 395.82 }, { "epoch": 0.6628440366972477, "grad_norm": 0.025474051013588905, "learning_rate": 2.8232765616575563e-05, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 289, "tokens_per_second_per_gpu": 435.52 }, { "epoch": 0.6651376146788991, "grad_norm": 0.02178235538303852, "learning_rate": 2.789263807532746e-05, "loss": 0.0616, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 290, "tokens_per_second_per_gpu": 442.39 }, { "epoch": 0.6674311926605505, "grad_norm": 0.023412682116031647, "learning_rate": 2.7553777422594774e-05, "loss": 0.0673, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 291, "tokens_per_second_per_gpu": 412.59 }, { "epoch": 0.6697247706422018, "grad_norm": 0.023469222709536552, "learning_rate": 2.721620307718793e-05, "loss": 0.0682, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 292, "tokens_per_second_per_gpu": 276.54 }, { "epoch": 0.6720183486238532, "grad_norm": 0.03131282329559326, "learning_rate": 2.687993438420392e-05, "loss": 0.0647, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 293, "tokens_per_second_per_gpu": 392.4 }, { "epoch": 0.6743119266055045, "grad_norm": 0.02991569973528385, "learning_rate": 2.65449906139178e-05, "loss": 0.0681, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 294, "tokens_per_second_per_gpu": 377.34 }, { "epoch": 0.676605504587156, "grad_norm": 0.02651585452258587, "learning_rate": 2.6211390960678413e-05, "loss": 0.0802, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 295, "tokens_per_second_per_gpu": 358.61 }, { "epoch": 0.6788990825688074, "grad_norm": 0.022964881733059883, "learning_rate": 2.5879154541808337e-05, "loss": 0.0643, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 296, "tokens_per_second_per_gpu": 484.52 }, { "epoch": 0.6811926605504587, "grad_norm": 0.028967639431357384, "learning_rate": 2.554830039650834e-05, "loss": 0.0632, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 297, "tokens_per_second_per_gpu": 440.4 }, { "epoch": 0.6834862385321101, "grad_norm": 0.02948296256363392, "learning_rate": 2.5218847484766495e-05, "loss": 0.0752, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 298, "tokens_per_second_per_gpu": 288.6 }, { "epoch": 0.6857798165137615, "grad_norm": 0.03220253810286522, "learning_rate": 2.4890814686271448e-05, "loss": 0.0634, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 299, "tokens_per_second_per_gpu": 447.81 }, { "epoch": 0.6880733944954128, "grad_norm": 0.028979238122701645, "learning_rate": 2.456422079933056e-05, "loss": 0.0689, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 300, "tokens_per_second_per_gpu": 458.5 }, { "epoch": 0.6903669724770642, "grad_norm": 0.024549167603254318, "learning_rate": 2.4239084539792745e-05, "loss": 0.0593, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 301, "tokens_per_second_per_gpu": 419.65 }, { "epoch": 0.6926605504587156, "grad_norm": 0.02671237848699093, "learning_rate": 2.391542453997578e-05, "loss": 0.0657, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 302, "tokens_per_second_per_gpu": 368.27 }, { "epoch": 0.694954128440367, "grad_norm": 0.03672722727060318, "learning_rate": 2.3593259347598657e-05, "loss": 0.0535, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 303, "tokens_per_second_per_gpu": 474.85 }, { "epoch": 0.6972477064220184, "grad_norm": 0.03666655346751213, "learning_rate": 2.3272607424718675e-05, "loss": 0.0646, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 304, "tokens_per_second_per_gpu": 393.88 }, { "epoch": 0.6995412844036697, "grad_norm": 0.025117024779319763, "learning_rate": 2.29534871466734e-05, "loss": 0.0699, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 305, "tokens_per_second_per_gpu": 449.5 }, { "epoch": 0.7018348623853211, "grad_norm": 0.035403817892074585, "learning_rate": 2.2635916801027706e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 306, "tokens_per_second_per_gpu": 420.33 }, { "epoch": 0.7041284403669725, "grad_norm": 0.026707297191023827, "learning_rate": 2.2319914586525777e-05, "loss": 0.0633, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 307, "tokens_per_second_per_gpu": 451.77 }, { "epoch": 0.7064220183486238, "grad_norm": 0.02504413016140461, "learning_rate": 2.2005498612048155e-05, "loss": 0.0597, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 308, "tokens_per_second_per_gpu": 357.06 }, { "epoch": 0.7087155963302753, "grad_norm": 0.02307130955159664, "learning_rate": 2.1692686895574005e-05, "loss": 0.064, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 309, "tokens_per_second_per_gpu": 474.84 }, { "epoch": 0.7110091743119266, "grad_norm": 0.026173440739512444, "learning_rate": 2.1381497363148673e-05, "loss": 0.063, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 310, "tokens_per_second_per_gpu": 403.88 }, { "epoch": 0.713302752293578, "grad_norm": 0.027350088581442833, "learning_rate": 2.1071947847856222e-05, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 311, "tokens_per_second_per_gpu": 409.62 }, { "epoch": 0.7155963302752294, "grad_norm": 0.02530243620276451, "learning_rate": 2.0764056088797645e-05, "loss": 0.063, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 312, "tokens_per_second_per_gpu": 385.83 }, { "epoch": 0.7178899082568807, "grad_norm": 0.028018414974212646, "learning_rate": 2.045783973007429e-05, "loss": 0.0634, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 313, "tokens_per_second_per_gpu": 395.65 }, { "epoch": 0.7201834862385321, "grad_norm": 0.02613895572721958, "learning_rate": 2.0153316319776662e-05, "loss": 0.0653, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 314, "tokens_per_second_per_gpu": 357.1 }, { "epoch": 0.7224770642201835, "grad_norm": 0.026048416271805763, "learning_rate": 1.985050330897883e-05, "loss": 0.0644, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 315, "tokens_per_second_per_gpu": 395.0 }, { "epoch": 0.7247706422018348, "grad_norm": 0.030031291767954826, "learning_rate": 1.954941805073848e-05, "loss": 0.078, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 316, "tokens_per_second_per_gpu": 372.79 }, { "epoch": 0.7270642201834863, "grad_norm": 0.029979195445775986, "learning_rate": 1.9250077799102322e-05, "loss": 0.0651, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 317, "tokens_per_second_per_gpu": 438.54 }, { "epoch": 0.7293577981651376, "grad_norm": 0.025628041476011276, "learning_rate": 1.8952499708117432e-05, "loss": 0.0669, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 318, "tokens_per_second_per_gpu": 474.63 }, { "epoch": 0.731651376146789, "grad_norm": 0.024868648499250412, "learning_rate": 1.8656700830848174e-05, "loss": 0.0656, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 319, "tokens_per_second_per_gpu": 445.15 }, { "epoch": 0.7339449541284404, "grad_norm": 0.024810567498207092, "learning_rate": 1.8362698118398967e-05, "loss": 0.064, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 320, "tokens_per_second_per_gpu": 383.63 }, { "epoch": 0.7362385321100917, "grad_norm": 0.02743346616625786, "learning_rate": 1.8070508418942876e-05, "loss": 0.0758, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 321, "tokens_per_second_per_gpu": 386.17 }, { "epoch": 0.7385321100917431, "grad_norm": 0.028884073719382286, "learning_rate": 1.7780148476756147e-05, "loss": 0.0675, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 322, "tokens_per_second_per_gpu": 498.58 }, { "epoch": 0.7408256880733946, "grad_norm": 0.028301537036895752, "learning_rate": 1.7491634931258587e-05, "loss": 0.0734, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 323, "tokens_per_second_per_gpu": 392.47 }, { "epoch": 0.7431192660550459, "grad_norm": 0.02405114285647869, "learning_rate": 1.7204984316060063e-05, "loss": 0.0538, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 324, "tokens_per_second_per_gpu": 409.24 }, { "epoch": 0.7454128440366973, "grad_norm": 0.029399245977401733, "learning_rate": 1.6920213058013022e-05, "loss": 0.0693, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 325, "tokens_per_second_per_gpu": 461.77 }, { "epoch": 0.7477064220183486, "grad_norm": 0.02802177332341671, "learning_rate": 1.6637337476271124e-05, "loss": 0.0647, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 326, "tokens_per_second_per_gpu": 389.28 }, { "epoch": 0.75, "grad_norm": 0.024391207844018936, "learning_rate": 1.6356373781354058e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 327, "tokens_per_second_per_gpu": 376.51 }, { "epoch": 0.7522935779816514, "grad_norm": 0.02589585818350315, "learning_rate": 1.6077338074218596e-05, "loss": 0.0676, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 328, "tokens_per_second_per_gpu": 422.1 }, { "epoch": 0.7545871559633027, "grad_norm": 0.022877002134919167, "learning_rate": 1.580024634533587e-05, "loss": 0.0653, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 329, "tokens_per_second_per_gpu": 440.68 }, { "epoch": 0.7568807339449541, "grad_norm": 0.029319310560822487, "learning_rate": 1.5525114473775014e-05, "loss": 0.0871, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 330, "tokens_per_second_per_gpu": 435.63 }, { "epoch": 0.7591743119266054, "grad_norm": 0.03219328075647354, "learning_rate": 1.5251958226293306e-05, "loss": 0.0801, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 331, "tokens_per_second_per_gpu": 363.82 }, { "epoch": 0.7614678899082569, "grad_norm": 0.024657782167196274, "learning_rate": 1.4980793256432474e-05, "loss": 0.0622, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 332, "tokens_per_second_per_gpu": 342.94 }, { "epoch": 0.7637614678899083, "grad_norm": 0.03142733871936798, "learning_rate": 1.4711635103621719e-05, "loss": 0.0681, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 333, "tokens_per_second_per_gpu": 404.61 }, { "epoch": 0.7660550458715596, "grad_norm": 0.026000676676630974, "learning_rate": 1.4444499192287275e-05, "loss": 0.065, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 334, "tokens_per_second_per_gpu": 367.91 }, { "epoch": 0.768348623853211, "grad_norm": 0.03227536380290985, "learning_rate": 1.4179400830968415e-05, "loss": 0.0767, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 335, "tokens_per_second_per_gpu": 314.19 }, { "epoch": 0.7706422018348624, "grad_norm": 0.025221284478902817, "learning_rate": 1.3916355211440164e-05, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 336, "tokens_per_second_per_gpu": 362.08 }, { "epoch": 0.7729357798165137, "grad_norm": 0.030213654041290283, "learning_rate": 1.3655377407842812e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 337, "tokens_per_second_per_gpu": 466.08 }, { "epoch": 0.7752293577981652, "grad_norm": 0.026164716109633446, "learning_rate": 1.3396482375817975e-05, "loss": 0.0656, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 338, "tokens_per_second_per_gpu": 458.34 }, { "epoch": 0.7775229357798165, "grad_norm": 0.0265730619430542, "learning_rate": 1.3139684951651588e-05, "loss": 0.0636, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 339, "tokens_per_second_per_gpu": 399.93 }, { "epoch": 0.7798165137614679, "grad_norm": 0.026285763829946518, "learning_rate": 1.2884999851423673e-05, "loss": 0.0682, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 340, "tokens_per_second_per_gpu": 421.4 }, { "epoch": 0.7821100917431193, "grad_norm": 0.023802319541573524, "learning_rate": 1.2632441670165056e-05, "loss": 0.0641, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 341, "tokens_per_second_per_gpu": 439.55 }, { "epoch": 0.7844036697247706, "grad_norm": 0.024973031133413315, "learning_rate": 1.2382024881020937e-05, "loss": 0.0615, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 342, "tokens_per_second_per_gpu": 492.26 }, { "epoch": 0.786697247706422, "grad_norm": 0.029818380251526833, "learning_rate": 1.213376383442153e-05, "loss": 0.0746, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 343, "tokens_per_second_per_gpu": 394.15 }, { "epoch": 0.7889908256880734, "grad_norm": 0.028851691633462906, "learning_rate": 1.188767275725966e-05, "loss": 0.0744, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 344, "tokens_per_second_per_gpu": 422.33 }, { "epoch": 0.7912844036697247, "grad_norm": 0.03523954004049301, "learning_rate": 1.164376575207547e-05, "loss": 0.077, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 345, "tokens_per_second_per_gpu": 286.3 }, { "epoch": 0.7935779816513762, "grad_norm": 0.023627813905477524, "learning_rate": 1.140205679624834e-05, "loss": 0.0641, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 346, "tokens_per_second_per_gpu": 351.44 }, { "epoch": 0.7958715596330275, "grad_norm": 0.026164906099438667, "learning_rate": 1.1162559741195733e-05, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 347, "tokens_per_second_per_gpu": 389.38 }, { "epoch": 0.7981651376146789, "grad_norm": 0.023336883634328842, "learning_rate": 1.092528831157959e-05, "loss": 0.062, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 348, "tokens_per_second_per_gpu": 472.15 }, { "epoch": 0.8004587155963303, "grad_norm": 0.02306864783167839, "learning_rate": 1.0690256104519764e-05, "loss": 0.0629, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 349, "tokens_per_second_per_gpu": 422.0 }, { "epoch": 0.8027522935779816, "grad_norm": 0.026163572445511818, "learning_rate": 1.0457476588814774e-05, "loss": 0.0667, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 350, "tokens_per_second_per_gpu": 389.59 }, { "epoch": 0.805045871559633, "grad_norm": 0.024867909029126167, "learning_rate": 1.0226963104170002e-05, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 351, "tokens_per_second_per_gpu": 429.91 }, { "epoch": 0.8073394495412844, "grad_norm": 0.023188138380646706, "learning_rate": 9.998728860433276e-06, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 352, "tokens_per_second_per_gpu": 388.46 }, { "epoch": 0.8096330275229358, "grad_norm": 0.03035775013267994, "learning_rate": 9.772786936837785e-06, "loss": 0.0707, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 353, "tokens_per_second_per_gpu": 397.77 }, { "epoch": 0.8119266055045872, "grad_norm": 0.04821021109819412, "learning_rate": 9.549150281252633e-06, "loss": 0.0646, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 354, "tokens_per_second_per_gpu": 445.4 }, { "epoch": 0.8142201834862385, "grad_norm": 0.030557144433259964, "learning_rate": 9.327831709440792e-06, "loss": 0.0659, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 355, "tokens_per_second_per_gpu": 382.05 }, { "epoch": 0.8165137614678899, "grad_norm": 0.02662436105310917, "learning_rate": 9.108843904324715e-06, "loss": 0.0626, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 356, "tokens_per_second_per_gpu": 412.62 }, { "epoch": 0.8188073394495413, "grad_norm": 0.027914568781852722, "learning_rate": 8.8921994152595e-06, "loss": 0.0681, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 357, "tokens_per_second_per_gpu": 297.66 }, { "epoch": 0.8211009174311926, "grad_norm": 0.027242561802268028, "learning_rate": 8.677910657313782e-06, "loss": 0.067, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 358, "tokens_per_second_per_gpu": 457.91 }, { "epoch": 0.823394495412844, "grad_norm": 0.030475802719593048, "learning_rate": 8.465989910558209e-06, "loss": 0.0689, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 359, "tokens_per_second_per_gpu": 368.2 }, { "epoch": 0.8256880733944955, "grad_norm": 0.028360676020383835, "learning_rate": 8.256449319361748e-06, "loss": 0.0687, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 360, "tokens_per_second_per_gpu": 384.64 }, { "epoch": 0.8279816513761468, "grad_norm": 0.031053343787789345, "learning_rate": 8.049300891695744e-06, "loss": 0.0754, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 361, "tokens_per_second_per_gpu": 320.92 }, { "epoch": 0.8302752293577982, "grad_norm": 0.030271202325820923, "learning_rate": 7.844556498445788e-06, "loss": 0.072, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 362, "tokens_per_second_per_gpu": 437.12 }, { "epoch": 0.8325688073394495, "grad_norm": 0.027202172204852104, "learning_rate": 7.642227872731417e-06, "loss": 0.0696, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 363, "tokens_per_second_per_gpu": 332.31 }, { "epoch": 0.8348623853211009, "grad_norm": 0.02677847445011139, "learning_rate": 7.4423266092337855e-06, "loss": 0.0703, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 364, "tokens_per_second_per_gpu": 364.39 }, { "epoch": 0.8371559633027523, "grad_norm": 0.0259072408080101, "learning_rate": 7.244864163531162e-06, "loss": 0.0678, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 365, "tokens_per_second_per_gpu": 367.02 }, { "epoch": 0.8394495412844036, "grad_norm": 0.02673807553946972, "learning_rate": 7.049851851442468e-06, "loss": 0.0661, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 366, "tokens_per_second_per_gpu": 475.35 }, { "epoch": 0.841743119266055, "grad_norm": 0.027974814176559448, "learning_rate": 6.857300848378856e-06, "loss": 0.0747, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 367, "tokens_per_second_per_gpu": 409.23 }, { "epoch": 0.8440366972477065, "grad_norm": 0.022259563207626343, "learning_rate": 6.667222188703226e-06, "loss": 0.064, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 368, "tokens_per_second_per_gpu": 440.59 }, { "epoch": 0.8463302752293578, "grad_norm": 0.02939799055457115, "learning_rate": 6.479626765097918e-06, "loss": 0.0693, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 369, "tokens_per_second_per_gpu": 455.83 }, { "epoch": 0.8486238532110092, "grad_norm": 0.029195845127105713, "learning_rate": 6.294525327940515e-06, "loss": 0.0711, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 370, "tokens_per_second_per_gpu": 394.89 }, { "epoch": 0.8509174311926605, "grad_norm": 0.0236493106931448, "learning_rate": 6.111928484687723e-06, "loss": 0.0643, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 371, "tokens_per_second_per_gpu": 408.68 }, { "epoch": 0.8532110091743119, "grad_norm": 0.02727104350924492, "learning_rate": 5.931846699267557e-06, "loss": 0.067, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 372, "tokens_per_second_per_gpu": 509.27 }, { "epoch": 0.8555045871559633, "grad_norm": 0.034410908818244934, "learning_rate": 5.7542902914796745e-06, "loss": 0.0624, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 373, "tokens_per_second_per_gpu": 556.2 }, { "epoch": 0.8577981651376146, "grad_norm": 0.0287538543343544, "learning_rate": 5.579269436403967e-06, "loss": 0.0651, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 374, "tokens_per_second_per_gpu": 381.61 }, { "epoch": 0.8600917431192661, "grad_norm": 0.02870243228971958, "learning_rate": 5.4067941638174806e-06, "loss": 0.0731, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 375, "tokens_per_second_per_gpu": 361.13 }, { "epoch": 0.8623853211009175, "grad_norm": 0.026416806504130363, "learning_rate": 5.2368743576196536e-06, "loss": 0.064, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 376, "tokens_per_second_per_gpu": 326.97 }, { "epoch": 0.8646788990825688, "grad_norm": 0.023003704845905304, "learning_rate": 5.0695197552659e-06, "loss": 0.0625, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 377, "tokens_per_second_per_gpu": 438.76 }, { "epoch": 0.8669724770642202, "grad_norm": 0.037476420402526855, "learning_rate": 4.9047399472095746e-06, "loss": 0.0697, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 378, "tokens_per_second_per_gpu": 345.81 }, { "epoch": 0.8692660550458715, "grad_norm": 0.02971925400197506, "learning_rate": 4.742544376352443e-06, "loss": 0.0663, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 379, "tokens_per_second_per_gpu": 436.62 }, { "epoch": 0.8715596330275229, "grad_norm": 0.023713113740086555, "learning_rate": 4.582942337503465e-06, "loss": 0.0602, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 380, "tokens_per_second_per_gpu": 448.46 }, { "epoch": 0.8738532110091743, "grad_norm": 0.02941006049513817, "learning_rate": 4.425942976846187e-06, "loss": 0.0725, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 381, "tokens_per_second_per_gpu": 329.17 }, { "epoch": 0.8761467889908257, "grad_norm": 0.028299743309617043, "learning_rate": 4.271555291414636e-06, "loss": 0.072, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 382, "tokens_per_second_per_gpu": 340.56 }, { "epoch": 0.8784403669724771, "grad_norm": 0.03180241584777832, "learning_rate": 4.119788128577667e-06, "loss": 0.0766, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 383, "tokens_per_second_per_gpu": 446.49 }, { "epoch": 0.8807339449541285, "grad_norm": 0.026926379650831223, "learning_rate": 3.9706501855319765e-06, "loss": 0.0683, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 384, "tokens_per_second_per_gpu": 440.01 }, { "epoch": 0.8830275229357798, "grad_norm": 0.03347824513912201, "learning_rate": 3.824150008803767e-06, "loss": 0.0751, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 385, "tokens_per_second_per_gpu": 343.0 }, { "epoch": 0.8853211009174312, "grad_norm": 0.030953101813793182, "learning_rate": 3.680295993758881e-06, "loss": 0.0689, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 386, "tokens_per_second_per_gpu": 393.96 }, { "epoch": 0.8876146788990825, "grad_norm": 0.032475098967552185, "learning_rate": 3.539096384121743e-06, "loss": 0.0828, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 387, "tokens_per_second_per_gpu": 378.9 }, { "epoch": 0.8899082568807339, "grad_norm": 0.02490062825381756, "learning_rate": 3.40055927150294e-06, "loss": 0.0623, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 388, "tokens_per_second_per_gpu": 408.26 }, { "epoch": 0.8922018348623854, "grad_norm": 0.02600006014108658, "learning_rate": 3.2646925949355312e-06, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 389, "tokens_per_second_per_gpu": 446.25 }, { "epoch": 0.8944954128440367, "grad_norm": 0.024244820699095726, "learning_rate": 3.1315041404200663e-06, "loss": 0.0655, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 390, "tokens_per_second_per_gpu": 420.39 }, { "epoch": 0.8967889908256881, "grad_norm": 0.0253219585865736, "learning_rate": 3.00100154047841e-06, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 391, "tokens_per_second_per_gpu": 463.53 }, { "epoch": 0.8990825688073395, "grad_norm": 0.027757421135902405, "learning_rate": 2.8731922737163685e-06, "loss": 0.0681, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 392, "tokens_per_second_per_gpu": 472.68 }, { "epoch": 0.9013761467889908, "grad_norm": 0.02381259575486183, "learning_rate": 2.7480836643950956e-06, "loss": 0.0596, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 393, "tokens_per_second_per_gpu": 452.37 }, { "epoch": 0.9036697247706422, "grad_norm": 0.024906722828745842, "learning_rate": 2.6256828820113766e-06, "loss": 0.0669, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 394, "tokens_per_second_per_gpu": 327.71 }, { "epoch": 0.9059633027522935, "grad_norm": 0.025515113025903702, "learning_rate": 2.5059969408867843e-06, "loss": 0.0636, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 395, "tokens_per_second_per_gpu": 409.3 }, { "epoch": 0.908256880733945, "grad_norm": 0.026188403367996216, "learning_rate": 2.3890326997656975e-06, "loss": 0.0688, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 396, "tokens_per_second_per_gpu": 371.81 }, { "epoch": 0.9105504587155964, "grad_norm": 0.027840575203299522, "learning_rate": 2.274796861422246e-06, "loss": 0.0737, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 397, "tokens_per_second_per_gpu": 447.44 }, { "epoch": 0.9128440366972477, "grad_norm": 0.0268483255058527, "learning_rate": 2.163295972276219e-06, "loss": 0.0583, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 398, "tokens_per_second_per_gpu": 383.02 }, { "epoch": 0.9151376146788991, "grad_norm": 0.027824856340885162, "learning_rate": 2.054536422017922e-06, "loss": 0.0767, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 399, "tokens_per_second_per_gpu": 331.96 }, { "epoch": 0.9174311926605505, "grad_norm": 0.024313461035490036, "learning_rate": 1.9485244432419667e-06, "loss": 0.0694, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 400, "tokens_per_second_per_gpu": 371.17 }, { "epoch": 0.9197247706422018, "grad_norm": 0.02038564346730709, "learning_rate": 1.8452661110901715e-06, "loss": 0.0563, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 401, "tokens_per_second_per_gpu": 474.9 }, { "epoch": 0.9220183486238532, "grad_norm": 0.030249858275055885, "learning_rate": 1.7447673429033362e-06, "loss": 0.0685, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 402, "tokens_per_second_per_gpu": 324.48 }, { "epoch": 0.9243119266055045, "grad_norm": 0.027523530647158623, "learning_rate": 1.6470338978822108e-06, "loss": 0.0666, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 403, "tokens_per_second_per_gpu": 405.82 }, { "epoch": 0.926605504587156, "grad_norm": 0.026385333389043808, "learning_rate": 1.5520713767574246e-06, "loss": 0.0768, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 404, "tokens_per_second_per_gpu": 380.7 }, { "epoch": 0.9288990825688074, "grad_norm": 0.02548050880432129, "learning_rate": 1.4598852214685488e-06, "loss": 0.0649, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 405, "tokens_per_second_per_gpu": 421.77 }, { "epoch": 0.9311926605504587, "grad_norm": 0.0276033915579319, "learning_rate": 1.3704807148521903e-06, "loss": 0.0722, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 406, "tokens_per_second_per_gpu": 391.2 }, { "epoch": 0.9334862385321101, "grad_norm": 0.025824090465903282, "learning_rate": 1.2838629803393342e-06, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 407, "tokens_per_second_per_gpu": 363.71 }, { "epoch": 0.9357798165137615, "grad_norm": 0.032180044800043106, "learning_rate": 1.2000369816616674e-06, "loss": 0.0677, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 408, "tokens_per_second_per_gpu": 490.46 }, { "epoch": 0.9380733944954128, "grad_norm": 0.03195993974804878, "learning_rate": 1.119007522567167e-06, "loss": 0.08, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 409, "tokens_per_second_per_gpu": 443.35 }, { "epoch": 0.9403669724770642, "grad_norm": 0.024462653324007988, "learning_rate": 1.0407792465447986e-06, "loss": 0.0589, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 410, "tokens_per_second_per_gpu": 511.33 }, { "epoch": 0.9426605504587156, "grad_norm": 0.02783488854765892, "learning_rate": 9.653566365584176e-07, "loss": 0.0705, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 411, "tokens_per_second_per_gpu": 407.62 }, { "epoch": 0.944954128440367, "grad_norm": 0.03449428081512451, "learning_rate": 8.927440147898702e-07, "loss": 0.0801, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 412, "tokens_per_second_per_gpu": 306.83 }, { "epoch": 0.9472477064220184, "grad_norm": 0.027761735022068024, "learning_rate": 8.229455423913013e-07, "loss": 0.0749, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 413, "tokens_per_second_per_gpu": 327.64 }, { "epoch": 0.9495412844036697, "grad_norm": 0.029755057767033577, "learning_rate": 7.559652192467126e-07, "loss": 0.0778, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 414, "tokens_per_second_per_gpu": 384.79 }, { "epoch": 0.9518348623853211, "grad_norm": 0.028378870338201523, "learning_rate": 6.918068837427128e-07, "loss": 0.0672, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 415, "tokens_per_second_per_gpu": 406.2 }, { "epoch": 0.9541284403669725, "grad_norm": 0.02773345075547695, "learning_rate": 6.304742125485874e-07, "loss": 0.06, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 416, "tokens_per_second_per_gpu": 387.18 }, { "epoch": 0.9564220183486238, "grad_norm": 0.0268245879560709, "learning_rate": 5.719707204055735e-07, "loss": 0.0621, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 417, "tokens_per_second_per_gpu": 411.81 }, { "epoch": 0.9587155963302753, "grad_norm": 0.033236313611269, "learning_rate": 5.162997599254704e-07, "loss": 0.0578, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 418, "tokens_per_second_per_gpu": 471.32 }, { "epoch": 0.9610091743119266, "grad_norm": 0.022961758077144623, "learning_rate": 4.634645213984934e-07, "loss": 0.0643, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 419, "tokens_per_second_per_gpu": 436.57 }, { "epoch": 0.963302752293578, "grad_norm": 0.028307458385825157, "learning_rate": 4.134680326104645e-07, "loss": 0.0691, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 420, "tokens_per_second_per_gpu": 492.18 }, { "epoch": 0.9655963302752294, "grad_norm": 0.026976363733410835, "learning_rate": 3.663131586692792e-07, "loss": 0.0655, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 421, "tokens_per_second_per_gpu": 327.53 }, { "epoch": 0.9678899082568807, "grad_norm": 0.024504244327545166, "learning_rate": 3.2200260184075406e-07, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 422, "tokens_per_second_per_gpu": 419.83 }, { "epoch": 0.9701834862385321, "grad_norm": 0.023533035069704056, "learning_rate": 2.805389013937454e-07, "loss": 0.0556, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 423, "tokens_per_second_per_gpu": 401.57 }, { "epoch": 0.9724770642201835, "grad_norm": 0.022774042561650276, "learning_rate": 2.419244334546267e-07, "loss": 0.0581, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.73, "memory/max_allocated (GiB)": 48.73, "step": 424, "tokens_per_second_per_gpu": 329.45 }, { "epoch": 0.9747706422018348, "grad_norm": 0.03273961320519447, "learning_rate": 2.061614108711474e-07, "loss": 0.0824, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 425, "tokens_per_second_per_gpu": 373.34 }, { "epoch": 0.9770642201834863, "grad_norm": 0.02143704518675804, "learning_rate": 1.732518830856067e-07, "loss": 0.0588, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 426, "tokens_per_second_per_gpu": 431.87 }, { "epoch": 0.9793577981651376, "grad_norm": 0.026173925027251244, "learning_rate": 1.431977360173975e-07, "loss": 0.0678, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 427, "tokens_per_second_per_gpu": 439.15 }, { "epoch": 0.981651376146789, "grad_norm": 0.026415711268782616, "learning_rate": 1.16000691954965e-07, "loss": 0.067, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 428, "tokens_per_second_per_gpu": 480.36 }, { "epoch": 0.9839449541284404, "grad_norm": 0.025120330974459648, "learning_rate": 9.1662309457069e-08, "loss": 0.0644, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 429, "tokens_per_second_per_gpu": 414.88 }, { "epoch": 0.9862385321100917, "grad_norm": 0.023429665714502335, "learning_rate": 7.018398326350539e-08, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 430, "tokens_per_second_per_gpu": 330.04 }, { "epoch": 0.9885321100917431, "grad_norm": 0.03130911663174629, "learning_rate": 5.15669442151423e-08, "loss": 0.0723, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 431, "tokens_per_second_per_gpu": 269.97 }, { "epoch": 0.9908256880733946, "grad_norm": 0.026494460180401802, "learning_rate": 3.581225918342646e-08, "loss": 0.0685, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 432, "tokens_per_second_per_gpu": 387.46 }, { "epoch": 0.9931192660550459, "grad_norm": 0.032140735536813736, "learning_rate": 2.292083100920994e-08, "loss": 0.0631, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 433, "tokens_per_second_per_gpu": 427.4 }, { "epoch": 0.9954128440366973, "grad_norm": 0.025799578055739403, "learning_rate": 1.2893398451024886e-08, "loss": 0.0695, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 434, "tokens_per_second_per_gpu": 461.05 }, { "epoch": 0.9977064220183486, "grad_norm": 0.031855881214141846, "learning_rate": 5.730536142745102e-09, "loss": 0.0818, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 435, "tokens_per_second_per_gpu": 415.34 }, { "epoch": 1.0, "grad_norm": 0.026343608275055885, "learning_rate": 1.432654560679092e-09, "loss": 0.0674, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 436, "tokens_per_second_per_gpu": 342.89 } ], "logging_steps": 1, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.337198826144924e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }