{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.27522935779816515, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022935779816513763, "grad_norm": 0.12869106233119965, "learning_rate": 0.0, "loss": 0.1978, "memory/device_reserved (GiB)": 50.77, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 1, "tokens_per_second_per_gpu": 354.96 }, { "epoch": 0.0045871559633027525, "grad_norm": 0.15667210519313812, "learning_rate": 4.7619047619047615e-06, "loss": 0.2353, "memory/device_reserved (GiB)": 50.77, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 2, "tokens_per_second_per_gpu": 406.37 }, { "epoch": 0.006880733944954129, "grad_norm": 0.2217973917722702, "learning_rate": 9.523809523809523e-06, "loss": 0.2243, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 3, "tokens_per_second_per_gpu": 371.18 }, { "epoch": 0.009174311926605505, "grad_norm": 0.15948686003684998, "learning_rate": 1.4285714285714285e-05, "loss": 0.2392, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 4, "tokens_per_second_per_gpu": 414.48 }, { "epoch": 0.011467889908256881, "grad_norm": 0.153566375374794, "learning_rate": 1.9047619047619046e-05, "loss": 0.2182, "memory/device_reserved (GiB)": 50.87, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 5, "tokens_per_second_per_gpu": 369.22 }, { "epoch": 0.013761467889908258, "grad_norm": 0.1521972268819809, "learning_rate": 2.380952380952381e-05, "loss": 0.2112, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 6, "tokens_per_second_per_gpu": 429.31 }, { "epoch": 0.016055045871559634, "grad_norm": 0.168710395693779, "learning_rate": 2.857142857142857e-05, "loss": 0.226, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 7, "tokens_per_second_per_gpu": 417.78 }, { "epoch": 0.01834862385321101, "grad_norm": 0.13864850997924805, "learning_rate": 3.3333333333333335e-05, "loss": 0.1884, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 8, "tokens_per_second_per_gpu": 439.56 }, { "epoch": 0.020642201834862386, "grad_norm": 0.15227903425693512, "learning_rate": 3.809523809523809e-05, "loss": 0.1996, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 9, "tokens_per_second_per_gpu": 411.33 }, { "epoch": 0.022935779816513763, "grad_norm": 0.13421630859375, "learning_rate": 4.2857142857142856e-05, "loss": 0.1599, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 10, "tokens_per_second_per_gpu": 496.3 }, { "epoch": 0.02522935779816514, "grad_norm": 0.14955134689807892, "learning_rate": 4.761904761904762e-05, "loss": 0.1735, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 11, "tokens_per_second_per_gpu": 372.95 }, { "epoch": 0.027522935779816515, "grad_norm": 0.1432778388261795, "learning_rate": 5.2380952380952384e-05, "loss": 0.1515, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 12, "tokens_per_second_per_gpu": 398.65 }, { "epoch": 0.02981651376146789, "grad_norm": 0.14163611829280853, "learning_rate": 5.714285714285714e-05, "loss": 0.1517, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 13, "tokens_per_second_per_gpu": 440.5 }, { "epoch": 0.03211009174311927, "grad_norm": 0.15477906167507172, "learning_rate": 6.19047619047619e-05, "loss": 0.1444, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 14, "tokens_per_second_per_gpu": 385.32 }, { "epoch": 0.034403669724770644, "grad_norm": 0.1055532768368721, "learning_rate": 6.666666666666667e-05, "loss": 0.1292, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 15, "tokens_per_second_per_gpu": 453.02 }, { "epoch": 0.03669724770642202, "grad_norm": 0.10180933028459549, "learning_rate": 7.142857142857143e-05, "loss": 0.1208, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 16, "tokens_per_second_per_gpu": 474.27 }, { "epoch": 0.0389908256880734, "grad_norm": 0.07999677956104279, "learning_rate": 7.619047619047618e-05, "loss": 0.132, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 17, "tokens_per_second_per_gpu": 382.05 }, { "epoch": 0.04128440366972477, "grad_norm": 0.09194924682378769, "learning_rate": 8.095238095238096e-05, "loss": 0.1067, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 18, "tokens_per_second_per_gpu": 398.61 }, { "epoch": 0.04357798165137615, "grad_norm": 0.0931428000330925, "learning_rate": 8.571428571428571e-05, "loss": 0.1088, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 19, "tokens_per_second_per_gpu": 447.07 }, { "epoch": 0.045871559633027525, "grad_norm": 0.06202042102813721, "learning_rate": 9.047619047619048e-05, "loss": 0.0962, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 20, "tokens_per_second_per_gpu": 382.57 }, { "epoch": 0.0481651376146789, "grad_norm": 0.04220607504248619, "learning_rate": 9.523809523809524e-05, "loss": 0.0963, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 21, "tokens_per_second_per_gpu": 423.29 }, { "epoch": 0.05045871559633028, "grad_norm": 0.050066106021404266, "learning_rate": 0.0001, "loss": 0.1032, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 22, "tokens_per_second_per_gpu": 381.35 }, { "epoch": 0.052752293577981654, "grad_norm": 0.0557384118437767, "learning_rate": 9.999856734543933e-05, "loss": 0.1025, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 23, "tokens_per_second_per_gpu": 393.62 }, { "epoch": 0.05504587155963303, "grad_norm": 0.04612402245402336, "learning_rate": 9.999426946385727e-05, "loss": 0.0985, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 24, "tokens_per_second_per_gpu": 515.46 }, { "epoch": 0.05733944954128441, "grad_norm": 0.09721734374761581, "learning_rate": 9.998710660154898e-05, "loss": 0.1062, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 25, "tokens_per_second_per_gpu": 398.15 }, { "epoch": 0.05963302752293578, "grad_norm": 0.036745935678482056, "learning_rate": 9.997707916899079e-05, "loss": 0.1045, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 26, "tokens_per_second_per_gpu": 422.42 }, { "epoch": 0.06192660550458716, "grad_norm": 0.04298936203122139, "learning_rate": 9.996418774081658e-05, "loss": 0.0923, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 27, "tokens_per_second_per_gpu": 440.87 }, { "epoch": 0.06422018348623854, "grad_norm": 0.033536747097969055, "learning_rate": 9.994843305578486e-05, "loss": 0.096, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 28, "tokens_per_second_per_gpu": 370.28 }, { "epoch": 0.06651376146788991, "grad_norm": 0.03256046772003174, "learning_rate": 9.99298160167365e-05, "loss": 0.0832, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 29, "tokens_per_second_per_gpu": 357.19 }, { "epoch": 0.06880733944954129, "grad_norm": 0.042709868401288986, "learning_rate": 9.990833769054293e-05, "loss": 0.086, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 30, "tokens_per_second_per_gpu": 441.89 }, { "epoch": 0.07110091743119266, "grad_norm": 0.04347776621580124, "learning_rate": 9.988399930804504e-05, "loss": 0.1, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 31, "tokens_per_second_per_gpu": 348.66 }, { "epoch": 0.07339449541284404, "grad_norm": 0.030414681881666183, "learning_rate": 9.985680226398261e-05, "loss": 0.0811, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 32, "tokens_per_second_per_gpu": 435.28 }, { "epoch": 0.07568807339449542, "grad_norm": 0.034023743122816086, "learning_rate": 9.98267481169144e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.93, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 33, "tokens_per_second_per_gpu": 482.51 }, { "epoch": 0.0779816513761468, "grad_norm": 0.03136487305164337, "learning_rate": 9.979383858912885e-05, "loss": 0.0739, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.08, "memory/max_allocated (GiB)": 49.08, "step": 34, "tokens_per_second_per_gpu": 496.59 }, { "epoch": 0.08027522935779817, "grad_norm": 0.028108298778533936, "learning_rate": 9.975807556654537e-05, "loss": 0.077, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 35, "tokens_per_second_per_gpu": 349.1 }, { "epoch": 0.08256880733944955, "grad_norm": 0.028020795434713364, "learning_rate": 9.971946109860626e-05, "loss": 0.0775, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 36, "tokens_per_second_per_gpu": 351.02 }, { "epoch": 0.08486238532110092, "grad_norm": 0.028756650164723396, "learning_rate": 9.967799739815925e-05, "loss": 0.0788, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 37, "tokens_per_second_per_gpu": 534.52 }, { "epoch": 0.0871559633027523, "grad_norm": 0.02806459739804268, "learning_rate": 9.963368684133072e-05, "loss": 0.0809, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 38, "tokens_per_second_per_gpu": 367.94 }, { "epoch": 0.08944954128440367, "grad_norm": 0.02387731708586216, "learning_rate": 9.958653196738954e-05, "loss": 0.0642, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 39, "tokens_per_second_per_gpu": 466.74 }, { "epoch": 0.09174311926605505, "grad_norm": 0.027889851480722427, "learning_rate": 9.953653547860151e-05, "loss": 0.0904, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 40, "tokens_per_second_per_gpu": 371.51 }, { "epoch": 0.09403669724770643, "grad_norm": 0.031659577041864395, "learning_rate": 9.948370024007454e-05, "loss": 0.081, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 41, "tokens_per_second_per_gpu": 479.04 }, { "epoch": 0.0963302752293578, "grad_norm": 0.03186093270778656, "learning_rate": 9.942802927959443e-05, "loss": 0.0881, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 42, "tokens_per_second_per_gpu": 364.73 }, { "epoch": 0.09862385321100918, "grad_norm": 0.0313677079975605, "learning_rate": 9.936952578745142e-05, "loss": 0.0808, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 43, "tokens_per_second_per_gpu": 418.0 }, { "epoch": 0.10091743119266056, "grad_norm": 0.0264989472925663, "learning_rate": 9.93081931162573e-05, "loss": 0.0664, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 44, "tokens_per_second_per_gpu": 439.24 }, { "epoch": 0.10321100917431193, "grad_norm": 0.026272334158420563, "learning_rate": 9.92440347807533e-05, "loss": 0.0683, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 45, "tokens_per_second_per_gpu": 482.81 }, { "epoch": 0.10550458715596331, "grad_norm": 0.029066840186715126, "learning_rate": 9.91770544576087e-05, "loss": 0.0737, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 46, "tokens_per_second_per_gpu": 389.87 }, { "epoch": 0.10779816513761468, "grad_norm": 0.024542706087231636, "learning_rate": 9.910725598521013e-05, "loss": 0.0737, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 47, "tokens_per_second_per_gpu": 473.12 }, { "epoch": 0.11009174311926606, "grad_norm": 0.042941153049468994, "learning_rate": 9.90346433634416e-05, "loss": 0.0951, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 48, "tokens_per_second_per_gpu": 325.12 }, { "epoch": 0.11238532110091744, "grad_norm": 0.029044413939118385, "learning_rate": 9.89592207534552e-05, "loss": 0.0745, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.73, "memory/max_allocated (GiB)": 48.73, "step": 49, "tokens_per_second_per_gpu": 315.62 }, { "epoch": 0.11467889908256881, "grad_norm": 0.028920788317918777, "learning_rate": 9.888099247743283e-05, "loss": 0.0818, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 50, "tokens_per_second_per_gpu": 441.3 }, { "epoch": 0.11697247706422019, "grad_norm": 0.026095205917954445, "learning_rate": 9.879996301833833e-05, "loss": 0.0688, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 51, "tokens_per_second_per_gpu": 386.22 }, { "epoch": 0.11926605504587157, "grad_norm": 0.024823926389217377, "learning_rate": 9.871613701966067e-05, "loss": 0.0701, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 52, "tokens_per_second_per_gpu": 511.32 }, { "epoch": 0.12155963302752294, "grad_norm": 0.036093298345804214, "learning_rate": 9.862951928514782e-05, "loss": 0.0823, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 53, "tokens_per_second_per_gpu": 323.2 }, { "epoch": 0.12385321100917432, "grad_norm": 0.03257686272263527, "learning_rate": 9.854011477853146e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 54, "tokens_per_second_per_gpu": 447.62 }, { "epoch": 0.12614678899082568, "grad_norm": 0.03413158655166626, "learning_rate": 9.844792862324258e-05, "loss": 0.0728, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 55, "tokens_per_second_per_gpu": 451.05 }, { "epoch": 0.12844036697247707, "grad_norm": 0.02947932481765747, "learning_rate": 9.835296610211779e-05, "loss": 0.0713, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 56, "tokens_per_second_per_gpu": 457.44 }, { "epoch": 0.13073394495412843, "grad_norm": 0.0220651775598526, "learning_rate": 9.825523265709666e-05, "loss": 0.0607, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 57, "tokens_per_second_per_gpu": 456.49 }, { "epoch": 0.13302752293577982, "grad_norm": 0.026394842192530632, "learning_rate": 9.815473388890983e-05, "loss": 0.0716, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 58, "tokens_per_second_per_gpu": 393.95 }, { "epoch": 0.1353211009174312, "grad_norm": 0.027936838567256927, "learning_rate": 9.805147555675805e-05, "loss": 0.0738, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 59, "tokens_per_second_per_gpu": 464.83 }, { "epoch": 0.13761467889908258, "grad_norm": 0.023982539772987366, "learning_rate": 9.794546357798208e-05, "loss": 0.0608, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 60, "tokens_per_second_per_gpu": 450.66 }, { "epoch": 0.13990825688073394, "grad_norm": 0.027479754760861397, "learning_rate": 9.783670402772379e-05, "loss": 0.0672, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 61, "tokens_per_second_per_gpu": 455.94 }, { "epoch": 0.14220183486238533, "grad_norm": 0.02617599070072174, "learning_rate": 9.772520313857775e-05, "loss": 0.0804, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 62, "tokens_per_second_per_gpu": 394.85 }, { "epoch": 0.1444954128440367, "grad_norm": 0.030884992331266403, "learning_rate": 9.761096730023432e-05, "loss": 0.0768, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 63, "tokens_per_second_per_gpu": 446.63 }, { "epoch": 0.14678899082568808, "grad_norm": 0.027579287067055702, "learning_rate": 9.749400305911322e-05, "loss": 0.0659, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 64, "tokens_per_second_per_gpu": 484.34 }, { "epoch": 0.14908256880733944, "grad_norm": 0.030303625389933586, "learning_rate": 9.737431711798864e-05, "loss": 0.0645, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 65, "tokens_per_second_per_gpu": 437.07 }, { "epoch": 0.15137614678899083, "grad_norm": 0.027446158230304718, "learning_rate": 9.725191633560491e-05, "loss": 0.08, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 66, "tokens_per_second_per_gpu": 411.5 }, { "epoch": 0.1536697247706422, "grad_norm": 0.03177177160978317, "learning_rate": 9.712680772628364e-05, "loss": 0.0801, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 67, "tokens_per_second_per_gpu": 429.18 }, { "epoch": 0.1559633027522936, "grad_norm": 0.0288909412920475, "learning_rate": 9.69989984595216e-05, "loss": 0.0707, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 68, "tokens_per_second_per_gpu": 408.55 }, { "epoch": 0.15825688073394495, "grad_norm": 0.02751251310110092, "learning_rate": 9.686849585957994e-05, "loss": 0.0736, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 69, "tokens_per_second_per_gpu": 420.0 }, { "epoch": 0.16055045871559634, "grad_norm": 0.023428168147802353, "learning_rate": 9.673530740506447e-05, "loss": 0.0648, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 70, "tokens_per_second_per_gpu": 512.59 }, { "epoch": 0.1628440366972477, "grad_norm": 0.031534772366285324, "learning_rate": 9.659944072849707e-05, "loss": 0.0818, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 71, "tokens_per_second_per_gpu": 456.9 }, { "epoch": 0.1651376146788991, "grad_norm": 0.027208171784877777, "learning_rate": 9.646090361587827e-05, "loss": 0.0709, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 72, "tokens_per_second_per_gpu": 378.48 }, { "epoch": 0.16743119266055045, "grad_norm": 0.02961639314889908, "learning_rate": 9.631970400624113e-05, "loss": 0.0764, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 73, "tokens_per_second_per_gpu": 316.38 }, { "epoch": 0.16972477064220184, "grad_norm": 0.027367761358618736, "learning_rate": 9.617584999119625e-05, "loss": 0.0672, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 74, "tokens_per_second_per_gpu": 402.44 }, { "epoch": 0.1720183486238532, "grad_norm": 0.030167503282427788, "learning_rate": 9.602934981446803e-05, "loss": 0.0743, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 75, "tokens_per_second_per_gpu": 531.29 }, { "epoch": 0.1743119266055046, "grad_norm": 0.0387263149023056, "learning_rate": 9.588021187142235e-05, "loss": 0.083, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 76, "tokens_per_second_per_gpu": 424.59 }, { "epoch": 0.17660550458715596, "grad_norm": 0.027617793530225754, "learning_rate": 9.572844470858537e-05, "loss": 0.0769, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 77, "tokens_per_second_per_gpu": 461.9 }, { "epoch": 0.17889908256880735, "grad_norm": 0.029771512374281883, "learning_rate": 9.557405702315381e-05, "loss": 0.0658, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 78, "tokens_per_second_per_gpu": 475.77 }, { "epoch": 0.1811926605504587, "grad_norm": 0.029358675703406334, "learning_rate": 9.541705766249655e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 79, "tokens_per_second_per_gpu": 489.33 }, { "epoch": 0.1834862385321101, "grad_norm": 0.023111771792173386, "learning_rate": 9.525745562364756e-05, "loss": 0.066, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 80, "tokens_per_second_per_gpu": 382.84 }, { "epoch": 0.18577981651376146, "grad_norm": 0.029448291286826134, "learning_rate": 9.509526005279044e-05, "loss": 0.0608, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 81, "tokens_per_second_per_gpu": 415.81 }, { "epoch": 0.18807339449541285, "grad_norm": 0.02794116735458374, "learning_rate": 9.493048024473412e-05, "loss": 0.0736, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 82, "tokens_per_second_per_gpu": 400.02 }, { "epoch": 0.19036697247706422, "grad_norm": 0.04534873738884926, "learning_rate": 9.476312564238034e-05, "loss": 0.0673, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 83, "tokens_per_second_per_gpu": 369.1 }, { "epoch": 0.1926605504587156, "grad_norm": 0.026540853083133698, "learning_rate": 9.459320583618252e-05, "loss": 0.0558, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 84, "tokens_per_second_per_gpu": 611.61 }, { "epoch": 0.19495412844036697, "grad_norm": 0.03129403293132782, "learning_rate": 9.442073056359604e-05, "loss": 0.0741, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 85, "tokens_per_second_per_gpu": 492.16 }, { "epoch": 0.19724770642201836, "grad_norm": 0.027526071295142174, "learning_rate": 9.424570970852034e-05, "loss": 0.0733, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 86, "tokens_per_second_per_gpu": 427.76 }, { "epoch": 0.19954128440366972, "grad_norm": 0.025468798354268074, "learning_rate": 9.406815330073244e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 87, "tokens_per_second_per_gpu": 462.82 }, { "epoch": 0.2018348623853211, "grad_norm": 0.029043635353446007, "learning_rate": 9.388807151531229e-05, "loss": 0.0758, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 88, "tokens_per_second_per_gpu": 353.91 }, { "epoch": 0.20412844036697247, "grad_norm": 0.03196391835808754, "learning_rate": 9.37054746720595e-05, "loss": 0.0678, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 89, "tokens_per_second_per_gpu": 411.71 }, { "epoch": 0.20642201834862386, "grad_norm": 0.033272091299295425, "learning_rate": 9.352037323490208e-05, "loss": 0.0722, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 90, "tokens_per_second_per_gpu": 398.81 }, { "epoch": 0.20871559633027523, "grad_norm": 0.03096090629696846, "learning_rate": 9.333277781129678e-05, "loss": 0.0809, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 91, "tokens_per_second_per_gpu": 393.81 }, { "epoch": 0.21100917431192662, "grad_norm": 0.026267440989613533, "learning_rate": 9.314269915162114e-05, "loss": 0.0604, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 92, "tokens_per_second_per_gpu": 453.78 }, { "epoch": 0.21330275229357798, "grad_norm": 0.02608361840248108, "learning_rate": 9.295014814855753e-05, "loss": 0.0663, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 93, "tokens_per_second_per_gpu": 430.47 }, { "epoch": 0.21559633027522937, "grad_norm": 0.024829065427184105, "learning_rate": 9.275513583646884e-05, "loss": 0.0598, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 94, "tokens_per_second_per_gpu": 384.01 }, { "epoch": 0.21788990825688073, "grad_norm": 0.03385532647371292, "learning_rate": 9.255767339076622e-05, "loss": 0.0719, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 95, "tokens_per_second_per_gpu": 440.35 }, { "epoch": 0.22018348623853212, "grad_norm": 0.029608217999339104, "learning_rate": 9.23577721272686e-05, "loss": 0.094, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.04, "memory/max_allocated (GiB)": 49.04, "step": 96, "tokens_per_second_per_gpu": 485.56 }, { "epoch": 0.22247706422018348, "grad_norm": 0.02693762816488743, "learning_rate": 9.215544350155422e-05, "loss": 0.0755, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 97, "tokens_per_second_per_gpu": 432.16 }, { "epoch": 0.22477064220183487, "grad_norm": 0.02771424688398838, "learning_rate": 9.195069910830427e-05, "loss": 0.0692, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 98, "tokens_per_second_per_gpu": 412.93 }, { "epoch": 0.22706422018348624, "grad_norm": 0.02276022732257843, "learning_rate": 9.174355068063828e-05, "loss": 0.0637, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 99, "tokens_per_second_per_gpu": 418.24 }, { "epoch": 0.22935779816513763, "grad_norm": 0.026155246421694756, "learning_rate": 9.15340100894418e-05, "loss": 0.0698, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 100, "tokens_per_second_per_gpu": 403.6 }, { "epoch": 0.231651376146789, "grad_norm": 0.022778436541557312, "learning_rate": 9.132208934268622e-05, "loss": 0.0654, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 101, "tokens_per_second_per_gpu": 491.32 }, { "epoch": 0.23394495412844038, "grad_norm": 0.04701945558190346, "learning_rate": 9.110780058474052e-05, "loss": 0.0741, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 102, "tokens_per_second_per_gpu": 444.03 }, { "epoch": 0.23623853211009174, "grad_norm": 0.030211661010980606, "learning_rate": 9.08911560956753e-05, "loss": 0.0789, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 103, "tokens_per_second_per_gpu": 514.87 }, { "epoch": 0.23853211009174313, "grad_norm": 0.026159459725022316, "learning_rate": 9.067216829055922e-05, "loss": 0.0637, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 104, "tokens_per_second_per_gpu": 446.47 }, { "epoch": 0.2408256880733945, "grad_norm": 0.02918146923184395, "learning_rate": 9.045084971874738e-05, "loss": 0.0727, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 105, "tokens_per_second_per_gpu": 425.37 }, { "epoch": 0.24311926605504589, "grad_norm": 0.03170175105333328, "learning_rate": 9.022721306316222e-05, "loss": 0.0857, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.85, "memory/max_allocated (GiB)": 48.85, "step": 106, "tokens_per_second_per_gpu": 301.79 }, { "epoch": 0.24541284403669725, "grad_norm": 0.032674651592969894, "learning_rate": 9.000127113956674e-05, "loss": 0.0795, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.77, "memory/max_allocated (GiB)": 48.77, "step": 107, "tokens_per_second_per_gpu": 338.41 }, { "epoch": 0.24770642201834864, "grad_norm": 0.026492780074477196, "learning_rate": 8.977303689583e-05, "loss": 0.0775, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 108, "tokens_per_second_per_gpu": 383.35 }, { "epoch": 0.25, "grad_norm": 0.0290480125695467, "learning_rate": 8.954252341118523e-05, "loss": 0.076, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 109, "tokens_per_second_per_gpu": 382.78 }, { "epoch": 0.25229357798165136, "grad_norm": 0.030473977327346802, "learning_rate": 8.930974389548023e-05, "loss": 0.0761, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.93, "memory/max_allocated (GiB)": 48.93, "step": 110, "tokens_per_second_per_gpu": 476.56 }, { "epoch": 0.2545871559633027, "grad_norm": 0.02930077351629734, "learning_rate": 8.90747116884204e-05, "loss": 0.0691, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 111, "tokens_per_second_per_gpu": 441.2 }, { "epoch": 0.25688073394495414, "grad_norm": 0.02884151227772236, "learning_rate": 8.883744025880428e-05, "loss": 0.0806, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 112, "tokens_per_second_per_gpu": 406.96 }, { "epoch": 0.2591743119266055, "grad_norm": 0.02618175558745861, "learning_rate": 8.859794320375168e-05, "loss": 0.0677, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 113, "tokens_per_second_per_gpu": 430.04 }, { "epoch": 0.26146788990825687, "grad_norm": 0.026963548734784126, "learning_rate": 8.835623424792452e-05, "loss": 0.0694, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.89, "memory/max_allocated (GiB)": 48.89, "step": 114, "tokens_per_second_per_gpu": 351.9 }, { "epoch": 0.26376146788990823, "grad_norm": 0.021544624119997025, "learning_rate": 8.811232724274035e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 115, "tokens_per_second_per_gpu": 480.22 }, { "epoch": 0.26605504587155965, "grad_norm": 0.03840009495615959, "learning_rate": 8.786623616557847e-05, "loss": 0.0723, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 116, "tokens_per_second_per_gpu": 433.18 }, { "epoch": 0.268348623853211, "grad_norm": 0.022571468725800514, "learning_rate": 8.761797511897906e-05, "loss": 0.065, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 49.0, "memory/max_allocated (GiB)": 49.0, "step": 117, "tokens_per_second_per_gpu": 421.92 }, { "epoch": 0.2706422018348624, "grad_norm": 0.02688576467335224, "learning_rate": 8.736755832983497e-05, "loss": 0.0772, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 118, "tokens_per_second_per_gpu": 354.3 }, { "epoch": 0.27293577981651373, "grad_norm": 0.025858785957098007, "learning_rate": 8.711500014857634e-05, "loss": 0.0745, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.81, "memory/max_allocated (GiB)": 48.81, "step": 119, "tokens_per_second_per_gpu": 365.46 }, { "epoch": 0.27522935779816515, "grad_norm": 0.02718079835176468, "learning_rate": 8.686031504834843e-05, "loss": 0.0759, "memory/device_reserved (GiB)": 50.97, "memory/max_active (GiB)": 48.97, "memory/max_allocated (GiB)": 48.97, "step": 120, "tokens_per_second_per_gpu": 426.06 } ], "logging_steps": 1, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4689538053609882e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }