{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.956175298804781, "eval_steps": 500, "global_step": 186, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01593625498007968, "grad_norm": 3.265625, "learning_rate": 0.0, "loss": 1.1802, "memory/device_reserved (GiB)": 62.1, "memory/max_active (GiB)": 50.46, "memory/max_allocated (GiB)": 50.46, "step": 1, "tokens_per_second_per_gpu": 4078.9, "total_tokens": 187960 }, { "epoch": 0.03187250996015936, "grad_norm": 3.53125, "learning_rate": 1.111111111111111e-07, "loss": 1.2461, "memory/device_reserved (GiB)": 77.62, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 2, "tokens_per_second_per_gpu": 5600.28, "total_tokens": 380007 }, { "epoch": 0.04780876494023904, "grad_norm": 3.5625, "learning_rate": 2.222222222222222e-07, "loss": 1.3145, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 3, "tokens_per_second_per_gpu": 5504.25, "total_tokens": 566579 }, { "epoch": 0.06374501992031872, "grad_norm": 3.625, "learning_rate": 3.333333333333333e-07, "loss": 1.2505, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 4, "tokens_per_second_per_gpu": 5678.77, "total_tokens": 754678 }, { "epoch": 0.0796812749003984, "grad_norm": 3.46875, "learning_rate": 4.444444444444444e-07, "loss": 1.2344, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 5, "tokens_per_second_per_gpu": 5644.44, "total_tokens": 935084 }, { "epoch": 0.09561752988047809, "grad_norm": 3.71875, "learning_rate": 5.555555555555555e-07, "loss": 1.334, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 6, "tokens_per_second_per_gpu": 5488.37, "total_tokens": 1114037 }, { "epoch": 0.11155378486055777, "grad_norm": 3.4375, "learning_rate": 6.666666666666666e-07, "loss": 1.1704, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 7, "tokens_per_second_per_gpu": 5468.62, "total_tokens": 1302175 }, { "epoch": 0.12749003984063745, "grad_norm": 3.484375, "learning_rate": 7.777777777777778e-07, "loss": 1.2471, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 8, "tokens_per_second_per_gpu": 5412.64, "total_tokens": 1483342 }, { "epoch": 0.14342629482071714, "grad_norm": 3.375, "learning_rate": 8.888888888888888e-07, "loss": 1.2354, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 9, "tokens_per_second_per_gpu": 5455.54, "total_tokens": 1667332 }, { "epoch": 0.1593625498007968, "grad_norm": 3.546875, "learning_rate": 1e-06, "loss": 1.3232, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 10, "tokens_per_second_per_gpu": 5574.32, "total_tokens": 1849357 }, { "epoch": 0.1752988047808765, "grad_norm": 3.671875, "learning_rate": 1.111111111111111e-06, "loss": 1.3232, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 11, "tokens_per_second_per_gpu": 5321.58, "total_tokens": 2026853 }, { "epoch": 0.19123505976095617, "grad_norm": 3.359375, "learning_rate": 1.2222222222222223e-06, "loss": 1.2529, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 12, "tokens_per_second_per_gpu": 5847.85, "total_tokens": 2213285 }, { "epoch": 0.20717131474103587, "grad_norm": 3.328125, "learning_rate": 1.3333333333333332e-06, "loss": 1.2559, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 13, "tokens_per_second_per_gpu": 5613.77, "total_tokens": 2400076 }, { "epoch": 0.22310756972111553, "grad_norm": 3.15625, "learning_rate": 1.4444444444444443e-06, "loss": 1.2129, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 14, "tokens_per_second_per_gpu": 5571.51, "total_tokens": 2593112 }, { "epoch": 0.23904382470119523, "grad_norm": 3.125, "learning_rate": 1.5555555555555556e-06, "loss": 1.2153, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 15, "tokens_per_second_per_gpu": 5626.56, "total_tokens": 2782327 }, { "epoch": 0.2549800796812749, "grad_norm": 3.3125, "learning_rate": 1.6666666666666667e-06, "loss": 1.2598, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 16, "tokens_per_second_per_gpu": 5731.24, "total_tokens": 2966947 }, { "epoch": 0.27091633466135456, "grad_norm": 3.09375, "learning_rate": 1.7777777777777775e-06, "loss": 1.1714, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 17, "tokens_per_second_per_gpu": 6088.78, "total_tokens": 3161163 }, { "epoch": 0.2868525896414343, "grad_norm": 3.234375, "learning_rate": 1.8888888888888888e-06, "loss": 1.2402, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 18, "tokens_per_second_per_gpu": 5499.3, "total_tokens": 3343301 }, { "epoch": 0.30278884462151395, "grad_norm": 3.171875, "learning_rate": 2e-06, "loss": 1.2158, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 19, "tokens_per_second_per_gpu": 5505.1, "total_tokens": 3527203 }, { "epoch": 0.3187250996015936, "grad_norm": 3.21875, "learning_rate": 1.9998251609127463e-06, "loss": 1.2446, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 20, "tokens_per_second_per_gpu": 5441.06, "total_tokens": 3710228 }, { "epoch": 0.3346613545816733, "grad_norm": 2.875, "learning_rate": 1.9993007047883984e-06, "loss": 1.1895, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 21, "tokens_per_second_per_gpu": 5817.48, "total_tokens": 3903090 }, { "epoch": 0.350597609561753, "grad_norm": 3.078125, "learning_rate": 1.9984268150178167e-06, "loss": 1.209, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 22, "tokens_per_second_per_gpu": 5823.26, "total_tokens": 4085213 }, { "epoch": 0.3665338645418327, "grad_norm": 2.71875, "learning_rate": 1.9972037971811797e-06, "loss": 1.1602, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 23, "tokens_per_second_per_gpu": 5595.44, "total_tokens": 4271930 }, { "epoch": 0.38247011952191234, "grad_norm": 2.625, "learning_rate": 1.9956320789411338e-06, "loss": 1.1587, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 24, "tokens_per_second_per_gpu": 5470.25, "total_tokens": 4458282 }, { "epoch": 0.398406374501992, "grad_norm": 2.96875, "learning_rate": 1.9937122098932426e-06, "loss": 1.2295, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 25, "tokens_per_second_per_gpu": 5381.09, "total_tokens": 4638920 }, { "epoch": 0.41434262948207173, "grad_norm": 2.765625, "learning_rate": 1.9914448613738106e-06, "loss": 1.2695, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 26, "tokens_per_second_per_gpu": 5413.01, "total_tokens": 4823256 }, { "epoch": 0.4302788844621514, "grad_norm": 2.703125, "learning_rate": 1.9888308262251284e-06, "loss": 1.2412, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 27, "tokens_per_second_per_gpu": 5555.69, "total_tokens": 5004546 }, { "epoch": 0.44621513944223107, "grad_norm": 2.703125, "learning_rate": 1.9858710185182355e-06, "loss": 1.2666, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 28, "tokens_per_second_per_gpu": 5321.57, "total_tokens": 5180916 }, { "epoch": 0.46215139442231074, "grad_norm": 2.5625, "learning_rate": 1.9825664732332882e-06, "loss": 1.207, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 29, "tokens_per_second_per_gpu": 5507.08, "total_tokens": 5362870 }, { "epoch": 0.47808764940239046, "grad_norm": 2.671875, "learning_rate": 1.9789183458976484e-06, "loss": 1.1904, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 30, "tokens_per_second_per_gpu": 5788.71, "total_tokens": 5548692 }, { "epoch": 0.4940239043824701, "grad_norm": 2.4375, "learning_rate": 1.9749279121818236e-06, "loss": 1.1865, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 31, "tokens_per_second_per_gpu": 5411.35, "total_tokens": 5734611 }, { "epoch": 0.5099601593625498, "grad_norm": 2.5, "learning_rate": 1.970596567453391e-06, "loss": 1.1953, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 32, "tokens_per_second_per_gpu": 5568.61, "total_tokens": 5915483 }, { "epoch": 0.5258964143426295, "grad_norm": 2.328125, "learning_rate": 1.965925826289068e-06, "loss": 1.1885, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 33, "tokens_per_second_per_gpu": 5527.28, "total_tokens": 6103887 }, { "epoch": 0.5418326693227091, "grad_norm": 2.390625, "learning_rate": 1.9609173219450997e-06, "loss": 1.2578, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 34, "tokens_per_second_per_gpu": 5786.79, "total_tokens": 6291053 }, { "epoch": 0.5577689243027888, "grad_norm": 2.390625, "learning_rate": 1.955572805786141e-06, "loss": 1.2656, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 35, "tokens_per_second_per_gpu": 5616.39, "total_tokens": 6476080 }, { "epoch": 0.5737051792828686, "grad_norm": 2.40625, "learning_rate": 1.9498941466728456e-06, "loss": 1.2285, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 36, "tokens_per_second_per_gpu": 5446.88, "total_tokens": 6654173 }, { "epoch": 0.5896414342629482, "grad_norm": 2.1875, "learning_rate": 1.9438833303083674e-06, "loss": 1.2314, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 37, "tokens_per_second_per_gpu": 5411.99, "total_tokens": 6838496 }, { "epoch": 0.6055776892430279, "grad_norm": 2.140625, "learning_rate": 1.937542458543999e-06, "loss": 1.1685, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 38, "tokens_per_second_per_gpu": 5285.57, "total_tokens": 7020101 }, { "epoch": 0.6215139442231076, "grad_norm": 2.09375, "learning_rate": 1.930873748644204e-06, "loss": 1.1489, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 39, "tokens_per_second_per_gpu": 5909.97, "total_tokens": 7209728 }, { "epoch": 0.6374501992031872, "grad_norm": 2.21875, "learning_rate": 1.9238795325112867e-06, "loss": 1.1924, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 40, "tokens_per_second_per_gpu": 5668.7, "total_tokens": 7392542 }, { "epoch": 0.6533864541832669, "grad_norm": 2.1875, "learning_rate": 1.916562255869976e-06, "loss": 1.21, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 41, "tokens_per_second_per_gpu": 5187.81, "total_tokens": 7570380 }, { "epoch": 0.6693227091633466, "grad_norm": 2.03125, "learning_rate": 1.908924477412211e-06, "loss": 1.1787, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 42, "tokens_per_second_per_gpu": 5502.78, "total_tokens": 7753287 }, { "epoch": 0.6852589641434262, "grad_norm": 1.90625, "learning_rate": 1.9009688679024189e-06, "loss": 1.1504, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 43, "tokens_per_second_per_gpu": 5664.86, "total_tokens": 7942863 }, { "epoch": 0.701195219123506, "grad_norm": 1.9921875, "learning_rate": 1.8926982092436114e-06, "loss": 1.2075, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 44, "tokens_per_second_per_gpu": 5493.62, "total_tokens": 8127398 }, { "epoch": 0.7171314741035857, "grad_norm": 1.9140625, "learning_rate": 1.8841153935046096e-06, "loss": 1.1484, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 45, "tokens_per_second_per_gpu": 5847.74, "total_tokens": 8312603 }, { "epoch": 0.7330677290836654, "grad_norm": 1.90625, "learning_rate": 1.8752234219087537e-06, "loss": 1.167, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 46, "tokens_per_second_per_gpu": 5502.76, "total_tokens": 8497383 }, { "epoch": 0.749003984063745, "grad_norm": 1.8828125, "learning_rate": 1.8660254037844386e-06, "loss": 1.147, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 47, "tokens_per_second_per_gpu": 5615.32, "total_tokens": 8688166 }, { "epoch": 0.7649402390438247, "grad_norm": 1.9453125, "learning_rate": 1.8565245554778515e-06, "loss": 1.1992, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 48, "tokens_per_second_per_gpu": 5767.9, "total_tokens": 8874386 }, { "epoch": 0.7808764940239044, "grad_norm": 1.90625, "learning_rate": 1.8467241992282841e-06, "loss": 1.1475, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 49, "tokens_per_second_per_gpu": 5789.02, "total_tokens": 9059229 }, { "epoch": 0.796812749003984, "grad_norm": 1.75, "learning_rate": 1.8366277620064197e-06, "loss": 1.0986, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 50, "tokens_per_second_per_gpu": 5192.74, "total_tokens": 9255532 }, { "epoch": 0.8127490039840638, "grad_norm": 1.7578125, "learning_rate": 1.8262387743159948e-06, "loss": 1.0908, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 51, "tokens_per_second_per_gpu": 5522.43, "total_tokens": 9446438 }, { "epoch": 0.8286852589641435, "grad_norm": 1.7109375, "learning_rate": 1.8155608689592601e-06, "loss": 1.1084, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 52, "tokens_per_second_per_gpu": 5467.0, "total_tokens": 9632510 }, { "epoch": 0.8446215139442231, "grad_norm": 1.6171875, "learning_rate": 1.8045977797666683e-06, "loss": 1.0781, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 53, "tokens_per_second_per_gpu": 5831.19, "total_tokens": 9824144 }, { "epoch": 0.8605577689243028, "grad_norm": 1.78125, "learning_rate": 1.7933533402912351e-06, "loss": 1.1533, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 54, "tokens_per_second_per_gpu": 5705.42, "total_tokens": 10010140 }, { "epoch": 0.8764940239043825, "grad_norm": 1.7734375, "learning_rate": 1.7818314824680298e-06, "loss": 1.1689, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 55, "tokens_per_second_per_gpu": 5696.79, "total_tokens": 10195855 }, { "epoch": 0.8924302788844621, "grad_norm": 1.6796875, "learning_rate": 1.770036235239263e-06, "loss": 1.1182, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 56, "tokens_per_second_per_gpu": 6045.85, "total_tokens": 10381589 }, { "epoch": 0.9083665338645418, "grad_norm": 1.6875, "learning_rate": 1.7579717231454529e-06, "loss": 1.1738, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 57, "tokens_per_second_per_gpu": 5658.53, "total_tokens": 10573779 }, { "epoch": 0.9243027888446215, "grad_norm": 1.65625, "learning_rate": 1.7456421648831654e-06, "loss": 1.1553, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 58, "tokens_per_second_per_gpu": 5601.69, "total_tokens": 10762764 }, { "epoch": 0.9402390438247012, "grad_norm": 1.7109375, "learning_rate": 1.733051871829826e-06, "loss": 1.1416, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 59, "tokens_per_second_per_gpu": 5873.82, "total_tokens": 10948310 }, { "epoch": 0.9561752988047809, "grad_norm": 1.5859375, "learning_rate": 1.7202052465361266e-06, "loss": 1.1514, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 60, "tokens_per_second_per_gpu": 5552.06, "total_tokens": 11134379 }, { "epoch": 0.9721115537848606, "grad_norm": 1.5859375, "learning_rate": 1.7071067811865474e-06, "loss": 1.1243, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 61, "tokens_per_second_per_gpu": 5016.43, "total_tokens": 11320025 }, { "epoch": 0.9880478087649402, "grad_norm": 1.4921875, "learning_rate": 1.6937610560285416e-06, "loss": 1.0957, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 62, "tokens_per_second_per_gpu": 5350.32, "total_tokens": 11504140 }, { "epoch": 1.0, "grad_norm": 1.609375, "learning_rate": 1.6801727377709191e-06, "loss": 1.1582, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 63, "tokens_per_second_per_gpu": 4173.22, "total_tokens": 11627371 }, { "epoch": 1.0159362549800797, "grad_norm": 1.4609375, "learning_rate": 1.6663465779520037e-06, "loss": 1.0864, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 64, "tokens_per_second_per_gpu": 5530.73, "total_tokens": 11815331 }, { "epoch": 1.0318725099601593, "grad_norm": 1.6015625, "learning_rate": 1.6522874112781212e-06, "loss": 1.1445, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 65, "tokens_per_second_per_gpu": 5941.29, "total_tokens": 12007378 }, { "epoch": 1.047808764940239, "grad_norm": 1.546875, "learning_rate": 1.6380001539330085e-06, "loss": 1.209, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 66, "tokens_per_second_per_gpu": 5598.11, "total_tokens": 12193950 }, { "epoch": 1.0637450199203187, "grad_norm": 1.578125, "learning_rate": 1.6234898018587336e-06, "loss": 1.1421, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 67, "tokens_per_second_per_gpu": 5671.24, "total_tokens": 12382049 }, { "epoch": 1.0796812749003983, "grad_norm": 1.5, "learning_rate": 1.6087614290087205e-06, "loss": 1.1323, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 68, "tokens_per_second_per_gpu": 5627.89, "total_tokens": 12562455 }, { "epoch": 1.095617529880478, "grad_norm": 1.5859375, "learning_rate": 1.5938201855735014e-06, "loss": 1.2207, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 69, "tokens_per_second_per_gpu": 5368.43, "total_tokens": 12741408 }, { "epoch": 1.1115537848605577, "grad_norm": 1.46875, "learning_rate": 1.578671296179806e-06, "loss": 1.0659, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 70, "tokens_per_second_per_gpu": 5466.98, "total_tokens": 12929546 }, { "epoch": 1.1274900398406373, "grad_norm": 1.421875, "learning_rate": 1.563320058063622e-06, "loss": 1.1416, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 71, "tokens_per_second_per_gpu": 5509.23, "total_tokens": 13110713 }, { "epoch": 1.1434262948207172, "grad_norm": 1.34375, "learning_rate": 1.5477718392178713e-06, "loss": 1.1318, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 72, "tokens_per_second_per_gpu": 5553.33, "total_tokens": 13294703 }, { "epoch": 1.159362549800797, "grad_norm": 1.484375, "learning_rate": 1.5320320765153365e-06, "loss": 1.2129, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 73, "tokens_per_second_per_gpu": 5457.5, "total_tokens": 13476728 }, { "epoch": 1.1752988047808766, "grad_norm": 1.5, "learning_rate": 1.5161062738075065e-06, "loss": 1.2109, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 74, "tokens_per_second_per_gpu": 5229.94, "total_tokens": 13654224 }, { "epoch": 1.1912350597609562, "grad_norm": 1.3671875, "learning_rate": 1.5e-06, "loss": 1.1475, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 75, "tokens_per_second_per_gpu": 5856.14, "total_tokens": 13840656 }, { "epoch": 1.207171314741036, "grad_norm": 1.3984375, "learning_rate": 1.4837188871052397e-06, "loss": 1.1494, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 76, "tokens_per_second_per_gpu": 5621.93, "total_tokens": 14027447 }, { "epoch": 1.2231075697211156, "grad_norm": 1.3125, "learning_rate": 1.467268628273062e-06, "loss": 1.1133, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 77, "tokens_per_second_per_gpu": 5678.9, "total_tokens": 14220483 }, { "epoch": 1.2390438247011952, "grad_norm": 1.2734375, "learning_rate": 1.4506549757999453e-06, "loss": 1.1182, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 78, "tokens_per_second_per_gpu": 5610.92, "total_tokens": 14409698 }, { "epoch": 1.254980079681275, "grad_norm": 1.4296875, "learning_rate": 1.433883739117558e-06, "loss": 1.1582, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 79, "tokens_per_second_per_gpu": 5624.06, "total_tokens": 14594318 }, { "epoch": 1.2709163346613546, "grad_norm": 1.328125, "learning_rate": 1.4169607827613282e-06, "loss": 1.0762, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 80, "tokens_per_second_per_gpu": 6081.02, "total_tokens": 14788534 }, { "epoch": 1.2868525896414342, "grad_norm": 1.3125, "learning_rate": 1.3998920243197408e-06, "loss": 1.1372, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 81, "tokens_per_second_per_gpu": 5511.47, "total_tokens": 14970672 }, { "epoch": 1.302788844621514, "grad_norm": 1.3671875, "learning_rate": 1.3826834323650898e-06, "loss": 1.1167, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 82, "tokens_per_second_per_gpu": 5614.12, "total_tokens": 15154574 }, { "epoch": 1.3187250996015936, "grad_norm": 1.390625, "learning_rate": 1.3653410243663951e-06, "loss": 1.1455, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 83, "tokens_per_second_per_gpu": 5438.76, "total_tokens": 15337599 }, { "epoch": 1.3346613545816732, "grad_norm": 1.234375, "learning_rate": 1.347870864585227e-06, "loss": 1.103, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 84, "tokens_per_second_per_gpu": 5795.88, "total_tokens": 15530461 }, { "epoch": 1.3505976095617531, "grad_norm": 1.3515625, "learning_rate": 1.3302790619551672e-06, "loss": 1.1162, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 85, "tokens_per_second_per_gpu": 5723.72, "total_tokens": 15712584 }, { "epoch": 1.3665338645418328, "grad_norm": 1.2265625, "learning_rate": 1.3125717679456444e-06, "loss": 1.0786, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 86, "tokens_per_second_per_gpu": 5598.4, "total_tokens": 15899301 }, { "epoch": 1.3824701195219125, "grad_norm": 1.1875, "learning_rate": 1.2947551744109043e-06, "loss": 1.082, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 87, "tokens_per_second_per_gpu": 5458.74, "total_tokens": 16085653 }, { "epoch": 1.3984063745019921, "grad_norm": 1.3515625, "learning_rate": 1.2768355114248492e-06, "loss": 1.1436, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 88, "tokens_per_second_per_gpu": 5579.84, "total_tokens": 16266291 }, { "epoch": 1.4143426294820718, "grad_norm": 1.2734375, "learning_rate": 1.2588190451025207e-06, "loss": 1.1914, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 89, "tokens_per_second_per_gpu": 5336.85, "total_tokens": 16450627 }, { "epoch": 1.4302788844621515, "grad_norm": 1.25, "learning_rate": 1.240712075408973e-06, "loss": 1.167, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 90, "tokens_per_second_per_gpu": 5445.98, "total_tokens": 16631917 }, { "epoch": 1.4462151394422311, "grad_norm": 1.265625, "learning_rate": 1.2225209339563143e-06, "loss": 1.1895, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 91, "tokens_per_second_per_gpu": 5330.92, "total_tokens": 16808287 }, { "epoch": 1.4621513944223108, "grad_norm": 1.25, "learning_rate": 1.2042519817896804e-06, "loss": 1.1357, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 92, "tokens_per_second_per_gpu": 5515.76, "total_tokens": 16990241 }, { "epoch": 1.4780876494023905, "grad_norm": 1.328125, "learning_rate": 1.1859116071629147e-06, "loss": 1.1187, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 93, "tokens_per_second_per_gpu": 5890.54, "total_tokens": 17176063 }, { "epoch": 1.4940239043824701, "grad_norm": 1.2265625, "learning_rate": 1.1675062233047363e-06, "loss": 1.1211, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 94, "tokens_per_second_per_gpu": 5413.95, "total_tokens": 17361982 }, { "epoch": 1.5099601593625498, "grad_norm": 1.2734375, "learning_rate": 1.1490422661761743e-06, "loss": 1.1294, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 95, "tokens_per_second_per_gpu": 5447.34, "total_tokens": 17542854 }, { "epoch": 1.5258964143426295, "grad_norm": 1.1875, "learning_rate": 1.1305261922200517e-06, "loss": 1.125, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 96, "tokens_per_second_per_gpu": 5531.41, "total_tokens": 17731258 }, { "epoch": 1.5418326693227091, "grad_norm": 1.2109375, "learning_rate": 1.1119644761033077e-06, "loss": 1.1963, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 97, "tokens_per_second_per_gpu": 5812.79, "total_tokens": 17918424 }, { "epoch": 1.5577689243027888, "grad_norm": 1.203125, "learning_rate": 1.0933636084529506e-06, "loss": 1.2041, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 98, "tokens_per_second_per_gpu": 5613.31, "total_tokens": 18103451 }, { "epoch": 1.5737051792828685, "grad_norm": 1.2578125, "learning_rate": 1.0747300935864243e-06, "loss": 1.168, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 99, "tokens_per_second_per_gpu": 5555.4, "total_tokens": 18281544 }, { "epoch": 1.5896414342629481, "grad_norm": 1.171875, "learning_rate": 1.0560704472371917e-06, "loss": 1.1768, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 100, "tokens_per_second_per_gpu": 5413.52, "total_tokens": 18465867 }, { "epoch": 1.6055776892430278, "grad_norm": 1.15625, "learning_rate": 1.037391194276326e-06, "loss": 1.1162, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 101, "tokens_per_second_per_gpu": 5209.56, "total_tokens": 18647472 }, { "epoch": 1.6215139442231075, "grad_norm": 1.1328125, "learning_rate": 1.0186988664309022e-06, "loss": 1.0986, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 102, "tokens_per_second_per_gpu": 5913.8, "total_tokens": 18837099 }, { "epoch": 1.6374501992031871, "grad_norm": 1.21875, "learning_rate": 1e-06, "loss": 1.1392, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 103, "tokens_per_second_per_gpu": 5669.98, "total_tokens": 19019913 }, { "epoch": 1.6533864541832668, "grad_norm": 1.2109375, "learning_rate": 9.81301133569098e-07, "loss": 1.1582, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 104, "tokens_per_second_per_gpu": 5370.41, "total_tokens": 19197751 }, { "epoch": 1.6693227091633465, "grad_norm": 1.1796875, "learning_rate": 9.626088057236744e-07, "loss": 1.1318, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 105, "tokens_per_second_per_gpu": 5391.77, "total_tokens": 19380658 }, { "epoch": 1.6852589641434261, "grad_norm": 1.09375, "learning_rate": 9.43929552762808e-07, "loss": 1.1084, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 106, "tokens_per_second_per_gpu": 5564.12, "total_tokens": 19570234 }, { "epoch": 1.701195219123506, "grad_norm": 1.1640625, "learning_rate": 9.252699064135758e-07, "loss": 1.1616, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 107, "tokens_per_second_per_gpu": 5485.18, "total_tokens": 19754769 }, { "epoch": 1.7171314741035857, "grad_norm": 1.1328125, "learning_rate": 9.066363915470494e-07, "loss": 1.106, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 108, "tokens_per_second_per_gpu": 5839.07, "total_tokens": 19939974 }, { "epoch": 1.7330677290836654, "grad_norm": 1.1328125, "learning_rate": 8.880355238966921e-07, "loss": 1.125, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 109, "tokens_per_second_per_gpu": 5601.59, "total_tokens": 20124754 }, { "epoch": 1.749003984063745, "grad_norm": 1.15625, "learning_rate": 8.694738077799486e-07, "loss": 1.1084, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 110, "tokens_per_second_per_gpu": 5601.31, "total_tokens": 20315537 }, { "epoch": 1.7649402390438247, "grad_norm": 1.1640625, "learning_rate": 8.509577338238254e-07, "loss": 1.1602, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 111, "tokens_per_second_per_gpu": 5767.0, "total_tokens": 20501757 }, { "epoch": 1.7808764940239044, "grad_norm": 1.1796875, "learning_rate": 8.324937766952636e-07, "loss": 1.1094, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 112, "tokens_per_second_per_gpu": 5679.05, "total_tokens": 20686600 }, { "epoch": 1.796812749003984, "grad_norm": 1.1015625, "learning_rate": 8.140883928370854e-07, "loss": 1.0659, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 113, "tokens_per_second_per_gpu": 5176.17, "total_tokens": 20882903 }, { "epoch": 1.812749003984064, "grad_norm": 1.109375, "learning_rate": 7.957480182103197e-07, "loss": 1.0562, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 114, "tokens_per_second_per_gpu": 5609.67, "total_tokens": 21073809 }, { "epoch": 1.8286852589641436, "grad_norm": 1.09375, "learning_rate": 7.774790660436857e-07, "loss": 1.0747, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 115, "tokens_per_second_per_gpu": 5480.8, "total_tokens": 21259881 }, { "epoch": 1.8446215139442232, "grad_norm": 1.0390625, "learning_rate": 7.592879245910272e-07, "loss": 1.0459, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 116, "tokens_per_second_per_gpu": 5854.74, "total_tokens": 21451515 }, { "epoch": 1.860557768924303, "grad_norm": 1.1484375, "learning_rate": 7.411809548974791e-07, "loss": 1.1191, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 117, "tokens_per_second_per_gpu": 5607.88, "total_tokens": 21637511 }, { "epoch": 1.8764940239043826, "grad_norm": 1.1328125, "learning_rate": 7.231644885751507e-07, "loss": 1.1377, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 118, "tokens_per_second_per_gpu": 5697.72, "total_tokens": 21823226 }, { "epoch": 1.8924302788844622, "grad_norm": 1.1171875, "learning_rate": 7.052448255890957e-07, "loss": 1.0869, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 119, "tokens_per_second_per_gpu": 6034.6, "total_tokens": 22008960 }, { "epoch": 1.908366533864542, "grad_norm": 1.1171875, "learning_rate": 6.874282320543556e-07, "loss": 1.144, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 120, "tokens_per_second_per_gpu": 5668.36, "total_tokens": 22201150 }, { "epoch": 1.9243027888446216, "grad_norm": 1.0859375, "learning_rate": 6.697209380448332e-07, "loss": 1.1255, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 121, "tokens_per_second_per_gpu": 5720.74, "total_tokens": 22390135 }, { "epoch": 1.9402390438247012, "grad_norm": 1.15625, "learning_rate": 6.521291354147726e-07, "loss": 1.1104, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 122, "tokens_per_second_per_gpu": 5870.28, "total_tokens": 22575681 }, { "epoch": 1.956175298804781, "grad_norm": 1.0625, "learning_rate": 6.34658975633605e-07, "loss": 1.1221, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 123, "tokens_per_second_per_gpu": 5453.27, "total_tokens": 22761750 }, { "epoch": 1.9721115537848606, "grad_norm": 1.0859375, "learning_rate": 6.173165676349102e-07, "loss": 1.0967, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 124, "tokens_per_second_per_gpu": 4989.67, "total_tokens": 22947396 }, { "epoch": 1.9880478087649402, "grad_norm": 1.0390625, "learning_rate": 6.001079756802592e-07, "loss": 1.0703, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 125, "tokens_per_second_per_gpu": 5385.24, "total_tokens": 23131511 }, { "epoch": 2.0, "grad_norm": 1.140625, "learning_rate": 5.830392172386722e-07, "loss": 1.1328, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 126, "tokens_per_second_per_gpu": 4540.27, "total_tokens": 23254742 }, { "epoch": 2.0159362549800797, "grad_norm": 1.03125, "learning_rate": 5.661162608824419e-07, "loss": 1.061, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 127, "tokens_per_second_per_gpu": 5523.13, "total_tokens": 23442702 }, { "epoch": 2.0318725099601593, "grad_norm": 1.140625, "learning_rate": 5.493450242000546e-07, "loss": 1.1201, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 128, "tokens_per_second_per_gpu": 5699.3, "total_tokens": 23634749 }, { "epoch": 2.047808764940239, "grad_norm": 1.109375, "learning_rate": 5.327313717269379e-07, "loss": 1.1875, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 129, "tokens_per_second_per_gpu": 5607.25, "total_tokens": 23821321 }, { "epoch": 2.0637450199203187, "grad_norm": 1.15625, "learning_rate": 5.162811128947602e-07, "loss": 1.1191, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 130, "tokens_per_second_per_gpu": 5781.65, "total_tokens": 24009420 }, { "epoch": 2.0796812749003983, "grad_norm": 1.1015625, "learning_rate": 5.000000000000002e-07, "loss": 1.1104, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 131, "tokens_per_second_per_gpu": 5505.55, "total_tokens": 24189826 }, { "epoch": 2.095617529880478, "grad_norm": 1.1796875, "learning_rate": 4.838937261924933e-07, "loss": 1.1973, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 132, "tokens_per_second_per_gpu": 5512.31, "total_tokens": 24368779 }, { "epoch": 2.1115537848605577, "grad_norm": 1.1015625, "learning_rate": 4.6796792348466353e-07, "loss": 1.0444, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 133, "tokens_per_second_per_gpu": 5465.08, "total_tokens": 24556917 }, { "epoch": 2.1274900398406373, "grad_norm": 1.1015625, "learning_rate": 4.522281607821288e-07, "loss": 1.1206, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 134, "tokens_per_second_per_gpu": 5419.8, "total_tokens": 24738084 }, { "epoch": 2.143426294820717, "grad_norm": 1.03125, "learning_rate": 4.366799419363779e-07, "loss": 1.1143, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 135, "tokens_per_second_per_gpu": 5539.41, "total_tokens": 24922074 }, { "epoch": 2.1593625498007967, "grad_norm": 1.1328125, "learning_rate": 4.2132870382019427e-07, "loss": 1.1924, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 136, "tokens_per_second_per_gpu": 5444.42, "total_tokens": 25104099 }, { "epoch": 2.1752988047808763, "grad_norm": 1.1328125, "learning_rate": 4.061798144264985e-07, "loss": 1.1885, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 137, "tokens_per_second_per_gpu": 5332.72, "total_tokens": 25281595 }, { "epoch": 2.191235059760956, "grad_norm": 1.0703125, "learning_rate": 3.912385709912793e-07, "loss": 1.1318, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 138, "tokens_per_second_per_gpu": 5857.46, "total_tokens": 25468027 }, { "epoch": 2.2071713147410357, "grad_norm": 1.1015625, "learning_rate": 3.765101981412665e-07, "loss": 1.1328, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 139, "tokens_per_second_per_gpu": 5613.35, "total_tokens": 25654818 }, { "epoch": 2.2231075697211153, "grad_norm": 1.0390625, "learning_rate": 3.6199984606699153e-07, "loss": 1.0981, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 140, "tokens_per_second_per_gpu": 5586.6, "total_tokens": 25847854 }, { "epoch": 2.239043824701195, "grad_norm": 1.015625, "learning_rate": 3.477125887218791e-07, "loss": 1.105, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 141, "tokens_per_second_per_gpu": 5723.95, "total_tokens": 26037069 }, { "epoch": 2.2549800796812747, "grad_norm": 1.15625, "learning_rate": 3.3365342204799606e-07, "loss": 1.1416, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 142, "tokens_per_second_per_gpu": 5609.66, "total_tokens": 26221689 }, { "epoch": 2.2709163346613543, "grad_norm": 1.0703125, "learning_rate": 3.198272622290804e-07, "loss": 1.0625, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 143, "tokens_per_second_per_gpu": 6065.87, "total_tokens": 26415905 }, { "epoch": 2.2868525896414345, "grad_norm": 1.0703125, "learning_rate": 3.0623894397145833e-07, "loss": 1.123, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 144, "tokens_per_second_per_gpu": 5513.26, "total_tokens": 26598043 }, { "epoch": 2.302788844621514, "grad_norm": 1.078125, "learning_rate": 2.9289321881345254e-07, "loss": 1.103, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 145, "tokens_per_second_per_gpu": 5505.93, "total_tokens": 26781945 }, { "epoch": 2.318725099601594, "grad_norm": 1.140625, "learning_rate": 2.797947534638736e-07, "loss": 1.1348, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 146, "tokens_per_second_per_gpu": 5529.92, "total_tokens": 26964970 }, { "epoch": 2.3346613545816735, "grad_norm": 1.0234375, "learning_rate": 2.6694812817017387e-07, "loss": 1.0938, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 147, "tokens_per_second_per_gpu": 5703.33, "total_tokens": 27157832 }, { "epoch": 2.350597609561753, "grad_norm": 1.1640625, "learning_rate": 2.543578351168344e-07, "loss": 1.1045, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 148, "tokens_per_second_per_gpu": 5833.69, "total_tokens": 27339955 }, { "epoch": 2.366533864541833, "grad_norm": 1.03125, "learning_rate": 2.4202827685454687e-07, "loss": 1.0674, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 149, "tokens_per_second_per_gpu": 5586.43, "total_tokens": 27526672 }, { "epoch": 2.3824701195219125, "grad_norm": 1.03125, "learning_rate": 2.299637647607372e-07, "loss": 1.0728, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 150, "tokens_per_second_per_gpu": 5369.71, "total_tokens": 27713024 }, { "epoch": 2.398406374501992, "grad_norm": 1.1640625, "learning_rate": 2.181685175319702e-07, "loss": 1.1318, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 151, "tokens_per_second_per_gpu": 5560.0, "total_tokens": 27893662 }, { "epoch": 2.414342629482072, "grad_norm": 1.1015625, "learning_rate": 2.0664665970876495e-07, "loss": 1.1807, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 152, "tokens_per_second_per_gpu": 5346.34, "total_tokens": 28077998 }, { "epoch": 2.4302788844621515, "grad_norm": 1.078125, "learning_rate": 1.9540222023333163e-07, "loss": 1.1572, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 153, "tokens_per_second_per_gpu": 5554.36, "total_tokens": 28259288 }, { "epoch": 2.446215139442231, "grad_norm": 1.09375, "learning_rate": 1.8443913104073982e-07, "loss": 1.1807, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 154, "tokens_per_second_per_gpu": 5321.03, "total_tokens": 28435658 }, { "epoch": 2.462151394422311, "grad_norm": 1.1015625, "learning_rate": 1.737612256840053e-07, "loss": 1.127, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 155, "tokens_per_second_per_gpu": 5503.74, "total_tokens": 28617612 }, { "epoch": 2.4780876494023905, "grad_norm": 1.1640625, "learning_rate": 1.6337223799358024e-07, "loss": 1.1099, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 156, "tokens_per_second_per_gpu": 5737.94, "total_tokens": 28803434 }, { "epoch": 2.49402390438247, "grad_norm": 1.0859375, "learning_rate": 1.5327580077171588e-07, "loss": 1.1143, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 157, "tokens_per_second_per_gpu": 5419.26, "total_tokens": 28989353 }, { "epoch": 2.50996015936255, "grad_norm": 1.140625, "learning_rate": 1.4347544452214867e-07, "loss": 1.1216, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 158, "tokens_per_second_per_gpu": 5564.03, "total_tokens": 29170225 }, { "epoch": 2.5258964143426295, "grad_norm": 1.0703125, "learning_rate": 1.3397459621556128e-07, "loss": 1.1182, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 159, "tokens_per_second_per_gpu": 5533.6, "total_tokens": 29358629 }, { "epoch": 2.541832669322709, "grad_norm": 1.09375, "learning_rate": 1.2477657809124632e-07, "loss": 1.1895, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 160, "tokens_per_second_per_gpu": 5801.01, "total_tokens": 29545795 }, { "epoch": 2.557768924302789, "grad_norm": 1.09375, "learning_rate": 1.1588460649539034e-07, "loss": 1.1973, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 161, "tokens_per_second_per_gpu": 5496.74, "total_tokens": 29730822 }, { "epoch": 2.5737051792828685, "grad_norm": 1.1484375, "learning_rate": 1.0730179075638868e-07, "loss": 1.1611, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 162, "tokens_per_second_per_gpu": 5661.86, "total_tokens": 29908915 }, { "epoch": 2.589641434262948, "grad_norm": 1.078125, "learning_rate": 9.903113209758096e-08, "loss": 1.1709, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 163, "tokens_per_second_per_gpu": 5334.67, "total_tokens": 30093238 }, { "epoch": 2.605577689243028, "grad_norm": 1.0703125, "learning_rate": 9.107552258778905e-08, "loss": 1.1099, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 164, "tokens_per_second_per_gpu": 5276.84, "total_tokens": 30274843 }, { "epoch": 2.6215139442231075, "grad_norm": 1.046875, "learning_rate": 8.34377441300238e-08, "loss": 1.0933, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 165, "tokens_per_second_per_gpu": 5920.44, "total_tokens": 30464470 }, { "epoch": 2.637450199203187, "grad_norm": 1.1328125, "learning_rate": 7.612046748871326e-08, "loss": 1.1348, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 166, "tokens_per_second_per_gpu": 5562.69, "total_tokens": 30647284 }, { "epoch": 2.653386454183267, "grad_norm": 1.1328125, "learning_rate": 6.912625135579586e-08, "loss": 1.1553, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 167, "tokens_per_second_per_gpu": 5381.44, "total_tokens": 30825122 }, { "epoch": 2.6693227091633465, "grad_norm": 1.078125, "learning_rate": 6.245754145600091e-08, "loss": 1.126, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 168, "tokens_per_second_per_gpu": 5392.77, "total_tokens": 31008029 }, { "epoch": 2.685258964143426, "grad_norm": 1.03125, "learning_rate": 5.611666969163242e-08, "loss": 1.1025, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 169, "tokens_per_second_per_gpu": 5636.12, "total_tokens": 31197605 }, { "epoch": 2.7011952191235062, "grad_norm": 1.09375, "learning_rate": 5.0105853327154004e-08, "loss": 1.1572, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 170, "tokens_per_second_per_gpu": 5442.72, "total_tokens": 31382140 }, { "epoch": 2.717131474103586, "grad_norm": 1.0703125, "learning_rate": 4.442719421385921e-08, "loss": 1.1011, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 171, "tokens_per_second_per_gpu": 5829.11, "total_tokens": 31567345 }, { "epoch": 2.7330677290836656, "grad_norm": 1.078125, "learning_rate": 3.908267805490051e-08, "loss": 1.123, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 172, "tokens_per_second_per_gpu": 5509.97, "total_tokens": 31752125 }, { "epoch": 2.7490039840637452, "grad_norm": 1.1015625, "learning_rate": 3.4074173710931796e-08, "loss": 1.105, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 173, "tokens_per_second_per_gpu": 5609.04, "total_tokens": 31942908 }, { "epoch": 2.764940239043825, "grad_norm": 1.109375, "learning_rate": 2.9403432546609043e-08, "loss": 1.1533, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 174, "tokens_per_second_per_gpu": 5753.07, "total_tokens": 32129128 }, { "epoch": 2.7808764940239046, "grad_norm": 1.1328125, "learning_rate": 2.507208781817638e-08, "loss": 1.1074, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 175, "tokens_per_second_per_gpu": 5763.07, "total_tokens": 32313971 }, { "epoch": 2.7968127490039842, "grad_norm": 1.0625, "learning_rate": 2.1081654102351632e-08, "loss": 1.0635, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 176, "tokens_per_second_per_gpu": 5200.76, "total_tokens": 32510274 }, { "epoch": 2.812749003984064, "grad_norm": 1.078125, "learning_rate": 1.7433526766711725e-08, "loss": 1.0547, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 177, "tokens_per_second_per_gpu": 5511.76, "total_tokens": 32701180 }, { "epoch": 2.8286852589641436, "grad_norm": 1.0546875, "learning_rate": 1.4128981481764113e-08, "loss": 1.0728, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 178, "tokens_per_second_per_gpu": 5453.23, "total_tokens": 32887252 }, { "epoch": 2.8446215139442232, "grad_norm": 1.0, "learning_rate": 1.1169173774871477e-08, "loss": 1.0454, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 179, "tokens_per_second_per_gpu": 5831.37, "total_tokens": 33078886 }, { "epoch": 2.860557768924303, "grad_norm": 1.109375, "learning_rate": 8.555138626189618e-09, "loss": 1.1182, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 180, "tokens_per_second_per_gpu": 5712.45, "total_tokens": 33264882 }, { "epoch": 2.8764940239043826, "grad_norm": 1.09375, "learning_rate": 6.2877901067573955e-09, "loss": 1.1357, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 181, "tokens_per_second_per_gpu": 5709.15, "total_tokens": 33450597 }, { "epoch": 2.8924302788844622, "grad_norm": 1.1015625, "learning_rate": 4.367921058866186e-09, "loss": 1.085, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 182, "tokens_per_second_per_gpu": 5923.67, "total_tokens": 33636331 }, { "epoch": 2.908366533864542, "grad_norm": 1.0859375, "learning_rate": 2.7962028188198706e-09, "loss": 1.1416, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 183, "tokens_per_second_per_gpu": 5643.13, "total_tokens": 33828521 }, { "epoch": 2.9243027888446216, "grad_norm": 1.0625, "learning_rate": 1.5731849821833953e-09, "loss": 1.1255, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 184, "tokens_per_second_per_gpu": 5703.08, "total_tokens": 34017506 }, { "epoch": 2.9402390438247012, "grad_norm": 1.1328125, "learning_rate": 6.992952116013917e-10, "loss": 1.1094, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 185, "tokens_per_second_per_gpu": 5872.3, "total_tokens": 34203052 }, { "epoch": 2.956175298804781, "grad_norm": 1.046875, "learning_rate": 1.7483908725357543e-10, "loss": 1.1221, "memory/device_reserved (GiB)": 77.63, "memory/max_active (GiB)": 65.77, "memory/max_allocated (GiB)": 65.77, "step": 186, "tokens_per_second_per_gpu": 5556.53, "total_tokens": 34389121 } ], "logging_steps": 1, "max_steps": 186, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3756016575819284e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }