{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.956175298804781, "eval_steps": 500, "global_step": 186, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01593625498007968, "grad_norm": 4.5625, "learning_rate": 0.0, "loss": 1.4053, "memory/device_reserved (GiB)": 61.34, "memory/max_active (GiB)": 49.6, "memory/max_allocated (GiB)": 49.6, "step": 1, "tokens_per_second_per_gpu": 4706.79, "total_tokens": 180518 }, { "epoch": 0.03187250996015936, "grad_norm": 4.34375, "learning_rate": 1.111111111111111e-06, "loss": 1.3369, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 2, "tokens_per_second_per_gpu": 5826.72, "total_tokens": 363757 }, { "epoch": 0.04780876494023904, "grad_norm": 4.15625, "learning_rate": 2.222222222222222e-06, "loss": 1.3623, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 3, "tokens_per_second_per_gpu": 5939.96, "total_tokens": 558043 }, { "epoch": 0.06374501992031872, "grad_norm": 4.34375, "learning_rate": 3.3333333333333333e-06, "loss": 1.3643, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 4, "tokens_per_second_per_gpu": 5941.47, "total_tokens": 743276 }, { "epoch": 0.0796812749003984, "grad_norm": 3.90625, "learning_rate": 4.444444444444444e-06, "loss": 1.2998, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 5, "tokens_per_second_per_gpu": 5380.38, "total_tokens": 929761 }, { "epoch": 0.09561752988047809, "grad_norm": 3.546875, "learning_rate": 5.555555555555557e-06, "loss": 1.3018, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 6, "tokens_per_second_per_gpu": 5949.69, "total_tokens": 1118316 }, { "epoch": 0.11155378486055777, "grad_norm": 3.171875, "learning_rate": 6.666666666666667e-06, "loss": 1.2793, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 7, "tokens_per_second_per_gpu": 5785.23, "total_tokens": 1301615 }, { "epoch": 0.12749003984063745, "grad_norm": 2.96875, "learning_rate": 7.77777777777778e-06, "loss": 1.3115, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 8, "tokens_per_second_per_gpu": 5941.21, "total_tokens": 1490474 }, { "epoch": 0.14342629482071714, "grad_norm": 2.296875, "learning_rate": 8.888888888888888e-06, "loss": 1.2588, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 9, "tokens_per_second_per_gpu": 5534.49, "total_tokens": 1667576 }, { "epoch": 0.1593625498007968, "grad_norm": 1.5625, "learning_rate": 1e-05, "loss": 1.1992, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 10, "tokens_per_second_per_gpu": 6154.87, "total_tokens": 1857807 }, { "epoch": 0.1752988047808765, "grad_norm": 1.0703125, "learning_rate": 1.1111111111111113e-05, "loss": 1.1436, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 11, "tokens_per_second_per_gpu": 5715.6, "total_tokens": 2041489 }, { "epoch": 0.19123505976095617, "grad_norm": 0.9765625, "learning_rate": 1.2222222222222224e-05, "loss": 1.2402, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 12, "tokens_per_second_per_gpu": 5749.37, "total_tokens": 2216014 }, { "epoch": 0.20717131474103587, "grad_norm": 0.9609375, "learning_rate": 1.3333333333333333e-05, "loss": 1.2051, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 13, "tokens_per_second_per_gpu": 5748.94, "total_tokens": 2397131 }, { "epoch": 0.22310756972111553, "grad_norm": 0.921875, "learning_rate": 1.4444444444444446e-05, "loss": 1.1211, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 14, "tokens_per_second_per_gpu": 6171.1, "total_tokens": 2590472 }, { "epoch": 0.23904382470119523, "grad_norm": 0.8984375, "learning_rate": 1.555555555555556e-05, "loss": 1.1777, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 15, "tokens_per_second_per_gpu": 6160.5, "total_tokens": 2780711 }, { "epoch": 0.2549800796812749, "grad_norm": 0.80078125, "learning_rate": 1.6666666666666667e-05, "loss": 1.1025, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 16, "tokens_per_second_per_gpu": 5706.58, "total_tokens": 2968588 }, { "epoch": 0.27091633466135456, "grad_norm": 0.65234375, "learning_rate": 1.7777777777777777e-05, "loss": 1.2041, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 17, "tokens_per_second_per_gpu": 5569.19, "total_tokens": 3148691 }, { "epoch": 0.2868525896414343, "grad_norm": 0.59765625, "learning_rate": 1.888888888888889e-05, "loss": 1.168, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 18, "tokens_per_second_per_gpu": 5894.91, "total_tokens": 3332398 }, { "epoch": 0.30278884462151395, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 1.0977, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 19, "tokens_per_second_per_gpu": 6092.09, "total_tokens": 3526610 }, { "epoch": 0.3187250996015936, "grad_norm": 0.54296875, "learning_rate": 1.9998251609127465e-05, "loss": 1.1372, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 20, "tokens_per_second_per_gpu": 5971.19, "total_tokens": 3711042 }, { "epoch": 0.3346613545816733, "grad_norm": 0.5078125, "learning_rate": 1.9993007047883988e-05, "loss": 1.0659, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 21, "tokens_per_second_per_gpu": 5750.71, "total_tokens": 3890841 }, { "epoch": 0.350597609561753, "grad_norm": 0.50390625, "learning_rate": 1.998426815017817e-05, "loss": 1.124, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 22, "tokens_per_second_per_gpu": 5968.21, "total_tokens": 4074024 }, { "epoch": 0.3665338645418327, "grad_norm": 0.4609375, "learning_rate": 1.9972037971811802e-05, "loss": 1.064, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 23, "tokens_per_second_per_gpu": 5672.93, "total_tokens": 4261426 }, { "epoch": 0.38247011952191234, "grad_norm": 0.458984375, "learning_rate": 1.9956320789411338e-05, "loss": 1.0977, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 24, "tokens_per_second_per_gpu": 5947.63, "total_tokens": 4448221 }, { "epoch": 0.398406374501992, "grad_norm": 0.400390625, "learning_rate": 1.9937122098932428e-05, "loss": 0.9438, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 25, "tokens_per_second_per_gpu": 5830.3, "total_tokens": 4643418 }, { "epoch": 0.41434262948207173, "grad_norm": 0.451171875, "learning_rate": 1.9914448613738107e-05, "loss": 1.0786, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 26, "tokens_per_second_per_gpu": 5753.23, "total_tokens": 4826564 }, { "epoch": 0.4302788844621514, "grad_norm": 0.41796875, "learning_rate": 1.9888308262251286e-05, "loss": 1.1084, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 27, "tokens_per_second_per_gpu": 5786.21, "total_tokens": 5008617 }, { "epoch": 0.44621513944223107, "grad_norm": 0.392578125, "learning_rate": 1.985871018518236e-05, "loss": 1.0488, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 28, "tokens_per_second_per_gpu": 5935.98, "total_tokens": 5194550 }, { "epoch": 0.46215139442231074, "grad_norm": 0.37109375, "learning_rate": 1.9825664732332886e-05, "loss": 1.0894, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 29, "tokens_per_second_per_gpu": 5927.93, "total_tokens": 5380376 }, { "epoch": 0.47808764940239046, "grad_norm": 0.35546875, "learning_rate": 1.9789183458976485e-05, "loss": 1.0869, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 30, "tokens_per_second_per_gpu": 6097.05, "total_tokens": 5567310 }, { "epoch": 0.4940239043824701, "grad_norm": 0.37109375, "learning_rate": 1.9749279121818235e-05, "loss": 1.0181, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 31, "tokens_per_second_per_gpu": 6055.18, "total_tokens": 5750982 }, { "epoch": 0.5099601593625498, "grad_norm": 0.380859375, "learning_rate": 1.970596567453391e-05, "loss": 1.0552, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 32, "tokens_per_second_per_gpu": 6008.39, "total_tokens": 5937332 }, { "epoch": 0.5258964143426295, "grad_norm": 0.376953125, "learning_rate": 1.9659258262890683e-05, "loss": 1.0439, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 33, "tokens_per_second_per_gpu": 6030.07, "total_tokens": 6120851 }, { "epoch": 0.5418326693227091, "grad_norm": 0.369140625, "learning_rate": 1.9609173219450998e-05, "loss": 1.0835, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 34, "tokens_per_second_per_gpu": 5726.88, "total_tokens": 6297402 }, { "epoch": 0.5577689243027888, "grad_norm": 0.396484375, "learning_rate": 1.955572805786141e-05, "loss": 1.1074, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 35, "tokens_per_second_per_gpu": 5816.12, "total_tokens": 6480316 }, { "epoch": 0.5737051792828686, "grad_norm": 0.357421875, "learning_rate": 1.9498941466728462e-05, "loss": 1.0391, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 36, "tokens_per_second_per_gpu": 5765.66, "total_tokens": 6665052 }, { "epoch": 0.5896414342629482, "grad_norm": 0.345703125, "learning_rate": 1.9438833303083677e-05, "loss": 1.0371, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 37, "tokens_per_second_per_gpu": 5749.09, "total_tokens": 6849283 }, { "epoch": 0.6055776892430279, "grad_norm": 0.34375, "learning_rate": 1.9375424585439994e-05, "loss": 1.0503, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 38, "tokens_per_second_per_gpu": 5927.3, "total_tokens": 7032513 }, { "epoch": 0.6215139442231076, "grad_norm": 0.330078125, "learning_rate": 1.9308737486442045e-05, "loss": 1.0479, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 39, "tokens_per_second_per_gpu": 5905.92, "total_tokens": 7214561 }, { "epoch": 0.6374501992031872, "grad_norm": 0.3359375, "learning_rate": 1.9238795325112867e-05, "loss": 1.0098, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 40, "tokens_per_second_per_gpu": 5853.3, "total_tokens": 7400854 }, { "epoch": 0.6533864541832669, "grad_norm": 0.36328125, "learning_rate": 1.9165622558699763e-05, "loss": 1.106, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 41, "tokens_per_second_per_gpu": 5556.45, "total_tokens": 7577263 }, { "epoch": 0.6693227091633466, "grad_norm": 0.396484375, "learning_rate": 1.908924477412211e-05, "loss": 1.0498, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 42, "tokens_per_second_per_gpu": 5928.26, "total_tokens": 7763586 }, { "epoch": 0.6852589641434262, "grad_norm": 0.32421875, "learning_rate": 1.900968867902419e-05, "loss": 1.0171, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 43, "tokens_per_second_per_gpu": 6102.36, "total_tokens": 7953595 }, { "epoch": 0.701195219123506, "grad_norm": 0.36328125, "learning_rate": 1.8926982092436117e-05, "loss": 1.0688, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 44, "tokens_per_second_per_gpu": 6058.46, "total_tokens": 8135608 }, { "epoch": 0.7171314741035857, "grad_norm": 0.359375, "learning_rate": 1.8841153935046098e-05, "loss": 0.978, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 45, "tokens_per_second_per_gpu": 5806.45, "total_tokens": 8328038 }, { "epoch": 0.7330677290836654, "grad_norm": 0.333984375, "learning_rate": 1.8752234219087538e-05, "loss": 1.0435, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 46, "tokens_per_second_per_gpu": 5940.85, "total_tokens": 8517629 }, { "epoch": 0.749003984063745, "grad_norm": 0.400390625, "learning_rate": 1.866025403784439e-05, "loss": 1.0317, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 47, "tokens_per_second_per_gpu": 5929.8, "total_tokens": 8700619 }, { "epoch": 0.7649402390438247, "grad_norm": 0.328125, "learning_rate": 1.8565245554778516e-05, "loss": 0.9819, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 48, "tokens_per_second_per_gpu": 5991.89, "total_tokens": 8886726 }, { "epoch": 0.7808764940239044, "grad_norm": 0.34765625, "learning_rate": 1.8467241992282842e-05, "loss": 1.0396, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 49, "tokens_per_second_per_gpu": 5941.59, "total_tokens": 9074210 }, { "epoch": 0.796812749003984, "grad_norm": 0.345703125, "learning_rate": 1.83662776200642e-05, "loss": 1.0703, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 50, "tokens_per_second_per_gpu": 5856.71, "total_tokens": 9253264 }, { "epoch": 0.8127490039840638, "grad_norm": 0.33203125, "learning_rate": 1.826238774315995e-05, "loss": 1.0078, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 51, "tokens_per_second_per_gpu": 5883.97, "total_tokens": 9437019 }, { "epoch": 0.8286852589641435, "grad_norm": 0.326171875, "learning_rate": 1.8155608689592604e-05, "loss": 1.0352, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 52, "tokens_per_second_per_gpu": 6284.45, "total_tokens": 9624777 }, { "epoch": 0.8446215139442231, "grad_norm": 0.34375, "learning_rate": 1.8045977797666685e-05, "loss": 1.0015, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 53, "tokens_per_second_per_gpu": 6227.78, "total_tokens": 9816093 }, { "epoch": 0.8605577689243028, "grad_norm": 0.32421875, "learning_rate": 1.7933533402912354e-05, "loss": 1.0205, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 54, "tokens_per_second_per_gpu": 5562.75, "total_tokens": 10003875 }, { "epoch": 0.8764940239043825, "grad_norm": 0.3125, "learning_rate": 1.78183148246803e-05, "loss": 0.9985, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 55, "tokens_per_second_per_gpu": 6029.45, "total_tokens": 10195261 }, { "epoch": 0.8924302788844621, "grad_norm": 0.328125, "learning_rate": 1.7700362352392632e-05, "loss": 1.0151, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 56, "tokens_per_second_per_gpu": 5824.93, "total_tokens": 10378607 }, { "epoch": 0.9083665338645418, "grad_norm": 0.345703125, "learning_rate": 1.757971723145453e-05, "loss": 1.0737, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 57, "tokens_per_second_per_gpu": 5758.69, "total_tokens": 10565102 }, { "epoch": 0.9243027888446215, "grad_norm": 0.330078125, "learning_rate": 1.7456421648831658e-05, "loss": 1.0444, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 58, "tokens_per_second_per_gpu": 5699.09, "total_tokens": 10743645 }, { "epoch": 0.9402390438247012, "grad_norm": 0.337890625, "learning_rate": 1.7330518718298263e-05, "loss": 0.998, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 59, "tokens_per_second_per_gpu": 5772.72, "total_tokens": 10926325 }, { "epoch": 0.9561752988047809, "grad_norm": 0.361328125, "learning_rate": 1.7202052465361268e-05, "loss": 1.0659, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 60, "tokens_per_second_per_gpu": 5781.99, "total_tokens": 11105741 }, { "epoch": 0.9721115537848606, "grad_norm": 0.326171875, "learning_rate": 1.7071067811865477e-05, "loss": 1.0024, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 61, "tokens_per_second_per_gpu": 5416.25, "total_tokens": 11283752 }, { "epoch": 0.9880478087649402, "grad_norm": 0.314453125, "learning_rate": 1.693761056028542e-05, "loss": 0.9429, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 62, "tokens_per_second_per_gpu": 6080.81, "total_tokens": 11476891 }, { "epoch": 1.0, "grad_norm": 1.03125, "learning_rate": 1.6801727377709195e-05, "loss": 0.8979, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 63, "tokens_per_second_per_gpu": 4586.33, "total_tokens": 11600559 }, { "epoch": 1.0159362549800797, "grad_norm": 0.33203125, "learning_rate": 1.6663465779520042e-05, "loss": 1.0391, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 64, "tokens_per_second_per_gpu": 5765.65, "total_tokens": 11781077 }, { "epoch": 1.0318725099601593, "grad_norm": 0.328125, "learning_rate": 1.6522874112781213e-05, "loss": 0.9893, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 65, "tokens_per_second_per_gpu": 5812.65, "total_tokens": 11964316 }, { "epoch": 1.047808764940239, "grad_norm": 0.33203125, "learning_rate": 1.6380001539330088e-05, "loss": 1.019, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 66, "tokens_per_second_per_gpu": 5958.35, "total_tokens": 12158602 }, { "epoch": 1.0637450199203187, "grad_norm": 0.318359375, "learning_rate": 1.6234898018587336e-05, "loss": 1.0098, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 67, "tokens_per_second_per_gpu": 5947.9, "total_tokens": 12343835 }, { "epoch": 1.0796812749003983, "grad_norm": 0.31640625, "learning_rate": 1.608761429008721e-05, "loss": 0.959, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 68, "tokens_per_second_per_gpu": 5410.16, "total_tokens": 12530320 }, { "epoch": 1.095617529880478, "grad_norm": 0.337890625, "learning_rate": 1.5938201855735017e-05, "loss": 0.998, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 69, "tokens_per_second_per_gpu": 5950.0, "total_tokens": 12718875 }, { "epoch": 1.1115537848605577, "grad_norm": 0.31640625, "learning_rate": 1.578671296179806e-05, "loss": 0.9834, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 70, "tokens_per_second_per_gpu": 5806.97, "total_tokens": 12902174 }, { "epoch": 1.1274900398406373, "grad_norm": 0.322265625, "learning_rate": 1.563320058063622e-05, "loss": 1.02, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 71, "tokens_per_second_per_gpu": 5964.93, "total_tokens": 13091033 }, { "epoch": 1.1434262948207172, "grad_norm": 0.328125, "learning_rate": 1.5477718392178716e-05, "loss": 1.001, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 72, "tokens_per_second_per_gpu": 5543.95, "total_tokens": 13268135 }, { "epoch": 1.159362549800797, "grad_norm": 0.322265625, "learning_rate": 1.5320320765153367e-05, "loss": 0.9868, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 73, "tokens_per_second_per_gpu": 6161.46, "total_tokens": 13458366 }, { "epoch": 1.1752988047808766, "grad_norm": 0.33203125, "learning_rate": 1.5161062738075068e-05, "loss": 0.9404, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 74, "tokens_per_second_per_gpu": 5781.1, "total_tokens": 13642048 }, { "epoch": 1.1912350597609562, "grad_norm": 0.423828125, "learning_rate": 1.5000000000000002e-05, "loss": 1.0273, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 75, "tokens_per_second_per_gpu": 5754.51, "total_tokens": 13816573 }, { "epoch": 1.207171314741036, "grad_norm": 0.376953125, "learning_rate": 1.4837188871052399e-05, "loss": 0.999, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 76, "tokens_per_second_per_gpu": 5745.16, "total_tokens": 13997690 }, { "epoch": 1.2231075697211156, "grad_norm": 0.30859375, "learning_rate": 1.4672686282730622e-05, "loss": 0.9365, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 77, "tokens_per_second_per_gpu": 6187.18, "total_tokens": 14191031 }, { "epoch": 1.2390438247011952, "grad_norm": 0.310546875, "learning_rate": 1.4506549757999456e-05, "loss": 0.9932, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 78, "tokens_per_second_per_gpu": 6189.26, "total_tokens": 14381270 }, { "epoch": 1.254980079681275, "grad_norm": 0.361328125, "learning_rate": 1.4338837391175582e-05, "loss": 0.9253, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 79, "tokens_per_second_per_gpu": 5694.92, "total_tokens": 14569147 }, { "epoch": 1.2709163346613546, "grad_norm": 0.349609375, "learning_rate": 1.4169607827613284e-05, "loss": 1.0249, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 80, "tokens_per_second_per_gpu": 5574.13, "total_tokens": 14749250 }, { "epoch": 1.2868525896414342, "grad_norm": 0.33984375, "learning_rate": 1.3998920243197408e-05, "loss": 1.0044, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 81, "tokens_per_second_per_gpu": 5892.86, "total_tokens": 14932957 }, { "epoch": 1.302788844621514, "grad_norm": 0.31640625, "learning_rate": 1.3826834323650899e-05, "loss": 0.9443, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 82, "tokens_per_second_per_gpu": 6084.92, "total_tokens": 15127169 }, { "epoch": 1.3187250996015936, "grad_norm": 0.328125, "learning_rate": 1.3653410243663953e-05, "loss": 0.9878, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 83, "tokens_per_second_per_gpu": 5984.25, "total_tokens": 15311601 }, { "epoch": 1.3346613545816732, "grad_norm": 0.32421875, "learning_rate": 1.3478708645852272e-05, "loss": 0.9248, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 84, "tokens_per_second_per_gpu": 5744.73, "total_tokens": 15491400 }, { "epoch": 1.3505976095617531, "grad_norm": 0.33203125, "learning_rate": 1.3302790619551673e-05, "loss": 0.9824, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 85, "tokens_per_second_per_gpu": 6009.5, "total_tokens": 15674583 }, { "epoch": 1.3665338645418328, "grad_norm": 0.314453125, "learning_rate": 1.3125717679456447e-05, "loss": 0.9404, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 86, "tokens_per_second_per_gpu": 5690.82, "total_tokens": 15861985 }, { "epoch": 1.3824701195219125, "grad_norm": 0.34765625, "learning_rate": 1.2947551744109044e-05, "loss": 0.9731, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 87, "tokens_per_second_per_gpu": 5962.66, "total_tokens": 16048780 }, { "epoch": 1.3984063745019921, "grad_norm": 0.318359375, "learning_rate": 1.2768355114248493e-05, "loss": 0.8406, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 88, "tokens_per_second_per_gpu": 5796.13, "total_tokens": 16243977 }, { "epoch": 1.4143426294820718, "grad_norm": 0.337890625, "learning_rate": 1.2588190451025209e-05, "loss": 0.9692, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 89, "tokens_per_second_per_gpu": 5748.01, "total_tokens": 16427123 }, { "epoch": 1.4302788844621515, "grad_norm": 0.345703125, "learning_rate": 1.2407120754089733e-05, "loss": 0.998, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 90, "tokens_per_second_per_gpu": 5897.24, "total_tokens": 16609176 }, { "epoch": 1.4462151394422311, "grad_norm": 0.33203125, "learning_rate": 1.2225209339563144e-05, "loss": 0.9507, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 91, "tokens_per_second_per_gpu": 5936.81, "total_tokens": 16795109 }, { "epoch": 1.4621513944223108, "grad_norm": 0.328125, "learning_rate": 1.2042519817896805e-05, "loss": 0.9912, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 92, "tokens_per_second_per_gpu": 5949.43, "total_tokens": 16980935 }, { "epoch": 1.4780876494023905, "grad_norm": 0.333984375, "learning_rate": 1.1859116071629148e-05, "loss": 0.9888, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 93, "tokens_per_second_per_gpu": 6095.07, "total_tokens": 17167869 }, { "epoch": 1.4940239043824701, "grad_norm": 0.322265625, "learning_rate": 1.1675062233047365e-05, "loss": 0.9219, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 94, "tokens_per_second_per_gpu": 6067.77, "total_tokens": 17351541 }, { "epoch": 1.5099601593625498, "grad_norm": 0.3828125, "learning_rate": 1.1490422661761744e-05, "loss": 0.9648, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 95, "tokens_per_second_per_gpu": 6008.67, "total_tokens": 17537891 }, { "epoch": 1.5258964143426295, "grad_norm": 0.328125, "learning_rate": 1.130526192220052e-05, "loss": 0.9556, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 96, "tokens_per_second_per_gpu": 5955.34, "total_tokens": 17721410 }, { "epoch": 1.5418326693227091, "grad_norm": 0.3359375, "learning_rate": 1.1119644761033079e-05, "loss": 0.9951, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 97, "tokens_per_second_per_gpu": 5732.24, "total_tokens": 17897961 }, { "epoch": 1.5577689243027888, "grad_norm": 0.330078125, "learning_rate": 1.0933636084529507e-05, "loss": 1.02, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 98, "tokens_per_second_per_gpu": 5813.99, "total_tokens": 18080875 }, { "epoch": 1.5737051792828685, "grad_norm": 0.330078125, "learning_rate": 1.0747300935864245e-05, "loss": 0.958, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 99, "tokens_per_second_per_gpu": 5769.52, "total_tokens": 18265611 }, { "epoch": 1.5896414342629481, "grad_norm": 0.326171875, "learning_rate": 1.0560704472371919e-05, "loss": 0.9561, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 100, "tokens_per_second_per_gpu": 5726.38, "total_tokens": 18449842 }, { "epoch": 1.6055776892430278, "grad_norm": 0.326171875, "learning_rate": 1.037391194276326e-05, "loss": 0.9707, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 101, "tokens_per_second_per_gpu": 5944.93, "total_tokens": 18633072 }, { "epoch": 1.6215139442231075, "grad_norm": 0.3203125, "learning_rate": 1.0186988664309023e-05, "loss": 0.9707, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 102, "tokens_per_second_per_gpu": 5897.0, "total_tokens": 18815120 }, { "epoch": 1.6374501992031871, "grad_norm": 0.328125, "learning_rate": 1e-05, "loss": 0.9385, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 103, "tokens_per_second_per_gpu": 5841.82, "total_tokens": 19001413 }, { "epoch": 1.6533864541832668, "grad_norm": 0.341796875, "learning_rate": 9.81301133569098e-06, "loss": 1.0303, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 104, "tokens_per_second_per_gpu": 5567.16, "total_tokens": 19177822 }, { "epoch": 1.6693227091633465, "grad_norm": 0.330078125, "learning_rate": 9.626088057236745e-06, "loss": 0.9814, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 105, "tokens_per_second_per_gpu": 5850.82, "total_tokens": 19364145 }, { "epoch": 1.6852589641434261, "grad_norm": 0.31640625, "learning_rate": 9.439295527628083e-06, "loss": 0.9531, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 106, "tokens_per_second_per_gpu": 6142.09, "total_tokens": 19554154 }, { "epoch": 1.701195219123506, "grad_norm": 0.333984375, "learning_rate": 9.252699064135759e-06, "loss": 0.998, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 107, "tokens_per_second_per_gpu": 6059.46, "total_tokens": 19736167 }, { "epoch": 1.7171314741035857, "grad_norm": 0.33203125, "learning_rate": 9.066363915470494e-06, "loss": 0.9204, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 108, "tokens_per_second_per_gpu": 5807.52, "total_tokens": 19928597 }, { "epoch": 1.7330677290836654, "grad_norm": 0.3515625, "learning_rate": 8.880355238966923e-06, "loss": 0.978, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 109, "tokens_per_second_per_gpu": 5987.33, "total_tokens": 20118188 }, { "epoch": 1.749003984063745, "grad_norm": 0.3359375, "learning_rate": 8.694738077799487e-06, "loss": 0.9702, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 110, "tokens_per_second_per_gpu": 5888.18, "total_tokens": 20301178 }, { "epoch": 1.7649402390438247, "grad_norm": 0.357421875, "learning_rate": 8.509577338238255e-06, "loss": 0.9253, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 111, "tokens_per_second_per_gpu": 5972.68, "total_tokens": 20487285 }, { "epoch": 1.7808764940239044, "grad_norm": 0.337890625, "learning_rate": 8.324937766952638e-06, "loss": 0.9814, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 112, "tokens_per_second_per_gpu": 5932.16, "total_tokens": 20674769 }, { "epoch": 1.796812749003984, "grad_norm": 0.341796875, "learning_rate": 8.140883928370855e-06, "loss": 1.0088, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 113, "tokens_per_second_per_gpu": 5830.81, "total_tokens": 20853823 }, { "epoch": 1.812749003984064, "grad_norm": 0.322265625, "learning_rate": 7.957480182103198e-06, "loss": 0.9487, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 114, "tokens_per_second_per_gpu": 5865.22, "total_tokens": 21037578 }, { "epoch": 1.8286852589641436, "grad_norm": 0.328125, "learning_rate": 7.774790660436857e-06, "loss": 0.9819, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 115, "tokens_per_second_per_gpu": 6252.84, "total_tokens": 21225336 }, { "epoch": 1.8446215139442232, "grad_norm": 0.33203125, "learning_rate": 7.592879245910273e-06, "loss": 0.9482, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 116, "tokens_per_second_per_gpu": 6223.23, "total_tokens": 21416652 }, { "epoch": 1.860557768924303, "grad_norm": 0.322265625, "learning_rate": 7.411809548974792e-06, "loss": 0.9697, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 117, "tokens_per_second_per_gpu": 5561.76, "total_tokens": 21604434 }, { "epoch": 1.8764940239043826, "grad_norm": 0.30859375, "learning_rate": 7.2316448857515076e-06, "loss": 0.9468, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 118, "tokens_per_second_per_gpu": 6026.88, "total_tokens": 21795820 }, { "epoch": 1.8924302788844622, "grad_norm": 0.32421875, "learning_rate": 7.052448255890958e-06, "loss": 0.9624, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 119, "tokens_per_second_per_gpu": 5817.1, "total_tokens": 21979166 }, { "epoch": 1.908366533864542, "grad_norm": 0.33984375, "learning_rate": 6.874282320543557e-06, "loss": 1.022, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 120, "tokens_per_second_per_gpu": 5653.71, "total_tokens": 22165661 }, { "epoch": 1.9243027888446216, "grad_norm": 0.32421875, "learning_rate": 6.697209380448333e-06, "loss": 0.9961, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 121, "tokens_per_second_per_gpu": 5699.1, "total_tokens": 22344204 }, { "epoch": 1.9402390438247012, "grad_norm": 0.33203125, "learning_rate": 6.521291354147727e-06, "loss": 0.9521, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 122, "tokens_per_second_per_gpu": 5765.79, "total_tokens": 22526884 }, { "epoch": 1.956175298804781, "grad_norm": 0.349609375, "learning_rate": 6.34658975633605e-06, "loss": 1.0171, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 123, "tokens_per_second_per_gpu": 5780.38, "total_tokens": 22706300 }, { "epoch": 1.9721115537848606, "grad_norm": 0.318359375, "learning_rate": 6.173165676349103e-06, "loss": 0.957, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 124, "tokens_per_second_per_gpu": 5399.07, "total_tokens": 22884311 }, { "epoch": 1.9880478087649402, "grad_norm": 0.357421875, "learning_rate": 6.001079756802592e-06, "loss": 0.9028, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 125, "tokens_per_second_per_gpu": 5850.46, "total_tokens": 23077450 }, { "epoch": 2.0, "grad_norm": 0.39453125, "learning_rate": 5.830392172386723e-06, "loss": 0.8589, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 126, "tokens_per_second_per_gpu": 4564.2, "total_tokens": 23201118 }, { "epoch": 2.0159362549800797, "grad_norm": 0.32421875, "learning_rate": 5.66116260882442e-06, "loss": 0.9985, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 127, "tokens_per_second_per_gpu": 5832.26, "total_tokens": 23381636 }, { "epoch": 2.0318725099601593, "grad_norm": 0.328125, "learning_rate": 5.493450242000546e-06, "loss": 0.9521, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 128, "tokens_per_second_per_gpu": 5774.6, "total_tokens": 23564875 }, { "epoch": 2.047808764940239, "grad_norm": 0.328125, "learning_rate": 5.32731371726938e-06, "loss": 0.98, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 129, "tokens_per_second_per_gpu": 5972.38, "total_tokens": 23759161 }, { "epoch": 2.0637450199203187, "grad_norm": 0.328125, "learning_rate": 5.1628111289476025e-06, "loss": 0.9746, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 130, "tokens_per_second_per_gpu": 5919.54, "total_tokens": 23944394 }, { "epoch": 2.0796812749003983, "grad_norm": 0.31640625, "learning_rate": 5.000000000000003e-06, "loss": 0.9229, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 131, "tokens_per_second_per_gpu": 5414.05, "total_tokens": 24130879 }, { "epoch": 2.095617529880478, "grad_norm": 0.33203125, "learning_rate": 4.838937261924933e-06, "loss": 0.9639, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 132, "tokens_per_second_per_gpu": 5968.88, "total_tokens": 24319434 }, { "epoch": 2.1115537848605577, "grad_norm": 0.31640625, "learning_rate": 4.679679234846636e-06, "loss": 0.9502, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 133, "tokens_per_second_per_gpu": 5802.86, "total_tokens": 24502733 }, { "epoch": 2.1274900398406373, "grad_norm": 0.318359375, "learning_rate": 4.522281607821288e-06, "loss": 0.9854, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 134, "tokens_per_second_per_gpu": 5970.96, "total_tokens": 24691592 }, { "epoch": 2.143426294820717, "grad_norm": 0.373046875, "learning_rate": 4.3667994193637794e-06, "loss": 0.9683, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 135, "tokens_per_second_per_gpu": 5528.1, "total_tokens": 24868694 }, { "epoch": 2.1593625498007967, "grad_norm": 0.318359375, "learning_rate": 4.213287038201943e-06, "loss": 0.9561, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 136, "tokens_per_second_per_gpu": 6105.7, "total_tokens": 25058925 }, { "epoch": 2.1752988047808763, "grad_norm": 0.322265625, "learning_rate": 4.061798144264986e-06, "loss": 0.9116, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 137, "tokens_per_second_per_gpu": 5771.48, "total_tokens": 25242607 }, { "epoch": 2.191235059760956, "grad_norm": 0.3359375, "learning_rate": 3.912385709912794e-06, "loss": 0.9966, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 138, "tokens_per_second_per_gpu": 5723.02, "total_tokens": 25417132 }, { "epoch": 2.2071713147410357, "grad_norm": 0.318359375, "learning_rate": 3.7651019814126656e-06, "loss": 0.9712, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 139, "tokens_per_second_per_gpu": 5739.14, "total_tokens": 25598249 }, { "epoch": 2.2231075697211153, "grad_norm": 0.306640625, "learning_rate": 3.619998460669916e-06, "loss": 0.9106, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 140, "tokens_per_second_per_gpu": 6168.5, "total_tokens": 25791590 }, { "epoch": 2.239043824701195, "grad_norm": 0.31640625, "learning_rate": 3.4771258872187917e-06, "loss": 0.9673, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 141, "tokens_per_second_per_gpu": 6156.05, "total_tokens": 25981829 }, { "epoch": 2.2549800796812747, "grad_norm": 0.33203125, "learning_rate": 3.3365342204799613e-06, "loss": 0.9019, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 142, "tokens_per_second_per_gpu": 5766.33, "total_tokens": 26169706 }, { "epoch": 2.2709163346613543, "grad_norm": 0.50390625, "learning_rate": 3.1982726222908046e-06, "loss": 0.9995, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 143, "tokens_per_second_per_gpu": 5566.19, "total_tokens": 26349809 }, { "epoch": 2.2868525896414345, "grad_norm": 0.359375, "learning_rate": 3.0623894397145837e-06, "loss": 0.9805, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 144, "tokens_per_second_per_gpu": 5897.52, "total_tokens": 26533516 }, { "epoch": 2.302788844621514, "grad_norm": 0.375, "learning_rate": 2.9289321881345257e-06, "loss": 0.9219, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 145, "tokens_per_second_per_gpu": 6065.68, "total_tokens": 26727728 }, { "epoch": 2.318725099601594, "grad_norm": 0.3359375, "learning_rate": 2.7979475346387363e-06, "loss": 0.9639, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 146, "tokens_per_second_per_gpu": 5976.74, "total_tokens": 26912160 }, { "epoch": 2.3346613545816735, "grad_norm": 0.34765625, "learning_rate": 2.669481281701739e-06, "loss": 0.9038, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 147, "tokens_per_second_per_gpu": 5726.82, "total_tokens": 27091959 }, { "epoch": 2.350597609561753, "grad_norm": 0.341796875, "learning_rate": 2.5435783511683444e-06, "loss": 0.9614, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 148, "tokens_per_second_per_gpu": 5973.36, "total_tokens": 27275142 }, { "epoch": 2.366533864541833, "grad_norm": 0.33203125, "learning_rate": 2.420282768545469e-06, "loss": 0.9219, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 149, "tokens_per_second_per_gpu": 5654.39, "total_tokens": 27462544 }, { "epoch": 2.3824701195219125, "grad_norm": 0.322265625, "learning_rate": 2.2996376476073724e-06, "loss": 0.9526, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 150, "tokens_per_second_per_gpu": 5952.85, "total_tokens": 27649339 }, { "epoch": 2.398406374501992, "grad_norm": 0.3203125, "learning_rate": 2.1816851753197023e-06, "loss": 0.8235, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 151, "tokens_per_second_per_gpu": 5837.21, "total_tokens": 27844536 }, { "epoch": 2.414342629482072, "grad_norm": 0.333984375, "learning_rate": 2.0664665970876496e-06, "loss": 0.9521, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 152, "tokens_per_second_per_gpu": 5754.33, "total_tokens": 28027682 }, { "epoch": 2.4302788844621515, "grad_norm": 0.326171875, "learning_rate": 1.9540222023333165e-06, "loss": 0.9805, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 153, "tokens_per_second_per_gpu": 5902.39, "total_tokens": 28209735 }, { "epoch": 2.446215139442231, "grad_norm": 0.32421875, "learning_rate": 1.8443913104073984e-06, "loss": 0.9321, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 154, "tokens_per_second_per_gpu": 5930.45, "total_tokens": 28395668 }, { "epoch": 2.462151394422311, "grad_norm": 0.3203125, "learning_rate": 1.7376122568400533e-06, "loss": 0.9756, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 155, "tokens_per_second_per_gpu": 5945.66, "total_tokens": 28581494 }, { "epoch": 2.4780876494023905, "grad_norm": 0.322265625, "learning_rate": 1.6337223799358025e-06, "loss": 0.9736, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 156, "tokens_per_second_per_gpu": 6107.11, "total_tokens": 28768428 }, { "epoch": 2.49402390438247, "grad_norm": 0.31640625, "learning_rate": 1.5327580077171589e-06, "loss": 0.9067, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 157, "tokens_per_second_per_gpu": 6059.61, "total_tokens": 28952100 }, { "epoch": 2.50996015936255, "grad_norm": 0.326171875, "learning_rate": 1.4347544452214869e-06, "loss": 0.9512, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 158, "tokens_per_second_per_gpu": 6010.47, "total_tokens": 29138450 }, { "epoch": 2.5258964143426295, "grad_norm": 0.376953125, "learning_rate": 1.339745962155613e-06, "loss": 0.9409, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 159, "tokens_per_second_per_gpu": 6038.89, "total_tokens": 29321969 }, { "epoch": 2.541832669322709, "grad_norm": 0.330078125, "learning_rate": 1.2477657809124632e-06, "loss": 0.9824, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 160, "tokens_per_second_per_gpu": 5740.71, "total_tokens": 29498520 }, { "epoch": 2.557768924302789, "grad_norm": 0.328125, "learning_rate": 1.1588460649539036e-06, "loss": 1.0068, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 161, "tokens_per_second_per_gpu": 5804.04, "total_tokens": 29681434 }, { "epoch": 2.5737051792828685, "grad_norm": 0.337890625, "learning_rate": 1.073017907563887e-06, "loss": 0.9453, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 162, "tokens_per_second_per_gpu": 5741.44, "total_tokens": 29866170 }, { "epoch": 2.589641434262948, "grad_norm": 0.3203125, "learning_rate": 9.903113209758098e-07, "loss": 0.9443, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 163, "tokens_per_second_per_gpu": 5756.27, "total_tokens": 30050401 }, { "epoch": 2.605577689243028, "grad_norm": 0.32421875, "learning_rate": 9.107552258778907e-07, "loss": 0.9585, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 164, "tokens_per_second_per_gpu": 5959.88, "total_tokens": 30233631 }, { "epoch": 2.6215139442231075, "grad_norm": 0.3203125, "learning_rate": 8.343774413002382e-07, "loss": 0.9604, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 165, "tokens_per_second_per_gpu": 5899.57, "total_tokens": 30415679 }, { "epoch": 2.637450199203187, "grad_norm": 0.326171875, "learning_rate": 7.612046748871327e-07, "loss": 0.9277, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 166, "tokens_per_second_per_gpu": 5845.3, "total_tokens": 30601972 }, { "epoch": 2.653386454183267, "grad_norm": 0.33984375, "learning_rate": 6.912625135579587e-07, "loss": 1.022, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 167, "tokens_per_second_per_gpu": 5574.11, "total_tokens": 30778381 }, { "epoch": 2.6693227091633465, "grad_norm": 0.31640625, "learning_rate": 6.245754145600091e-07, "loss": 0.9707, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 168, "tokens_per_second_per_gpu": 5930.11, "total_tokens": 30964704 }, { "epoch": 2.685258964143426, "grad_norm": 0.318359375, "learning_rate": 5.611666969163243e-07, "loss": 0.9448, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 169, "tokens_per_second_per_gpu": 6128.63, "total_tokens": 31154713 }, { "epoch": 2.7011952191235062, "grad_norm": 0.333984375, "learning_rate": 5.010585332715401e-07, "loss": 0.9883, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 170, "tokens_per_second_per_gpu": 6064.77, "total_tokens": 31336726 }, { "epoch": 2.717131474103586, "grad_norm": 0.328125, "learning_rate": 4.4427194213859216e-07, "loss": 0.9131, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 171, "tokens_per_second_per_gpu": 5798.36, "total_tokens": 31529156 }, { "epoch": 2.7330677290836656, "grad_norm": 0.318359375, "learning_rate": 3.908267805490051e-07, "loss": 0.9697, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 172, "tokens_per_second_per_gpu": 5977.29, "total_tokens": 31718747 }, { "epoch": 2.7490039840637452, "grad_norm": 0.328125, "learning_rate": 3.4074173710931804e-07, "loss": 0.9619, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 173, "tokens_per_second_per_gpu": 5934.4, "total_tokens": 31901737 }, { "epoch": 2.764940239043825, "grad_norm": 0.322265625, "learning_rate": 2.940343254660905e-07, "loss": 0.9185, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 174, "tokens_per_second_per_gpu": 5978.89, "total_tokens": 32087844 }, { "epoch": 2.7808764940239046, "grad_norm": 0.328125, "learning_rate": 2.507208781817638e-07, "loss": 0.9751, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 175, "tokens_per_second_per_gpu": 5946.29, "total_tokens": 32275328 }, { "epoch": 2.7968127490039842, "grad_norm": 0.337890625, "learning_rate": 2.1081654102351634e-07, "loss": 1.0015, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 176, "tokens_per_second_per_gpu": 5871.96, "total_tokens": 32454382 }, { "epoch": 2.812749003984064, "grad_norm": 0.318359375, "learning_rate": 1.7433526766711727e-07, "loss": 0.9429, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 177, "tokens_per_second_per_gpu": 5872.24, "total_tokens": 32638137 }, { "epoch": 2.8286852589641436, "grad_norm": 0.3203125, "learning_rate": 1.4128981481764115e-07, "loss": 0.9746, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 178, "tokens_per_second_per_gpu": 6173.8, "total_tokens": 32825895 }, { "epoch": 2.8446215139442232, "grad_norm": 0.322265625, "learning_rate": 1.1169173774871478e-07, "loss": 0.9434, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 179, "tokens_per_second_per_gpu": 6232.85, "total_tokens": 33017211 }, { "epoch": 2.860557768924303, "grad_norm": 0.3203125, "learning_rate": 8.555138626189619e-08, "loss": 0.9644, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 180, "tokens_per_second_per_gpu": 5578.64, "total_tokens": 33204993 }, { "epoch": 2.8764940239043826, "grad_norm": 0.310546875, "learning_rate": 6.287790106757396e-08, "loss": 0.9429, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 181, "tokens_per_second_per_gpu": 6034.66, "total_tokens": 33396379 }, { "epoch": 2.8924302788844622, "grad_norm": 0.318359375, "learning_rate": 4.367921058866187e-08, "loss": 0.959, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 182, "tokens_per_second_per_gpu": 5841.56, "total_tokens": 33579725 }, { "epoch": 2.908366533864542, "grad_norm": 0.33984375, "learning_rate": 2.796202818819871e-08, "loss": 1.0166, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 183, "tokens_per_second_per_gpu": 5736.29, "total_tokens": 33766220 }, { "epoch": 2.9243027888446216, "grad_norm": 0.322265625, "learning_rate": 1.5731849821833955e-08, "loss": 0.9907, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 184, "tokens_per_second_per_gpu": 5704.78, "total_tokens": 33944763 }, { "epoch": 2.9402390438247012, "grad_norm": 0.326171875, "learning_rate": 6.992952116013918e-09, "loss": 0.9478, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 185, "tokens_per_second_per_gpu": 5773.31, "total_tokens": 34127443 }, { "epoch": 2.956175298804781, "grad_norm": 0.349609375, "learning_rate": 1.7483908725357546e-09, "loss": 1.0122, "memory/device_reserved (GiB)": 76.38, "memory/max_active (GiB)": 64.91, "memory/max_allocated (GiB)": 64.91, "step": 186, "tokens_per_second_per_gpu": 5785.44, "total_tokens": 34306859 } ], "logging_steps": 1, "max_steps": 186, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2082055574021734e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }