Spaces:
Sleeping
Sleeping
Commit ·
b4eb298
1
Parent(s): c36b946
update
Browse files
examples/tutorials/by_deepspeed/step_2_train_model.py
CHANGED
|
@@ -56,8 +56,8 @@ def get_args():
|
|
| 56 |
|
| 57 |
parser.add_argument(
|
| 58 |
"--num_workers",
|
| 59 |
-
default=None if platform.system()
|
| 60 |
-
type=
|
| 61 |
),
|
| 62 |
args = parser.parse_args()
|
| 63 |
return args
|
|
@@ -75,13 +75,15 @@ def main():
|
|
| 75 |
trust_remote_code=True,
|
| 76 |
cache_dir=args.model_cache_dir,
|
| 77 |
)
|
| 78 |
-
|
|
|
|
| 79 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 80 |
pretrained_model_name_or_path=args.model_name,
|
| 81 |
trust_remote_code=True,
|
| 82 |
cache_dir=args.model_cache_dir,
|
| 83 |
)
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
def format_func(example):
|
| 87 |
formated_text = tokenizer.apply_chat_template(
|
|
@@ -102,7 +104,8 @@ def main():
|
|
| 102 |
streaming=args.dataset_streaming,
|
| 103 |
)
|
| 104 |
dataset = dataset_dict["train"]
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
if args.dataset_streaming:
|
| 108 |
valid_dataset = dataset.take(args.valid_dataset_size)
|
|
@@ -129,8 +132,8 @@ def main():
|
|
| 129 |
output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
|
| 130 |
dataset_text_field="formated_text",
|
| 131 |
deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
|
| 132 |
-
per_device_train_batch_size=
|
| 133 |
-
gradient_accumulation_steps=
|
| 134 |
warmup_steps=100,
|
| 135 |
num_train_epochs=1, # Set this for 1 full training run.
|
| 136 |
# max_steps = 30,
|
|
@@ -150,8 +153,9 @@ def main():
|
|
| 150 |
gpu_stats = torch.cuda.get_device_properties(0)
|
| 151 |
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
|
| 152 |
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
|
| 156 |
trainer_stats = trainer.train()
|
| 157 |
|
|
@@ -160,19 +164,21 @@ def main():
|
|
| 160 |
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
|
| 161 |
used_percentage = round(used_memory / max_memory * 100, 3)
|
| 162 |
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
| 176 |
return
|
| 177 |
|
| 178 |
|
|
|
|
| 56 |
|
| 57 |
parser.add_argument(
|
| 58 |
"--num_workers",
|
| 59 |
+
default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
|
| 60 |
+
type=int
|
| 61 |
),
|
| 62 |
args = parser.parse_args()
|
| 63 |
return args
|
|
|
|
| 75 |
trust_remote_code=True,
|
| 76 |
cache_dir=args.model_cache_dir,
|
| 77 |
)
|
| 78 |
+
if args.local_rank in (-1, 0):
|
| 79 |
+
print(model)
|
| 80 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 81 |
pretrained_model_name_or_path=args.model_name,
|
| 82 |
trust_remote_code=True,
|
| 83 |
cache_dir=args.model_cache_dir,
|
| 84 |
)
|
| 85 |
+
if args.local_rank in (-1, 0):
|
| 86 |
+
print(tokenizer)
|
| 87 |
|
| 88 |
def format_func(example):
|
| 89 |
formated_text = tokenizer.apply_chat_template(
|
|
|
|
| 104 |
streaming=args.dataset_streaming,
|
| 105 |
)
|
| 106 |
dataset = dataset_dict["train"]
|
| 107 |
+
if args.local_rank in (-1, 0):
|
| 108 |
+
print(dataset)
|
| 109 |
|
| 110 |
if args.dataset_streaming:
|
| 111 |
valid_dataset = dataset.take(args.valid_dataset_size)
|
|
|
|
| 132 |
output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
|
| 133 |
dataset_text_field="formated_text",
|
| 134 |
deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
|
| 135 |
+
per_device_train_batch_size=2,
|
| 136 |
+
gradient_accumulation_steps=8,
|
| 137 |
warmup_steps=100,
|
| 138 |
num_train_epochs=1, # Set this for 1 full training run.
|
| 139 |
# max_steps = 30,
|
|
|
|
| 153 |
gpu_stats = torch.cuda.get_device_properties(0)
|
| 154 |
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
|
| 155 |
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
| 156 |
+
if args.local_rank in (-1, 0):
|
| 157 |
+
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
|
| 158 |
+
print(f"{start_gpu_memory} GB of memory reserved.")
|
| 159 |
|
| 160 |
trainer_stats = trainer.train()
|
| 161 |
|
|
|
|
| 164 |
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
|
| 165 |
used_percentage = round(used_memory / max_memory * 100, 3)
|
| 166 |
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
|
| 167 |
+
if args.local_rank in (-1, 0):
|
| 168 |
+
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
|
| 169 |
+
print(
|
| 170 |
+
f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
|
| 171 |
+
)
|
| 172 |
+
print(f"Peak reserved memory = {used_memory} GB.")
|
| 173 |
+
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
|
| 174 |
+
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
|
| 175 |
+
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
|
| 176 |
+
|
| 177 |
+
if args.local_rank in (-1, 0):
|
| 178 |
+
trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
|
| 179 |
+
trained_models_dir.mkdir(parents=True, exist_ok=True)
|
| 180 |
+
trainer.model.save_pretrained(trained_models_dir.as_posix())
|
| 181 |
+
tokenizer.save_pretrained(trained_models_dir.as_posix())
|
| 182 |
return
|
| 183 |
|
| 184 |
|