miyuki2026 committed on
Commit
b4eb298
·
1 Parent(s): c36b946
examples/tutorials/by_deepspeed/step_2_train_model.py CHANGED
@@ -56,8 +56,8 @@ def get_args():
56
 
57
  parser.add_argument(
58
  "--num_workers",
59
- default=None if platform.system() == "Windows" else os.cpu_count() // 2,
60
- type=str
61
  ),
62
  args = parser.parse_args()
63
  return args
@@ -75,13 +75,15 @@ def main():
75
  trust_remote_code=True,
76
  cache_dir=args.model_cache_dir,
77
  )
78
- print(model)
 
79
  tokenizer = AutoTokenizer.from_pretrained(
80
  pretrained_model_name_or_path=args.model_name,
81
  trust_remote_code=True,
82
  cache_dir=args.model_cache_dir,
83
  )
84
- print(tokenizer)
 
85
 
86
  def format_func(example):
87
  formated_text = tokenizer.apply_chat_template(
@@ -102,7 +104,8 @@ def main():
102
  streaming=args.dataset_streaming,
103
  )
104
  dataset = dataset_dict["train"]
105
- print(dataset)
 
106
 
107
  if args.dataset_streaming:
108
  valid_dataset = dataset.take(args.valid_dataset_size)
@@ -129,8 +132,8 @@ def main():
129
  output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
130
  dataset_text_field="formated_text",
131
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
132
- per_device_train_batch_size=1,
133
- gradient_accumulation_steps=64, # Use GA to mimic batch size!
134
  warmup_steps=100,
135
  num_train_epochs=1, # Set this for 1 full training run.
136
  # max_steps = 30,
@@ -150,8 +153,9 @@ def main():
150
  gpu_stats = torch.cuda.get_device_properties(0)
151
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
152
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
153
- print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
154
- print(f"{start_gpu_memory} GB of memory reserved.")
 
155
 
156
  trainer_stats = trainer.train()
157
 
@@ -160,19 +164,21 @@ def main():
160
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
161
  used_percentage = round(used_memory / max_memory * 100, 3)
162
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
163
- print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
164
- print(
165
- f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
166
- )
167
- print(f"Peak reserved memory = {used_memory} GB.")
168
- print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
169
- print(f"Peak reserved memory % of max memory = {used_percentage} %.")
170
- print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
171
-
172
- trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
173
- trained_models_dir.mkdir(parents=True, exist_ok=True)
174
- trainer.model.save_pretrained(trained_models_dir.as_posix())
175
- tokenizer.save_pretrained(trained_models_dir.as_posix())
 
 
176
  return
177
 
178
 
 
56
 
57
  parser.add_argument(
58
  "--num_workers",
59
+ default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
60
+ type=int
61
  ),
62
  args = parser.parse_args()
63
  return args
 
75
  trust_remote_code=True,
76
  cache_dir=args.model_cache_dir,
77
  )
78
+ if args.local_rank in (-1, 0):
79
+ print(model)
80
  tokenizer = AutoTokenizer.from_pretrained(
81
  pretrained_model_name_or_path=args.model_name,
82
  trust_remote_code=True,
83
  cache_dir=args.model_cache_dir,
84
  )
85
+ if args.local_rank in (-1, 0):
86
+ print(tokenizer)
87
 
88
  def format_func(example):
89
  formated_text = tokenizer.apply_chat_template(
 
104
  streaming=args.dataset_streaming,
105
  )
106
  dataset = dataset_dict["train"]
107
+ if args.local_rank in (-1, 0):
108
+ print(dataset)
109
 
110
  if args.dataset_streaming:
111
  valid_dataset = dataset.take(args.valid_dataset_size)
 
132
  output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
133
  dataset_text_field="formated_text",
134
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
135
+ per_device_train_batch_size=2,
136
+ gradient_accumulation_steps=8,
137
  warmup_steps=100,
138
  num_train_epochs=1, # Set this for 1 full training run.
139
  # max_steps = 30,
 
153
  gpu_stats = torch.cuda.get_device_properties(0)
154
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
155
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
156
+ if args.local_rank in (-1, 0):
157
+ print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
158
+ print(f"{start_gpu_memory} GB of memory reserved.")
159
 
160
  trainer_stats = trainer.train()
161
 
 
164
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
165
  used_percentage = round(used_memory / max_memory * 100, 3)
166
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
167
+ if args.local_rank in (-1, 0):
168
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
169
+ print(
170
+ f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
171
+ )
172
+ print(f"Peak reserved memory = {used_memory} GB.")
173
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
174
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
175
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
176
+
177
+ if args.local_rank in (-1, 0):
178
+ trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
179
+ trained_models_dir.mkdir(parents=True, exist_ok=True)
180
+ trainer.model.save_pretrained(trained_models_dir.as_posix())
181
+ tokenizer.save_pretrained(trained_models_dir.as_posix())
182
  return
183
 
184