miyuki2026 committed on
Commit
b4eb298
·
1 Parent(s): c36b946
examples/tutorials/by_deepspeed/step_2_train_model.py CHANGED
@@ -56,8 +56,8 @@ def get_args():
56
 
57
  parser.add_argument(
58
  "--num_workers",
59
- default=None if platform.system() == "Windows" else os.cpu_count() // 2,
60
- type=str
61
  ),
62
  args = parser.parse_args()
63
  return args
@@ -75,13 +75,15 @@ def main():
75
  trust_remote_code=True,
76
  cache_dir=args.model_cache_dir,
77
  )
78
- print(model)
 
79
  tokenizer = AutoTokenizer.from_pretrained(
80
  pretrained_model_name_or_path=args.model_name,
81
  trust_remote_code=True,
82
  cache_dir=args.model_cache_dir,
83
  )
84
- print(tokenizer)
 
85
 
86
  def format_func(example):
87
  formated_text = tokenizer.apply_chat_template(
@@ -102,7 +104,8 @@ def main():
102
  streaming=args.dataset_streaming,
103
  )
104
  dataset = dataset_dict["train"]
105
- print(dataset)
 
106
 
107
  if args.dataset_streaming:
108
  valid_dataset = dataset.take(args.valid_dataset_size)
@@ -129,8 +132,8 @@ def main():
129
  output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
130
  dataset_text_field="formated_text",
131
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
132
- per_device_train_batch_size=1,
133
- gradient_accumulation_steps=64, # Use GA to mimic batch size!
134
  warmup_steps=100,
135
  num_train_epochs=1, # Set this for 1 full training run.
136
  # max_steps = 30,
@@ -150,8 +153,9 @@ def main():
150
  gpu_stats = torch.cuda.get_device_properties(0)
151
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
152
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
153
- print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
154
- print(f"{start_gpu_memory} GB of memory reserved.")
 
155
 
156
  trainer_stats = trainer.train()
157
 
@@ -160,19 +164,21 @@ def main():
160
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
161
  used_percentage = round(used_memory / max_memory * 100, 3)
162
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
163
- print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
164
- print(
165
- f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
166
- )
167
- print(f"Peak reserved memory = {used_memory} GB.")
168
- print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
169
- print(f"Peak reserved memory % of max memory = {used_percentage} %.")
170
- print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
171
-
172
- trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
173
- trained_models_dir.mkdir(parents=True, exist_ok=True)
174
- trainer.model.save_pretrained(trained_models_dir.as_posix())
175
- tokenizer.save_pretrained(trained_models_dir.as_posix())
 
 
176
  return
177
 
178
 
 
56
 
57
  parser.add_argument(
58
  "--num_workers",
59
+ default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
60
+ type=int
61
  ),
62
  args = parser.parse_args()
63
  return args
 
75
  trust_remote_code=True,
76
  cache_dir=args.model_cache_dir,
77
  )
78
+ if args.local_rank in (-1, 0):
79
+ print(model)
80
  tokenizer = AutoTokenizer.from_pretrained(
81
  pretrained_model_name_or_path=args.model_name,
82
  trust_remote_code=True,
83
  cache_dir=args.model_cache_dir,
84
  )
85
+ if args.local_rank in (-1, 0):
86
+ print(tokenizer)
87
 
88
  def format_func(example):
89
  formated_text = tokenizer.apply_chat_template(
 
104
  streaming=args.dataset_streaming,
105
  )
106
  dataset = dataset_dict["train"]
107
+ if args.local_rank in (-1, 0):
108
+ print(dataset)
109
 
110
  if args.dataset_streaming:
111
  valid_dataset = dataset.take(args.valid_dataset_size)
 
132
  output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
133
  dataset_text_field="formated_text",
134
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
135
+ per_device_train_batch_size=2,
136
+ gradient_accumulation_steps=8,
137
  warmup_steps=100,
138
  num_train_epochs=1, # Set this for 1 full training run.
139
  # max_steps = 30,
 
153
  gpu_stats = torch.cuda.get_device_properties(0)
154
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
155
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
156
+ if args.local_rank in (-1, 0):
157
+ print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
158
+ print(f"{start_gpu_memory} GB of memory reserved.")
159
 
160
  trainer_stats = trainer.train()
161
 
 
164
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
165
  used_percentage = round(used_memory / max_memory * 100, 3)
166
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
167
+ if args.local_rank in (-1, 0):
168
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
169
+ print(
170
+ f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
171
+ )
172
+ print(f"Peak reserved memory = {used_memory} GB.")
173
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
174
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
175
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
176
+
177
+ if args.local_rank in (-1, 0):
178
+ trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
179
+ trained_models_dir.mkdir(parents=True, exist_ok=True)
180
+ trainer.model.save_pretrained(trained_models_dir.as_posix())
181
+ tokenizer.save_pretrained(trained_models_dir.as_posix())
182
  return
183
 
184