miyuki2026 commited on
Commit
cb8268d
·
1 Parent(s): 09e6e81
examples/tutorials/by_deepspeed/step_2_train_model.py CHANGED
@@ -11,12 +11,12 @@ import platform
11
  os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
12
 
13
  if platform.system() in ("Windows", "Darwin"):
14
- from project_settings import project_path
15
  else:
16
  project_path = os.path.abspath("../../../")
17
  project_path = Path(project_path)
 
18
 
19
- from peft import LoraConfig
20
  # from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
21
  from modelscope import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
22
  from trl import SFTTrainer, SFTConfig
@@ -42,14 +42,12 @@ def get_args():
42
  parser.add_argument("--dataset_split", default=None, type=str),
43
  parser.add_argument(
44
  "--dataset_cache_dir",
45
- # default=(project_path / "hub_datasets").as_posix(),
46
- default="/root/autodl-tmp/OpenMiniMind/hub_datasets",
47
  type=str
48
  ),
49
  parser.add_argument(
50
  "--model_cache_dir",
51
- # default=(project_path / "hub_models").as_posix(),
52
- default="/root/autodl-tmp/OpenMiniMind/hub_models",
53
  type=str
54
  ),
55
  parser.add_argument("--dataset_streaming", default=None, type=str),
@@ -75,13 +73,13 @@ def main():
75
  quantization_config=None,
76
  # device_map="auto",
77
  trust_remote_code=True,
78
- # cache_dir=args.model_cache_dir,
79
  )
80
  print(model)
81
  tokenizer = AutoTokenizer.from_pretrained(
82
  pretrained_model_name_or_path=args.model_name,
83
  trust_remote_code=True,
84
- # cache_dir=args.model_cache_dir,
85
  )
86
  print(tokenizer)
87
 
@@ -128,7 +126,7 @@ def main():
128
  train_dataset=train_dataset,
129
  eval_dataset=None, # Can set up evaluation!
130
  args=SFTConfig(
131
- output_dir="/root/autodl-tmp/OpenMiniMind/trainer_output", # 请替换为你想要的路径
132
  dataset_text_field="formated_text",
133
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
134
  per_device_train_batch_size=1,
@@ -138,6 +136,8 @@ def main():
138
  # max_steps = 30,
139
  learning_rate=3e-5, # Reduce to 2e-5 for long training runs
140
  logging_steps=1,
 
 
141
  optim="adamw_8bit",
142
  weight_decay=0,
143
  lr_scheduler_type="constant_with_warmup",
@@ -169,18 +169,10 @@ def main():
169
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
170
  print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
171
 
172
- # 只保存lora适配器参数
173
- trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-deepspeed"
174
  trained_models_dir.mkdir(parents=True, exist_ok=True)
175
  trainer.model.save_pretrained(trained_models_dir.as_posix())
176
  tokenizer.save_pretrained(trained_models_dir.as_posix())
177
-
178
- # trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-fp16"
179
- # trained_models_dir.mkdir(parents=True, exist_ok=True)
180
- # trainer.model.save_pretrained_merged(trained_models_dir.as_posix(), tokenizer, save_method="merged_16bit",)
181
- # trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-int4"
182
- # trained_models_dir.mkdir(parents=True, exist_ok=True)
183
- # trainer.model.save_pretrained_merged(trained_models_dir.as_posix(), tokenizer, save_method="merged_4bit",)
184
  return
185
 
186
 
 
11
  os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
12
 
13
  if platform.system() in ("Windows", "Darwin"):
14
+ from project_settings import project_path, temp_directory
15
  else:
16
  project_path = os.path.abspath("../../../")
17
  project_path = Path(project_path)
18
+ temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
19
 
 
20
  # from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
21
  from modelscope import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
22
  from trl import SFTTrainer, SFTConfig
 
42
  parser.add_argument("--dataset_split", default=None, type=str),
43
  parser.add_argument(
44
  "--dataset_cache_dir",
45
+ default=(temp_directory / "hub_datasets").as_posix(),
 
46
  type=str
47
  ),
48
  parser.add_argument(
49
  "--model_cache_dir",
50
+ default=(temp_directory / "hub_models").as_posix(),
 
51
  type=str
52
  ),
53
  parser.add_argument("--dataset_streaming", default=None, type=str),
 
73
  quantization_config=None,
74
  # device_map="auto",
75
  trust_remote_code=True,
76
+ cache_dir=args.model_cache_dir,
77
  )
78
  print(model)
79
  tokenizer = AutoTokenizer.from_pretrained(
80
  pretrained_model_name_or_path=args.model_name,
81
  trust_remote_code=True,
82
+ cache_dir=args.model_cache_dir,
83
  )
84
  print(tokenizer)
85
 
 
126
  train_dataset=train_dataset,
127
  eval_dataset=None, # Can set up evaluation!
128
  args=SFTConfig(
129
+ output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"), # 请替换为你想要的路径
130
  dataset_text_field="formated_text",
131
  deepspeed="./ds_config/deepspeed_stage_3_config.json", # 添加deepspeed配置文件
132
  per_device_train_batch_size=1,
 
136
  # max_steps = 30,
137
  learning_rate=3e-5, # Reduce to 2e-5 for long training runs
138
  logging_steps=1,
139
+ save_steps=100, # 每100步保存一次检查点
140
+ save_total_limit=2, # 最多只保留2个检查点,旧的自动清理
141
  optim="adamw_8bit",
142
  weight_decay=0,
143
  lr_scheduler_type="constant_with_warmup",
 
169
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
170
  print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
171
 
172
+ trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
 
173
  trained_models_dir.mkdir(parents=True, exist_ok=True)
174
  trainer.model.save_pretrained(trained_models_dir.as_posix())
175
  tokenizer.save_pretrained(trained_models_dir.as_posix())
 
 
 
 
 
 
 
176
  return
177
 
178