Commit · cb8268d
1 Parent(s): 09e6e81
update
examples/tutorials/by_deepspeed/step_2_train_model.py
CHANGED
@@ -11,12 +11,12 @@ import platform
 os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 
 if platform.system() in ("Windows", "Darwin"):
-    from project_settings import project_path
+    from project_settings import project_path, temp_directory
 else:
     project_path = os.path.abspath("../../../")
     project_path = Path(project_path)
+    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
 
-from peft import LoraConfig
 # from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from modelscope import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from trl import SFTTrainer, SFTConfig
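For context: `project_settings` is only imported on Windows and macOS development machines; on Linux the script falls back to hard-coded paths, now including the new `temp_directory`. A minimal sketch of what that module might export (its contents are not part of this commit, so everything below is an assumption for illustration):

```python
# project_settings.py -- hypothetical sketch; the real module is not shown
# in this commit, so these definitions are assumptions.
import os
from pathlib import Path

# Repository root, resolved from this file's own location.
project_path = Path(os.path.dirname(os.path.abspath(__file__)))

# Scratch directory for downloaded datasets, model weights, and checkpoints.
temp_directory = project_path / "temp"
```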
@@ -42,14 +42,12 @@ def get_args():
     parser.add_argument("--dataset_split", default=None, type=str),
     parser.add_argument(
         "--dataset_cache_dir",
-
-        default="/root/autodl-tmp/OpenMiniMind/hub_datasets",
+        default=(temp_directory / "hub_datasets").as_posix(),
         type=str
     ),
     parser.add_argument(
         "--model_cache_dir",
-
-        default="/root/autodl-tmp/OpenMiniMind/hub_models",
+        default=(temp_directory / "hub_models").as_posix(),
         type=str
     ),
     parser.add_argument("--dataset_streaming", default=None, type=str),
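The cache-directory defaults are now derived from `temp_directory` instead of being two independent hard-coded strings, so relocating the scratch disk becomes a one-line change. A self-contained sketch of the pattern (the path is the one hard-coded in the diff; note the trailing commas after `add_argument(...)` in the script are harmless one-element-tuple expressions):

```python
# Sketch: Path-derived argparse defaults, as used in get_args() above.
import argparse
from pathlib import Path

temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

parser = argparse.ArgumentParser()
parser.add_argument(
    "--dataset_cache_dir",
    default=(temp_directory / "hub_datasets").as_posix(),
    type=str,
)
parser.add_argument(
    "--model_cache_dir",
    default=(temp_directory / "hub_models").as_posix(),
    type=str,
)

args = parser.parse_args([])  # empty argv, so the defaults apply
print(args.dataset_cache_dir)  # /root/autodl-tmp/OpenMiniMind/temp/hub_datasets
```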
@@ -75,13 +73,13 @@ def main():
         quantization_config=None,
         # device_map="auto",
         trust_remote_code=True,
-
+        cache_dir=args.model_cache_dir,
     )
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=args.model_name,
         trust_remote_code=True,
-
+        cache_dir=args.model_cache_dir,
     )
     print(tokenizer)
 
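Passing `cache_dir` routes the weight downloads into the argparse-controlled directory rather than the library's default user cache; `/root/autodl-tmp` is typically the large data disk on AutoDL instances, which is presumably why the cache is pinned there. A sketch of the resulting call shape (the model id is an assumption; the script takes it from `args.model_name`):

```python
# Sketch: from_pretrained with an explicit cache_dir, mirroring the diff.
# modelscope's Auto* classes accept the cache_dir keyword as shown above.
from modelscope import AutoModelForCausalLM, AutoTokenizer

cache_dir = "/root/autodl-tmp/OpenMiniMind/temp/hub_models"

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen3-8B",  # assumption; the script uses args.model_name
    trust_remote_code=True,
    cache_dir=cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen3-8B",
    trust_remote_code=True,
    cache_dir=cache_dir,
)
```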
@@ -128,7 +126,7 @@ def main():
         train_dataset=train_dataset,
         eval_dataset=None,  # Can set up evaluation!
         args=SFTConfig(
-            output_dir=
+            output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output"),  # replace with the path you want
             dataset_text_field="formated_text",
             deepspeed="./ds_config/deepspeed_stage_3_config.json",  # add the DeepSpeed config file
             per_device_train_batch_size=1,
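`SFTConfig` subclasses the transformers `TrainingArguments`, so `output_dir` is where the trainer writes checkpoints and logs, and `deepspeed` takes a path to a ZeRO JSON config. A minimal sketch of the config object this hunk builds (values copied from the diff; the script passes a `Path` for `output_dir`, which works because the underlying path functions accept `os.PathLike`):

```python
# Sketch: the SFTConfig fields touched by this hunk, with illustrative values.
from pathlib import Path
from trl import SFTConfig

temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

config = SFTConfig(
    output_dir=(temp_directory / "Qwen3-8B-sft-deepspeed/trainer_output").as_posix(),
    dataset_text_field="formated_text",  # spelling kept as in the script
    deepspeed="./ds_config/deepspeed_stage_3_config.json",  # ZeRO stage-3 config
    per_device_train_batch_size=1,
)
```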
@@ -138,6 +136,8 @@
             # max_steps = 30,
             learning_rate=3e-5,  # Reduce to 2e-5 for long training runs
             logging_steps=1,
+            save_steps=100,  # save a checkpoint every 100 steps
+            save_total_limit=2,  # keep at most 2 checkpoints; older ones are deleted automatically
             optim="adamw_8bit",
             weight_decay=0,
             lr_scheduler_type="constant_with_warmup",
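`save_steps` and `save_total_limit` are standard `TrainingArguments` checkpointing knobs (the default save strategy is already step-based): every 100 optimizer steps the trainer writes `output_dir/checkpoint-<step>`, and once more than two exist the oldest is deleted. A sketch of the behavior:

```python
# Sketch: checkpoint rotation under the settings added in this hunk.
from trl import SFTConfig

config = SFTConfig(
    output_dir="trainer_output",
    save_steps=100,       # writes trainer_output/checkpoint-100, -200, -300, ...
    save_total_limit=2,   # when checkpoint-300 appears, checkpoint-100 is removed
)
# An interrupted run can then pick up from the newest checkpoint with the
# standard Trainer API: trainer.train(resume_from_checkpoint=True)
```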
@@ -169,18 +169,10 @@ def main():
     print(f"Peak reserved memory % of max memory = {used_percentage} %.")
     print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
 
-
-    trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-deepspeed"
+    trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
     trained_models_dir.mkdir(parents=True, exist_ok=True)
     trainer.model.save_pretrained(trained_models_dir.as_posix())
     tokenizer.save_pretrained(trained_models_dir.as_posix())
-
-    # trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-fp16"
-    # trained_models_dir.mkdir(parents=True, exist_ok=True)
-    # trainer.model.save_pretrained_merged(trained_models_dir.as_posix(), tokenizer, save_method="merged_16bit",)
-    # trained_models_dir = project_path / "trained_models" / "Qwen3-8B-sft-int4"
-    # trained_models_dir.mkdir(parents=True, exist_ok=True)
-    # trainer.model.save_pretrained_merged(trained_models_dir.as_posix(), tokenizer, save_method="merged_4bit",)
     return
 
 
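The deleted commented-out block used `save_pretrained_merged`, which looks like Unsloth's merged-export helper rather than a plain transformers/TRL API; with that gone, the directory written by `save_pretrained` and `tokenizer.save_pretrained` can be reloaded directly. One caveat: under ZeRO stage 3 the parameters are partitioned across ranks, and whether `save_pretrained` on the trainer's model writes full weights depends on the DeepSpeed setting `stage3_gather_16bit_weights_on_model_save`. A sketch of reloading the saved artifacts (the path mirrors the one built above):

```python
# Sketch: reloading the directory written by save_pretrained above.
from modelscope import AutoModelForCausalLM, AutoTokenizer

save_dir = "/root/autodl-tmp/OpenMiniMind/temp/trained_models/Qwen3-8B-sft-deepspeed"

model = AutoModelForCausalLM.from_pretrained(save_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(save_dir, trust_remote_code=True)
```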