File size: 7,127 Bytes
4ea4da5
 
0bc360d
 
 
4ea4da5
 
 
 
 
415ae04
4ea4da5
 
cb8268d
4ea4da5
 
 
cb8268d
4ea4da5
6c8daa9
 
4ea4da5
 
 
 
 
 
 
0bc360d
 
4ea4da5
 
46f3343
4ea4da5
94d6fef
4ea4da5
 
 
 
 
 
 
 
 
cb8268d
4ea4da5
 
ee55dd5
 
cb8268d
ee55dd5
 
4ea4da5
ac79c4a
4ea4da5
 
 
 
b4eb298
 
4ea4da5
 
 
 
 
 
 
 
6c8daa9
8c67d01
4ea4da5
6c8daa9
4ea4da5
ac79c4a
ee55dd5
cb8268d
4ea4da5
b4eb298
 
4ea4da5
6c8daa9
ee55dd5
cb8268d
4ea4da5
b4eb298
 
4ea4da5
 
 
3b275e4
4ea4da5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4eb298
 
4ea4da5
 
 
 
 
 
 
 
 
 
c36b946
4ea4da5
 
 
 
 
 
 
 
 
 
 
 
92cd747
4ea4da5
 
b4eb298
 
4ea4da5
 
 
 
 
ac79c4a
cb8268d
4ea4da5
 
 
 
 
 
 
 
 
 
 
 
b4eb298
 
 
4ea4da5
 
 
 
 
 
 
 
b4eb298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ea4da5
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""

deepspeed --num_gpus=4 step_2_train_model.py

"""
import argparse
import os
from pathlib import Path
import platform

# Route Hugging Face Hub traffic through the hf-mirror endpoint
# (mainland-China-accessible mirror). Must be set before any HF import uses it.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Local development (Windows/macOS) pulls shared paths from project_settings;
# on the Linux training server the paths are hard-coded instead.
# NOTE(review): the relative "../../../" is resolved against the process CWD,
# not this file's location — confirm the launch directory matches.
if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

# from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from modelscope import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch


def get_args():
    """Parse command-line arguments for the SFT training run.

    Returns:
        argparse.Namespace with model/dataset identifiers, cache directories,
        and streaming/validation knobs.
    """
    parser = argparse.ArgumentParser()
    # deepspeed/torchrun inject --local_rank; -1 means single-process execution.
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank passed from distributed launcher")

    parser.add_argument("--model_name", default="Qwen/Qwen3-8B", type=str)
    parser.add_argument("--dataset_path", default="miyuki2026/tutorials", type=str)
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str,
    )
    # Fixed: was type=str, so "--dataset_streaming false" evaluated truthy in
    # main(); parse common false-y spellings into a real bool (default stays None).
    parser.add_argument(
        "--dataset_streaming",
        default=None,
        type=lambda s: s.strip().lower() in ("1", "true", "yes", "y"),
    )
    # Fixed: these were declared type=str while defaulting to ints, so any
    # explicit CLI value broke dataset.take()/skip() and
    # train_test_split(test_size=...) downstream.
    parser.add_argument("--valid_dataset_size", default=1000, type=int)
    parser.add_argument("--shuffle_buffer_size", default=5000, type=int)

    parser.add_argument(
        "--num_workers",
        # No multiprocessing workers on dev machines; half the cores on Linux.
        default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
        type=int,
    )
    args = parser.parse_args()
    return args


def main():
    """Supervised fine-tuning (SFT) of a causal LM with TRL + DeepSpeed.

    Loads the model/tokenizer via ModelScope, formats the conversation
    dataset with the tokenizer's chat template, trains with SFTTrainer
    (DeepSpeed ZeRO-3 config), and saves the final model on rank 0.
    """
    args = get_args()

    # ModelScope reads this env var to locate its download cache.
    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        quantization_config=None,
        device_map="auto",  # shard the model across all visible GPUs
        trust_remote_code=True,
        cache_dir=args.model_cache_dir,
    )
    # Print only on rank 0 (or single-process mode) to avoid duplicated logs.
    if args.local_rank in (-1, 0):
        print(model)
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        trust_remote_code=True,
        cache_dir=args.model_cache_dir,
    )
    if args.local_rank in (-1, 0):
        print(tokenizer)

    def format_func(example):
        # Render each conversation into a single training string.
        formated_text = tokenizer.apply_chat_template(
            example["conversations"],
            tokenize=False,  # return text, not token ids; SFTTrainer tokenizes later
            add_generation_prompt=False,  # off for training; set True only for inference
        )
        return {"formated_text": formated_text}

    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        data_dir="keywords",
        # data_dir="psychology",
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    dataset = dataset_dict["train"]
    if args.local_rank in (-1, 0):
        print(dataset)

    # Streaming datasets can't be split randomly; carve off a validation
    # prefix instead. Non-streaming uses a proper random train/test split.
    if args.dataset_streaming:
        valid_dataset = dataset.take(args.valid_dataset_size)
        train_dataset = dataset.skip(args.valid_dataset_size)
        train_dataset = train_dataset.shuffle(buffer_size=args.shuffle_buffer_size, seed=None)
    else:
        dataset = dataset.train_test_split(test_size=args.valid_dataset_size, seed=None)
        train_dataset = dataset["train"]
        valid_dataset = dataset["test"]

    # NOTE(review): valid_dataset is built but eval_dataset=None below —
    # evaluation is intentionally left unhooked for now.
    # train_dataset = valid_dataset
    train_dataset = train_dataset.map(
        format_func,
        batched=False,
        remove_columns=train_dataset.column_names,
    )

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,  # current TRL API (replaces tokenizer=)
        train_dataset=train_dataset,
        eval_dataset=None,  # Can set up evaluation!
        args=SFTConfig(
            # Fixed: output_dir expects a str; every other path in this file is
            # normalized with .as_posix(), and a raw Path breaks args JSON dumps.
            output_dir=(temp_directory / "trainer_output/Qwen3-8B-sft-deepspeed").as_posix(),
            dataset_text_field="formated_text",
            deepspeed="./ds_config/deepspeed_stage_3_config.json",  # ZeRO stage-3 config
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_steps=100,
            num_train_epochs=1,  # Set this for 1 full training run.
            # max_steps = 30,
            learning_rate=3e-5,  # Reduce to 2e-5 for long training runs
            logging_steps=1,
            save_steps=10,           # save a checkpoint every 10 steps
            save_total_limit=2,      # keep at most 2 checkpoints; older ones auto-deleted
            optim="adamw_8bit",
            weight_decay=0,
            lr_scheduler_type="constant_with_warmup",
            seed=3407,
            report_to="none",  # Use this for WandB etc
        ),
    )

    # Snapshot memory stats before training (device 0 only).
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    if args.local_rank in (-1, 0):
        print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
        print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    # Final memory / runtime statistics.
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    if args.local_rank in (-1, 0):
        print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
        print(
            f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
        )
        print(f"Peak reserved memory = {used_memory} GB.")
        print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
        print(f"Peak reserved memory % of max memory = {used_percentage} %.")
        print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    # Save final weights/tokenizer once, from the main process only.
    if args.local_rank in (-1, 0):
        trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
        trained_models_dir.mkdir(parents=True, exist_ok=True)
        trainer.model.save_pretrained(trained_models_dir.as_posix())
        tokenizer.save_pretrained(trained_models_dir.as_posix())
    return


# Entry point: run directly, or via `deepspeed --num_gpus=4 step_2_train_model.py`
# as documented in the module docstring.
if __name__ == "__main__":
    main()