#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Supervised fine-tuning of a causal language model with TRL's SFTTrainer
and DeepSpeed.

Launch example:

    deepspeed --num_gpus=4 step_2_train_model.py
"""
import argparse
import os
from pathlib import Path
import platform

# Set before the Hugging Face stack is imported below so the mirror
# endpoint is the one those libraries see.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

# from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from modelscope import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch


# One seed for the trainer, the train/validation split and the streaming
# shuffle, so every distributed rank derives the same data partition.
SEED = 3407

_TRUE_STRINGS = ("1", "true", "t", "yes", "y", "on")
_FALSE_STRINGS = ("", "0", "false", "f", "no", "n", "off", "none")


def _str_to_bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in _TRUE_STRINGS:
        return True
    if text in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def _is_distributed(local_rank):
    """Return True when the script runs under a multi-process launcher.

    A launcher-provided ``--local_rank`` or a ``WORLD_SIZE`` environment
    variable greater than one is taken as the signal.
    """
    return local_rank != -1 or int(os.environ.get("WORLD_SIZE", "1")) > 1


def _to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


def get_args():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace: the parsed options.
    """
    if platform.system() in ("Windows", "Darwin"):
        default_num_workers = None
    else:
        # os.cpu_count() may return None; fall back to a single worker.
        default_num_workers = max(1, (os.cpu_count() or 2) // 2)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank passed from distributed launcher",
    )
    parser.add_argument("--model_name", default="Qwen/Qwen3-8B", type=str)
    parser.add_argument("--dataset_path", default="miyuki2026/tutorials", type=str)
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str,
    )
    # Accepts `--dataset_streaming`, `--dataset_streaming true` or
    # `--dataset_streaming false`; a plain `type=str` would treat the
    # string "false" as truthy.
    parser.add_argument(
        "--dataset_streaming",
        default=False,
        nargs="?",
        const=True,
        type=_str_to_bool,
    )
    # Sizes are handed to take()/skip()/train_test_split()/shuffle() and
    # must therefore be integers, also when given on the command line.
    parser.add_argument("--valid_dataset_size", default=1000, type=int)
    parser.add_argument("--shuffle_buffer_size", default=5000, type=int)
    # Parsed for launcher compatibility; not consumed by main() at present.
    parser.add_argument("--num_workers", default=default_num_workers, type=int)
    return parser.parse_args()


def main():
    """Load model and data, run SFT training, report memory use, save the result."""
    args = get_args()
    is_main_process = args.local_rank in (-1, 0)

    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    # device_map="auto" spreads one process's model over every visible GPU.
    # That is only appropriate for a single-process run; under a distributed
    # (DeepSpeed) launch each rank owns one GPU and DeepSpeed does the
    # placement/partitioning, so no device map is passed there.
    device_map = None if _is_distributed(args.local_rank) else "auto"

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        quantization_config=None,
        device_map=device_map,
        trust_remote_code=True,
        cache_dir=args.model_cache_dir,
    )
    if is_main_process:
        print(model)

    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        trust_remote_code=True,
        cache_dir=args.model_cache_dir,
    )
    if is_main_process:
        print(tokenizer)

    def format_func(example):
        """Render one example's `conversations` through the chat template."""
        formated_text = tokenizer.apply_chat_template(
            example["conversations"],
            tokenize=False,  # keep plain text here instead of token ids
            add_generation_prompt=False,  # off for training; enable for inference
        )
        return {"formated_text": formated_text}

    loaded = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        data_dir="keywords",
        # data_dir="psychology",
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        streaming=args.dataset_streaming,
    )
    # Without `split`, load_dataset returns a dict of splits; with `split`
    # it returns the requested dataset directly.
    dataset = loaded["train"] if isinstance(loaded, dict) else loaded
    if is_main_process:
        print(dataset)

    # A validation slice is held out of the training data. It is not passed
    # to the trainer yet (eval_dataset=None below).
    if args.dataset_streaming:
        valid_dataset = dataset.take(args.valid_dataset_size)
        train_dataset = dataset.skip(args.valid_dataset_size)
        train_dataset = train_dataset.shuffle(
            buffer_size=args.shuffle_buffer_size,
            seed=SEED,
        )
    else:
        splits = dataset.train_test_split(
            test_size=args.valid_dataset_size,
            seed=SEED,
        )
        train_dataset = splits["train"]
        valid_dataset = splits["test"]

    train_dataset = train_dataset.map(
        format_func,
        batched=False,
        remove_columns=train_dataset.column_names,
    )

    output_dir = temp_directory / "trainer_output/Qwen3-8B-sft-deepspeed"
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=None,  # Can set up evaluation!
        args=SFTConfig(
            output_dir=output_dir.as_posix(),  # replace with the path you want
            dataset_text_field="formated_text",
            deepspeed="./ds_config/deepspeed_stage_3_config.json",  # DeepSpeed config file
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_steps=100,
            num_train_epochs=1,  # Set this for 1 full training run.
            # max_steps = 30,
            learning_rate=3e-5,  # Reduce to 2e-5 for long training runs
            logging_steps=1,
            save_steps=10,  # write a checkpoint every 10 steps
            save_total_limit=2,  # keep at most 2 checkpoints; older ones are removed
            optim="adamw_8bit",
            weight_decay=0,
            lr_scheduler_type="constant_with_warmup",
            seed=SEED,
            report_to="none",  # Use this for WandB etc
        ),
    )

    # Memory statistics before training, for the device this process uses.
    gpu_stats = torch.cuda.get_device_properties(torch.cuda.current_device())
    start_gpu_memory = _to_gib(torch.cuda.max_memory_reserved())
    max_memory = _to_gib(gpu_stats.total_memory)
    if is_main_process:
        print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
        print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    # Memory and time statistics after training.
    used_memory = _to_gib(torch.cuda.max_memory_reserved())
    used_memory_for_training = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    training_percentage = round(used_memory_for_training / max_memory * 100, 3)
    if is_main_process:
        train_runtime = trainer_stats.metrics["train_runtime"]
        print(f"{train_runtime} seconds used for training.")
        print(f"{round(train_runtime / 60, 2)} minutes used for training.")
        print(f"Peak reserved memory = {used_memory} GB.")
        print(f"Peak reserved memory for training = {used_memory_for_training} GB.")
        print(f"Peak reserved memory % of max memory = {used_percentage} %.")
        print(f"Peak reserved memory for training % of max memory = {training_percentage} %.")

    trained_models_dir = temp_directory / "trained_models" / "Qwen3-8B-sft-deepspeed"
    trained_models_dir.mkdir(parents=True, exist_ok=True)
    # save_model() must run on every rank: with ZeRO-3 the weights are
    # partitioned across processes and gathering them is a collective
    # operation. The trainer itself restricts the file writes to the
    # saving process. NOTE(review): a consolidated checkpoint additionally
    # requires `stage3_gather_16bit_weights_on_model_save` to be enabled in
    # the DeepSpeed config — confirm in ds_config/deepspeed_stage_3_config.json.
    trainer.save_model(trained_models_dir.as_posix())
    if is_main_process:
        tokenizer.save_pretrained(trained_models_dir.as_posix())


if __name__ == "__main__":
    main()