miyuki2026 committed · Commit c4ac4dc · 1 Parent(s): d251d39
examples/tutorials/dpo/ultrafeedback-dpo/requirements.txt CHANGED
@@ -1,4 +1,5 @@
 transformers
+peft
 torch
 modelscope
 datasets
examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu.py CHANGED
@@ -6,6 +6,8 @@ https://huggingface.co/docs/trl/v0.16.1/en/sft_trainer
 Full-parameter fine-tuning on a single V100 32G GPU
 python3 step_2_train_dpo_model_single_gpu.py
 
+DPO is essentially style fine-tuning, so training it with LoRA is the more reasonable, more principled choice.
+
 """
 import argparse
 import os
@@ -28,7 +30,9 @@ from datasets import load_dataset
 import torch
 
 from modelscope import AutoModelForCausalLM, AutoTokenizer
+from transformers import BitsAndBytesConfig
 from trl import DPOConfig, DPOTrainer
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 
 
 def get_args():
@@ -116,24 +120,51 @@ def main():
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,                      # 4-bit quantization
+        bnb_4bit_quant_type="nf4",              # use NF4 quantization
+        bnb_4bit_compute_dtype=torch.float16,   # compute in FP16
+        bnb_4bit_use_double_quant=True,         # double quantization
+        bnb_4bit_quant_storage=torch.uint8,     # storage dtype
+    )
+
     model = AutoModelForCausalLM.from_pretrained(
         args.model_name,
         cache_dir=args.model_cache_dir,
+        quantization_config=bnb_config,
+        device_map="auto",
         trust_remote_code=True,
+        use_cache=False,                        # disable the KV cache during training
     )
     ref_model = AutoModelForCausalLM.from_pretrained(
         args.model_name,
         cache_dir=args.model_cache_dir,
         trust_remote_code=True,
+        quantization_config=bnb_config,
+        device_map="auto",
+        use_cache=False,
     )
+    model = prepare_model_for_kbit_training(model)
+    ref_model = prepare_model_for_kbit_training(ref_model)
+
+    lora_config = LoraConfig(
+        r=16,                                   # LoRA rank
+        lora_alpha=32,                          # LoRA alpha
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        lora_dropout=0.1,
+        bias="none",
+        task_type="CAUSAL_LM",
+    )
+    model = get_peft_model(model, lora_config)
+    ref_model = get_peft_model(ref_model, lora_config)
+    model.print_trainable_parameters()
+
     tokenizer = AutoTokenizer.from_pretrained(
         args.model_name,
         cache_dir=args.model_cache_dir,
         trust_remote_code=True,
+        padding_side="left",                    # DPO requires left padding
     )
-    model = model.to(device)
-    ref_model = ref_model.to(device)
-
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -168,7 +199,7 @@ def main():
         learning_rate=2e-5,
         warmup_steps=100,
         lr_scheduler_type="cosine",
-        fp16=False,
+        fp16=True,
         gradient_checkpointing=True,  # set to True if memory is tight
         optim="adamw_torch",
         report_to="none",
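
The rationale in the new docstring is easy to quantify. A rough back-of-the-envelope sketch of why full-parameter DPO strains a 32 GiB V100 while 4-bit NF4 plus LoRA fits; the 7B parameter count and per-parameter byte costs below are illustrative assumptions, not measurements from this tutorial:

# Rough memory arithmetic behind the switch to QLoRA (illustrative numbers).
n_params = 7e9                      # assume a 7B-parameter base model

fp16_weights = n_params * 2         # FP16 weights alone
full_finetune = n_params * 16       # ~16 B/param: FP16 weights + grads, FP32 master copy + AdamW m, v
nf4_weights = n_params * 0.5        # NF4 packs roughly two weights per byte

print(f"FP16 weights:           {fp16_weights / 2**30:6.1f} GiB")
print(f"full fine-tune (AdamW): {full_finetune / 2**30:6.1f} GiB  # far beyond a 32 GiB V100")
print(f"NF4-quantized weights:  {nf4_weights / 2**30:6.1f} GiB  # leaves room for LoRA + activations")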
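
Worth noting for readers of this diff: the commit wraps ref_model in its own freshly initialized adapters, which at initialization is numerically equivalent to the plain base model (LoRA's B matrices start at zero) but loads and quantizes the base weights twice. The TRL version the docstring links to (v0.16) also supports passing peft_config directly with ref_model=None, letting DPOTrainer apply the adapters itself and recover the reference policy by disabling them. A minimal sketch of that wiring; the model name, dataset, and output_dir are placeholders, not this tutorial's actual arguments:

import torch
from datasets import load_dataset
from modelscope import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig
from transformers import BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"   # placeholder; the tutorial uses args.model_name

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Assumed preference dataset for illustration; substitute the tutorial's own data.
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = DPOTrainer(
    model=model,
    ref_model=None,                 # with peft_config, the reference policy is the same
                                    # base model with the adapters disabled
    args=DPOConfig(output_dir="dpo-qlora-out", fp16=True, gradient_checkpointing=True),
    train_dataset=train_dataset,
    processing_class=tokenizer,     # trl >= 0.12 name for the tokenizer argument
    peft_config=lora_config,        # the trainer calls get_peft_model internally
)
trainer.train()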
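
After training, saving a PEFT-wrapped model typically writes only the adapter weights, so inference code reattaches them to the base model. A minimal sketch, assuming a hypothetical adapter directory ./dpo-qlora-out and the same placeholder base model as above:

import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_name = "Qwen/Qwen2.5-0.5B-Instruct"    # placeholder; must match the training base model

base = AutoModelForCausalLM.from_pretrained(
    base_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, "./dpo-qlora-out")  # attach the DPO LoRA adapter
model = model.merge_and_unload()                            # optional: fold adapters into the base weights

tokenizer = AutoTokenizer.from_pretrained(base_name, trust_remote_code=True)
inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))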