miyuki2026 commited on
Commit
a1cb0be
·
1 Parent(s): 75c5e57
Files changed (21) hide show
  1. examples/tutorials/dpo/ultrafeedback-dpo/{step_2_train_dpo_model_single_gpu.py → step_2_train_dpo_model_ddp_qlora.py} +20 -8
  2. examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu_qlora.py +249 -0
  3. examples/tutorials/grpo/step_2_train_grpo_model.py +11 -0
  4. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_1_prepare_data.py +0 -0
  5. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_2_train_sft_model.py +0 -0
  6. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_3_train_reward_model.py +0 -0
  7. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_4_test_reward_model.py +0 -0
  8. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_ppo_rlhf.py +0 -0
  9. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_ppo_rlhf2.py +0 -0
  10. examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_pre_ppo_rlhf.py +0 -0
  11. examples/tutorials/{rlhf → ppo}/gpt2_sst2_generation/step_2_train_model.py +0 -0
  12. examples/tutorials/{rlhf → ppo}/gpt2_sst2_generation/step_3_generation.py +0 -0
  13. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/requirements.txt +0 -0
  14. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_1_prepare_data.py +0 -0
  15. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_2_train_model_ddp.py +0 -0
  16. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_2_train_model_on_cpu.py +0 -0
  17. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_3_generation.py +0 -0
  18. examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_6_push_to_modelscope.py +0 -0
  19. examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_2_train_model.py +0 -0
  20. examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_3_test_model.py +0 -0
  21. examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_4_test_model.py +0 -0
examples/tutorials/dpo/ultrafeedback-dpo/{step_2_train_dpo_model_single_gpu.py → step_2_train_dpo_model_ddp_qlora.py} RENAMED
@@ -3,11 +3,14 @@
3
  """
4
  https://huggingface.co/docs/trl/v0.16.1/en/sft_trainer
5
 
6
- 卡 V00 32G 全参微调
7
- python3 step_2_train_dpo_model_single_gpu.py
 
 
8
 
9
  DPO本来就是风格微调,用LoRA 训练更合理,更科学。
10
 
 
11
  """
12
  import argparse
13
  import os
@@ -37,6 +40,8 @@ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
37
 
38
  def get_args():
39
  parser = argparse.ArgumentParser()
 
 
40
  parser.add_argument(
41
  "--model_name",
42
  default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix() if debug_mode else "qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
@@ -60,9 +65,12 @@ def get_args():
60
  ),
61
  parser.add_argument(
62
  "--output_model_dir",
63
- default=(temp_directory / "trained_models/qwen2_5-0_5B-ultrafeedback-dpo-single-gpu").as_posix(),
64
  type=str
65
  ),
 
 
 
66
  parser.add_argument(
67
  "--num_workers",
68
  default=None if debug_mode else os.cpu_count() // 2,
@@ -202,11 +210,14 @@ def main():
202
  optim="adamw_torch",
203
  report_to="none",
204
  max_length=1024 if debug_mode else 2048, # prompt + chosen 的最大长度
205
- max_prompt_length=512 if debug_mode else 1024, # prompt 的最大长度
206
  # DPO 特定参数
207
- beta=0.1, # DPO 的温度参数,控制对 preference 的置信度
208
  remove_unused_columns=False,
209
  dataloader_pin_memory=False,
 
 
 
 
210
  )
211
 
212
  trainer = DPOTrainer(
@@ -222,9 +233,10 @@ def main():
222
  trainer.train()
223
 
224
  # 保存模型
225
- print(f"保存模型到: {args.output_model_dir}")
226
- trainer.save_model()
227
- tokenizer.save_pretrained(args.output_model_dir)
 
228
 
229
  print("DPO 训练完成!")
230
  return
 
3
  """
4
  https://huggingface.co/docs/trl/v0.16.1/en/sft_trainer
5
 
6
+ 4 卡 V00 32G QLoRA 微调
7
+ python3 -m torch.distributed.run --nproc_per_node=4 step_2_train_dpo_model_ddp_qlora.py
8
+
9
+ torchrun --nproc_per_node=4 step_2_train_dpo_model_ddp_qlora.py
10
 
11
  DPO本来就是风格微调,用LoRA 训练更合理,更科学。
12
 
13
+
14
  """
15
  import argparse
16
  import os
 
40
 
41
  def get_args():
42
  parser = argparse.ArgumentParser()
43
+ parser.add_argument("--local_rank", type=int, default=0) # torchrun会自动传递这个参数
44
+
45
  parser.add_argument(
46
  "--model_name",
47
  default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix() if debug_mode else "qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
 
65
  ),
66
  parser.add_argument(
67
  "--output_model_dir",
68
+ default=(temp_directory / "trained_models/qwen2_5-0_5B-ultrafeedback-dpo-ddp-qlora").as_posix(),
69
  type=str
70
  ),
71
+
72
+ parser.add_argument("--beta", default=0.5, type=float),
73
+
74
  parser.add_argument(
75
  "--num_workers",
76
  default=None if debug_mode else os.cpu_count() // 2,
 
210
  optim="adamw_torch",
211
  report_to="none",
212
  max_length=1024 if debug_mode else 2048, # prompt + chosen 的最大长度
 
213
  # DPO 特定参数
214
+ beta=args.beta, # DPO 的温度参数,控制对 preference 的置信度
215
  remove_unused_columns=False,
216
  dataloader_pin_memory=False,
217
+
218
+ # ddp_find_unused_parameters=False, # 告诉DDP忽略未使用的参数
219
+ local_rank=args.local_rank, # 传递当前进程的local_rank
220
+
221
  )
222
 
223
  trainer = DPOTrainer(
 
233
  trainer.train()
234
 
235
  # 保存模型
236
+ if args.local_rank == 0: # 只在主进程保存
237
+ print(f"保存模型到: {args.output_model_dir}")
238
+ trainer.save_model()
239
+ tokenizer.save_pretrained(args.output_model_dir)
240
 
241
  print("DPO 训练完成!")
242
  return
examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu_qlora.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://huggingface.co/docs/trl/v0.16.1/en/sft_trainer
5
+
6
+ 单卡 V00 32G QLoRA 微调
7
+ python3 step_2_train_dpo_model_single_gpu_qlora.py
8
+
9
+ DPO本来就是风格微调,用LoRA 训练更合理,更科学。
10
+
11
+
12
+ ---------------
13
+ {'loss': '0.6324', 'grad_norm': '1.082', 'learning_rate': '3.257e-06', 'rewards/chosen': '0.2385', 'rewards/rejected': '-0.2982', 'rewards/accuracies': '0.6438', 'rewards/margins': '0.5366', 'logps/chosen': '-367', 'logps/rejected': '-336.3', 'logits/chosen': '-1.805', 'logits/rejected': '-1.832', 'epoch': '0.7433'}
14
+ logps/chosen 比 logps/rejected 小,说明模型生成优选项的概率小于拒选项。
15
+ 最终的模型应是 logps/chosen 更大。
16
+ 如果loss损失不下降,就调大 LoRA的 rank。
17
+
18
+ ---------------
19
+ 此模型训练一开始就倾向于生成拒选项,0.74epoch时仍然倾向于生成拒选项。
20
+ GPT建议调大 beta,限制当前模型的自由度。
21
+ 当前 beta=0.1 改为 0.5
22
+
23
+
24
+ """
25
+ import argparse
26
+ import os
27
+ from pathlib import Path
28
+ import platform
29
+
30
+ # os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
31
+
32
+ debug_mode = True if platform.system() in ("Windows", "Darwin") else False
33
+ print(f"debug_mode: {debug_mode}")
34
+
35
+ if platform.system() in ("Windows", "Darwin"):
36
+ from project_settings import project_path, temp_directory
37
+ else:
38
+ project_path = os.path.abspath("../../../")
39
+ project_path = Path(project_path)
40
+ temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
41
+
42
+ from datasets import load_dataset
43
+ import torch
44
+
45
+ from modelscope import AutoModelForCausalLM, AutoTokenizer
46
+ from transformers import BitsAndBytesConfig
47
+ from trl import DPOConfig, DPOTrainer
48
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
49
+
50
+
51
def get_args():
    """Parse command-line arguments for single-GPU QLoRA DPO training."""
    parser = argparse.ArgumentParser()

    # In debug mode (Windows/Darwin) fall back to a small local model.
    if debug_mode:
        default_model = (project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix()
    else:
        default_model = "qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed"

    parser.add_argument("--model_name", default=default_model, type=str)
    parser.add_argument(
        "--dataset_path",
        default="HuggingFaceH4/ultrafeedback_binarized",
        # default="miyuki2026/tutorials" if debug_mode else "HuggingFaceH4/ultrachat_200k",
        type=str,
    )
    parser.add_argument("--dataset_cache_dir", default=(temp_directory / "hub_datasets").as_posix(), type=str)
    parser.add_argument("--model_cache_dir", default=(temp_directory / "hub_models").as_posix(), type=str)
    parser.add_argument(
        "--output_model_dir",
        default=(temp_directory / "trained_models/qwen2_5-0_5B-ultrafeedback-dpo-single-gpu-qlora").as_posix(),
        type=str,
    )

    # DPO temperature; larger values keep the policy closer to the reference model.
    parser.add_argument("--beta", default=0.5, type=float)

    parser.add_argument("--num_workers", default=None if debug_mode else os.cpu_count() // 2, type=int)

    return parser.parse_args()
89
+
90
+
91
def format_func(examples, tokenizer):
    """Convert one ultrafeedback_binarized record into DPO text fields.

    ``examples["chosen"]`` / ``examples["rejected"]`` are chat message lists
    whose final message is the assistant response; every earlier message forms
    the (shared) prompt.

    Returns:
        dict with keys ``prompt``, ``chosen``, ``rejected`` (plain strings).

    Raises:
        AssertionError: if the chosen/rejected prompts differ after chat
            templating, or if either final message is not from the assistant.
    """
    def render_prompt(conversation):
        # Everything before the last message is the prompt; the last message
        # is the candidate response.
        prompt_text = tokenizer.apply_chat_template(
            conversation=conversation[:-1],
            tokenize=False,
            add_generation_prompt=True,  # DPO needs the generation prompt so the model knows where to start generating
        )
        return prompt_text, conversation[-1]

    chosen_prompt_text, chosen_last = render_prompt(examples["chosen"])
    rejected_prompt_text, rejected_last = render_prompt(examples["rejected"])

    # Both branches of a preference pair must share the exact same prompt.
    if chosen_prompt_text != rejected_prompt_text:
        raise AssertionError()

    if chosen_last["role"] != "assistant":
        raise AssertionError()
    if rejected_last["role"] != "assistant":
        raise AssertionError()

    return {
        "prompt": chosen_prompt_text,
        "chosen": chosen_last["content"],
        "rejected": rejected_last["content"],
    }
130
+
131
+
132
def main():
    """Run single-GPU QLoRA DPO training on ultrafeedback_binarized.

    Loads the SFT policy model in 4-bit (NF4), attaches LoRA adapters, and
    fine-tunes it with TRL's DPOTrainer. The frozen reference model is NOT
    loaded as a second copy: with a PEFT policy model, passing
    ``ref_model=None`` makes the trainer use the base weights with adapters
    disabled as the reference, which halves GPU memory versus loading (and
    needlessly LoRA-wrapping) a duplicate model.
    """
    args = get_args()

    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    # 4-bit NF4 with double quantization; fp16 compute matches the
    # fp16=True training flag below.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.uint8,
    )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        cache_dir=args.model_cache_dir,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False,  # KV cache is unused in training and conflicts with gradient checkpointing
    )
    # Upcast norms and enable input grads so the 4-bit base trains stably with adapters.
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
        padding_side="left",  # DPO requires left padding
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(model)
    print(tokenizer)

    dataset_dict = load_dataset(
        path=args.dataset_path,
        cache_dir=args.dataset_cache_dir,
    )
    train_dataset = dataset_dict["train_prefs"]
    # test_dataset = dataset_dict["test_prefs"]

    train_dataset = train_dataset.map(
        lambda x: format_func(x, tokenizer),
        batched=False,
        num_proc=args.num_workers,
        remove_columns=train_dataset.column_names,
    )

    dpo_config = DPOConfig(
        output_dir=args.output_model_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1 if debug_mode else 2,
        gradient_accumulation_steps=1 if debug_mode else 8,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=2e-5,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        fp16=True,
        gradient_checkpointing=True,  # trade compute for memory
        optim="adamw_torch",
        report_to="none",
        max_length=1024 if debug_mode else 2048,  # max length of prompt + chosen
        # DPO-specific parameters
        beta=args.beta,  # DPO temperature; controls confidence in the preference data
        remove_unused_columns=False,
        dataloader_pin_memory=False,
    )

    trainer = DPOTrainer(
        model=model,
        # With a PEFT policy, TRL derives the frozen reference by disabling
        # the adapters on `model`. Loading a second quantized copy and
        # wrapping it in a fresh zero-init LoRA adapter (as before) wasted
        # ~half the GPU memory while being numerically identical: a zero-init
        # adapter leaves the base model's outputs unchanged.
        ref_model=None,
        args=dpo_config,
        train_dataset=train_dataset,
        # DPOTrainer handles collation itself; no data_collator needed.
    )

    # Start training.
    print("开始 DPO 训练...")
    trainer.train()

    # Save adapter weights and tokenizer.
    print(f"保存模型到: {args.output_model_dir}")
    trainer.save_model()
    tokenizer.save_pretrained(args.output_model_dir)

    print("DPO 训练完成!")
    return
246
+
247
+
248
# Script entry point.
if __name__ == "__main__":
    main()
examples/tutorials/grpo/step_2_train_grpo_model.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Placeholder for the GRPO training step.

TODO: implement GRPO training (model/tokenizer loading, dataset
preparation, and trainer setup), mirroring the structure of the DPO
tutorial scripts.
"""




if __name__ == "__main__":
    pass
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_1_prepare_data.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_2_train_sft_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_3_train_reward_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_4_test_reward_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_ppo_rlhf.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_ppo_rlhf2.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2/step_5_pre_ppo_rlhf.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_generation/step_2_train_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_generation/step_3_generation.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/requirements.txt RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_1_prepare_data.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_2_train_model_ddp.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_2_train_model_on_cpu.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_3_generation.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_ppo/step_6_push_to_modelscope.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_2_train_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_3_test_model.py RENAMED
File without changes
examples/tutorials/{rlhf → ppo}/gpt2_sst2_reward/step_4_test_model.py RENAMED
File without changes