miyuki2026 committed
Commit cce54bc · 1 parent: e96faee
examples/tutorials/dpo/ultrafeedback-dpo/step_1_prepare_data.py CHANGED
@@ -47,11 +47,11 @@ def main():
         local_dir=args.local_dir,
     )
     # huggingface_hub
-    snapshot_download(
-        repo_type="model",
-        repo_id=args.repo_id,
-        local_dir=args.local_dir,
-    )
+    # snapshot_download(
+    #     repo_type="model",
+    #     repo_id=args.repo_id,
+    #     local_dir=args.local_dir,
+    # )
     return
 
 
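This hunk comments out the huggingface_hub `snapshot_download` fallback rather than deleting it; the preceding download call (closed by the context lines above) stays active. For reference, the two clients expose near-identical snapshot APIs but name the repository argument differently — a minimal sketch, not code from this commit, with placeholder paths:

from modelscope import snapshot_download as ms_snapshot_download
from huggingface_hub import snapshot_download as hf_snapshot_download

# ModelScope identifies a repository by `model_id`.
ms_snapshot_download(model_id="Qwen/Qwen2.5-3B-Instruct", local_dir="/tmp/qwen2.5-3b")
# huggingface_hub uses `repo_id` plus a `repo_type` ("model" is the default).
hf_snapshot_download(repo_type="model", repo_id="Qwen/Qwen2.5-3B-Instruct", local_dir="/tmp/qwen2.5-3b")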
examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_ddp_qlora.py CHANGED
@@ -14,7 +14,8 @@ DPO is essentially style fine-tuning, so training it with LoRA is more reasonable and more principled.
 ----------
 
 nohup torchrun --nproc_per_node=2 step_2_train_dpo_model_ddp_qlora.py \
---dpo_beta 0.5 \
+--learning_rate 5e-5 \
+--dpo_beta 0.05 \
 --lora_rank 32 \
 &
 
@@ -79,6 +80,7 @@ def get_args():
         type=str
     ),
 
+    parser.add_argument("--learning_rate", default=2e-5, type=float),
     parser.add_argument("--dpo_beta", default=0.5, type=float),
     parser.add_argument("--lora_rank", default=32, type=int),
 
@@ -229,7 +231,7 @@ def main():
         save_steps=100,
         save_total_limit=2,
         logging_steps=10,
-        learning_rate=2e-5,
+        learning_rate=args.learning_rate,
         warmup_steps=100,
         lr_scheduler_type="cosine",
         fp16=True,
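Two substantive tweaks here: the example command drops `dpo_beta` from 0.5 to 0.05, and the learning rate becomes a CLI flag instead of the hard-coded 2e-5. In DPO, beta scales the implicit reward (the policy-vs-reference log-ratio), so a smaller beta tolerates larger drift from the reference model. A minimal sketch of the per-pair loss, assuming per-sequence log-probabilities are already summed (not code from this repo):

import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.05):
    # Implicit rewards: beta-scaled log-ratios against the frozen reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the margin between chosen and rejected rewards.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()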
examples/tutorials/grpo/requirements.txt ADDED
@@ -0,0 +1,2 @@
+trl==0.28.0
+transformers
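The trl pin matches the GRPO trainer docs linked in the training script below; installing is the usual:

pip install -r examples/tutorials/grpo/requirements.txt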
examples/tutorials/grpo/step_1_download_model_ms.py ADDED
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+Or run from the command line:
+
+python3 step_1_download_model_ms.py \
+--repo_id qgyd2021/gpt2-for-sequence-classification-sst2-reward \
+--local_dir /root/autodl-tmp/OpenMiniMind/trained_models/gpt2-for-sequence-classification-sst2-reward
+
+"""
+import argparse
+import os
+from pathlib import Path
+import platform
+
+if platform.system() in ("Windows", "Darwin"):
+    from project_settings import project_path, temp_directory
+else:
+    project_path = os.path.abspath("../../../")
+    project_path = Path(project_path)
+    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+from modelscope import snapshot_download
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--repo_id", default="Qwen/Qwen2.5-3B-Instruct", type=str)
+    parser.add_argument(
+        "--local_dir",
+        default=(temp_directory / "../pretrained_models/Qwen/Qwen2.5-3B-Instruct").as_posix(),
+        type=str
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    snapshot_download(
+        model_id=args.repo_id,
+        local_dir=args.local_dir,
+    )
+    return
+
+
+if __name__ == "__main__":
+    main()
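With the defaults above, fetching the Qwen base model looks like this (the --local_dir value is the resolved form of the script's default, which routes through temp_directory/..):

python3 step_1_download_model_ms.py \
--repo_id Qwen/Qwen2.5-3B-Instruct \
--local_dir /root/autodl-tmp/OpenMiniMind/pretrained_models/Qwen/Qwen2.5-3B-Instruct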
examples/tutorials/grpo/step_2_train_grpo_model.py CHANGED
@@ -2,10 +2,301 @@
 # -*- coding: utf-8 -*-
 """
 
+https://huggingface.co/docs/trl/v0.28.0/en/grpo_trainer
+
+
 """
+import argparse
+import os
+from pathlib import Path
+import platform
+import re
+from typing import Any, Dict, List, Optional
+
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+
+# Path configuration
+if platform.system() in ("Windows", "Darwin"):
+    from project_settings import project_path, temp_directory
+else:
+    project_path = Path(os.path.abspath("../../../../"))
+    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+from datasets import load_dataset
+import torch
+from transformers import (
+    AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification,
+    GPT2LMHeadModel, GPT2ForSequenceClassification,
+    DataCollatorWithPadding
+)
+from trl import GRPOConfig, GRPOTrainer
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name",
+        # default="Qwen/Qwen2.5-3B-Instruct",
+        # default=(project_path / "pretrained_models/Qwen/Qwen2.5-3B-Instruct").as_posix(),
+        default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix(),
+        type=str
+    )
+    parser.add_argument("--dataset_path", default="Jiayi-Pan/Countdown-Tasks-3to4", type=str)
+    parser.add_argument("--dataset_cache_dir",
+                        default=(temp_directory / "hub_datasets").as_posix(), type=str)
+    parser.add_argument("--model_cache_dir",
+                        default=(temp_directory / "hub_models").as_posix(), type=str)
+
+    # Training parameters
+    parser.add_argument("--valid_dataset_size", default=2000, type=int)
+
+    # Generation parameters
+
+    parser.add_argument(
+        "--output_model_dir",
+        default=(project_path / "trained_models/qwen2_5-3B-Instruct-Countdown-GRPO").as_posix(),
+        type=str
+    ),
+
+    # Other
+    parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
+    return parser.parse_args()
+
+
+SYSTEM_MESSAGE = """
+You are a helpful assistant. You first think through the reasoning process in your mind and then provide the user with the answer.
+""".strip()
+
+USER_TEMPLATE = (
+    "Using the numbers {numbers}, create an equation that equals {target}. "
+    "You can use basic arithmetic operations (+, -, *, /), and each number may be used only once. "
+    "Show your work in <think> </think> tags. "
+    "And return the final answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>."
+)
+
+RESPONSE_PROMPT = "Let me solve this step by step.\n<think>"
+
+
+def format_func(example, tokenizer):
+    numbers: List[int] = example["nums"]
+    target: int = example["target"]
+
+    user_message = USER_TEMPLATE.format(
+        numbers=numbers,
+        target=target
+    )
+
+    messages = [
+        {"role": "system", "content": SYSTEM_MESSAGE},
+        {"role": "user", "content": user_message},
+    ]
+
+    formatted_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    formatted_prompt = formatted_prompt + RESPONSE_PROMPT
+
+    tokenized = tokenizer(formatted_prompt,)
+    input_ids = tokenized["input_ids"]
+    attention_mask = tokenized["attention_mask"]
+    result = {
+        "prompt": formatted_prompt,
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+
+        "numbers": numbers,
+        "target": target,
+    }
+    return result
+
+
+def format_reward_function(
+    completions: str,
+) -> float:
+    """
+    Check whether the model's reply matches the format <think>...</think><answer>...</answer>
+    """
+    think_regex = r"<think>.*?<\/think>"
+    answer_regex = r"<answer>.*?<\/answer>"
+    full_format_regex = r"^<think>.*?<\/think>\n<answer>.*?<\/answer>$"
+
+    think_match = re.search(think_regex, completions, re.DOTALL)
+    answer_match = re.search(answer_regex, completions, re.DOTALL)
+    full_format_match = re.match(
+        full_format_regex,
+        completions,
+        re.DOTALL
+    )
+    # A full-format match scores 1.0
+    if full_format_match:
+        return 1.0
+    reward = 0.0
+    # A <think></think> tag pair adds 0.1 to the reward
+    if think_match:
+        reward += 0.1
+    # An <answer></answer> tag pair adds 0.5 to the reward
+    if answer_match:
+        reward += 0.5
+    # Return the reward
+    return reward
+
+
+def format_reward_func(
+    prompts: List[str],
+    completions: List[str],
+    completion_ids: List[List[int]],
+    # end_token: Optional[str] = None,
+    **kwargs) -> List[float]:
+    result = list()
+    for completion in completions:
+        reward = format_reward_function(completion)
+        result.append(reward)
+    return result
+
+
+def answer_reward_function(
+    response: str,
+    numbers: List[int] = None,
+    target: int = None
+) -> float:
+    """
+    Check that the answer:
+    1. uses all of the given numbers
+    2. uses each number exactly once
+    3. contains an expression that evaluates to the target number
+    """
+    # Regular expression for the answer
+    answer_regex = r"<answer>(.*?)<\/answer>"
+    # Does the response contain an answer tag pair?
+    answer_match = re.search(answer_regex, response, re.DOTALL)
+    # If no answer is found in the response, score 0
+    if not answer_match:
+        return 0.0
+    # Extract the answer text
+    answer_content = answer_match.group(1)
+    # If the answer tags are empty, score 0
+    if not answer_content:
+        return 0.0
+    # If the answer tags contain anything besides an arithmetic expression, score 0
+    allowed_chars = r"^[0-9+\-*/() ]+$"
+    if not re.match(allowed_chars, answer_content):
+        return 0.0
+    # Check that each given number is used exactly once
+    used_numbers = [
+        int(n) for n in re.findall(r"\d+", answer_content)
+    ]
+    if sorted(used_numbers) != sorted(numbers):
+        return 0.0
+    # Check whether the expression in the answer evaluates to the target number
+    try:
+        result = eval(answer_content, {"__builtins__": None}, {})
+        if abs(float(result) - float(target)) < 1e-5:
+            return 1.0
+    except:
+        pass
+    return 0.0
+
+
+def answer_reward_func(
+    prompts: List[str],
+    completions: List[str],
+    completion_ids: List[List[int]],
+    **kwargs) -> List[float]:
+    target_list = kwargs["target"]
+    numbers_list = kwargs["numbers"]
+
+    result = list()
+    for completion, numbers, target in zip(completions, numbers_list, target_list):
+        reward = answer_reward_function(completion, numbers, target)
+        result.append(reward)
+    return result
+
+
+def main():
+    args = get_args()
+
+    model = AutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=args.model_name,
+        trust_remote_code=True,
+        cache_dir=args.model_cache_dir,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_name,
+        padding_side="left",  # important for generation tasks
+        cache_dir=args.model_cache_dir,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    print(f"eos_token: {tokenizer.eos_token}")
+    print(f"pad_token: {tokenizer.pad_token}")
+
+    dataset_dict = load_dataset(
+        path=args.dataset_path,
+        cache_dir=args.dataset_cache_dir,
+    )
+    dataset = dataset_dict["train"]
+    dataset = dataset.take(n=10000)
+
+    dataset = dataset.train_test_split(test_size=args.valid_dataset_size, seed=None)
+    train_dataset = dataset["train"]
+    valid_dataset = dataset["test"]
+
+    train_dataset = train_dataset.map(
+        lambda example: format_func(example, tokenizer),
+        batched=False,
+        remove_columns=train_dataset.column_names,
+    )
+    valid_dataset = valid_dataset.map(
+        lambda example: format_func(example, tokenizer),
+        batched=False,
+        remove_columns=valid_dataset.column_names,
+    )
+
+    grpo_config = GRPOConfig(
+        output_dir=args.output_model_dir,
+        num_train_epochs=3,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        gradient_accumulation_steps=4,
+        learning_rate=5e-6,  # GRPO usually uses a smaller learning rate
+        warmup_ratio=0.1,
+        logging_steps=10,
+        eval_strategy="steps",
+        eval_steps=100,
+        save_strategy="steps",
+        save_steps=100,
+        save_total_limit=3,
+        load_best_model_at_end=True,
+        metric_for_best_model="reward",  # use the reward as the evaluation metric
+        greater_is_better=True,
+        fp16=False,
+        bf16=False,
+        max_grad_norm=1.0,
+        report_to="none",  # switch to "wandb" etc. as needed
+        # GRPO-specific parameters
+        num_generations=4,  # number of responses generated per prompt
+        temperature=0.7,
+        max_completion_length=512,  # maximum generation length
+        reward_weights=[0.1, 1.0],
+    )
 
+    grpo_trainer = GRPOTrainer(
+        model=model,
+        processing_class=tokenizer,
+        args=grpo_config,
+        train_dataset=train_dataset,
+        eval_dataset=valid_dataset,
+        reward_funcs=[format_reward_func, answer_reward_func],
+    )
+    grpo_trainer.train()
+    grpo_trainer.save_model(args.output_model_dir)
 
+    return
 
 
 if __name__ == "__main__":
-    pass
+    main()
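Both reward callbacks follow TRL's reward-function interface (parallel lists of prompts, completions, and completion ids in; a list of floats out), so they can be sanity-checked without building a trainer. A quick standalone check, with sample completions invented for illustration:

good = "<think>1 + 2 = 3, and 3 / 3 = 1</think>\n<answer>(1 + 2) / 3</answer>"
bad = "The answer is 1."

print(format_reward_func(
    prompts=["p", "p"],
    completions=[good, bad],
    completion_ids=[[], []],
))  # expected: [1.0, 0.0]

print(answer_reward_func(
    prompts=["p", "p"],
    completions=[good, bad],
    completion_ids=[[], []],
    numbers=[[1, 2, 3], [1, 2, 3]],
    target=[1, 1],
))  # expected: [1.0, 0.0]

Note that during training the completion starts after RESPONSE_PROMPT, which already opens a <think> tag, so a well-behaved model's raw completion typically earns only the 0.5 answer-tag credit rather than the full 1.0 format score.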