miyuki2026 committed on
Commit 8f7ca17 · 1 Parent(s): de47717
examples/download/{download_hub.py → download_hub_hf.py} RENAMED
File without changes
examples/playground/{chat.py → chat_minimind.py} RENAMED
File without changes
examples/playground/chat_modelscope.py ADDED
@@ -0,0 +1,144 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ """
+ https://github.com/jingyaogong/minimind/blob/master/eval_llm.py
+ """
+ import argparse
+ import os
+ from pathlib import Path
+ import platform
+ import time
+
+ if platform.system() in ("Windows", "Darwin"):
+     from project_settings import project_path, temp_directory
+ else:
+     project_path = os.path.abspath("../../")
+     project_path = Path(project_path)
+     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+ import torch
+ from modelscope import AutoTokenizer, AutoModelForCausalLM
+ from transformers import TextStreamer
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         default="qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
+         type=str
+     )
+     parser.add_argument(
+         "--model_cache_dir",
+         default=(temp_directory / "hub_models").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--max_new_tokens",
+         default=8192,  # 8192, 128
+         type=int, help="maximum number of generated tokens (note: not the model's actual long-text capability)"
+     )
+     parser.add_argument("--top_p", default=0.85, type=float, help="nucleus sampling threshold (0-1)")
+     parser.add_argument("--temperature", default=0.85, type=float, help="sampling temperature; controls randomness (0-1, higher is more random)")
+
+     parser.add_argument(
+         "--show_speed",
+         default=1,  # 1, 0
+         type=int, help="show decode speed (tokens/s)"
+     )
+
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir
+
+     if torch.cuda.is_available():
+         device = "cuda"
+     elif torch.backends.mps.is_available():
+         # device = "mps"
+         device = "cpu"
+     else:
+         device = "cpu"
+     print(f"device: {device}")
+
+     model = AutoModelForCausalLM.from_pretrained(
+         args.pretrained_model_name_or_path,
+         cache_dir=args.model_cache_dir,
+         trust_remote_code=True,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.pretrained_model_name_or_path,
+         cache_dir=args.model_cache_dir,
+         trust_remote_code=True,
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     model = model.eval().to(device)
+     # print(tokenizer)
+     # print(model)
+
+     prompts = [
+         "你有什么特长?",
+         "为什么天空是蓝色的",
+         "请用Python写一个计算斐波那契数列的函数",
+         '解释一下"光合作用"的基本过程',
+         "如果明天下雨,我应该如何出门",
+         "比较一下猫和狗作为宠物的优缺点",
+         "解释什么是机器学习",
+         "推荐一些中国的美食"
+     ]
+     input_mode = int(input("[0] automatic test\n[1] manual input\n"))
+
+     streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     # conversation = list()
+     conversation = [
+         {"role": "system", "content": "You are a helpful assistant"}
+     ]
+     while True:
+         if input_mode == 0:
+             if len(prompts) == 0:
+                 break
+             user_input = prompts.pop(0)
+             print(f"💬: {user_input}")
+         else:
+             user_input = input("💬: ")
+         user_input = str(user_input).strip()
+         conversation.append({"role": "user", "content": user_input})
+         inputs = tokenizer.apply_chat_template(
+             conversation=conversation,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         inputs = tokenizer(
+             inputs,
+             return_tensors="pt",
+             truncation=True
+         )
+         inputs = inputs.to(device)
+         # print(inputs)
+
+         print("🤖: ", end="")
+         st = time.time()
+         generated_ids = model.generate(
+             inputs=inputs["input_ids"], attention_mask=inputs["attention_mask"],
+             max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
+             pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
+             top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
+         )
+         response = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
+         conversation.append({"role": "assistant", "content": response})
+         gen_tokens = len(generated_ids[0]) - len(inputs["input_ids"][0])
+         if args.show_speed:
+             print(f"\n[Speed]: {gen_tokens / (time.time() - st):.2f} tokens/s\n\n")
+         else:
+             print("\n\n")
+
+     return
+
+
+ if __name__ == "__main__":
+     main()
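
The chat loop above keeps the whole conversation as a list of {role, content} dicts and re-renders it with apply_chat_template before every generation. As a rough illustration of the prompt string this produces, here is a minimal sketch, not from this repo, assuming a ChatML-style template such as Qwen2.5's (the exact rendering is defined by each tokenizer's chat_template):

# assumed ChatML-style rendering; the real output depends on the model's chat_template
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "为什么天空是蓝色的"},
]
# tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
# would yield something like:
# <|im_start|>system
# You are a helpful assistant<|im_end|>
# <|im_start|>user
# 为什么天空是蓝色的<|im_end|>
# <|im_start|>assistant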
examples/playground/generation.py CHANGED
@@ -16,8 +16,8 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--pretrained_model_name_or_path",
-         # default=(project_path / "trained_models/gpt2-sst2-generation"),
-         default=(project_path / "trained_models/gpt2-sst2-generation-20260213-2048"),
+         default=(project_path / "trained_models/gpt2-sst2-generation"),
+         # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
         type=str
     )
     parser.add_argument(
@@ -50,9 +50,9 @@ def main():

    tokenized = tokenizer(
        # "this",
-         # "this is ",
+         "this is ",
        # "who needs mind-bending",
-         "eldom has a movie",
+         # "eldom has a movie",
        # "thanks to scott 's charismatic",
        return_tensors="pt"
    )
examples/tutorials/dpo/ultrafeedback-dpo/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ torch
+ modelscope
+ datasets
+ trl
+ deepspeed
examples/tutorials/dpo/{ultrachat-sft → ultrafeedback-dpo}/step_1_prepare_data.py RENAMED
File without changes
examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ """
+ https://huggingface.co/docs/trl/v0.16.1/en/dpo_trainer
+
+ Single-GPU V100 32G full-parameter fine-tuning:
+ python3 step_2_train_dpo_model_single_gpu.py
+
+ """
+ import argparse
+ import os
+ from pathlib import Path
+ import platform
+
+ # os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+
+ debug_mode = platform.system() in ("Windows", "Darwin")
+ print(f"debug_mode: {debug_mode}")
+
+ if platform.system() in ("Windows", "Darwin"):
+     from project_settings import project_path, temp_directory
+ else:
+     project_path = os.path.abspath("../../../")
+     project_path = Path(project_path)
+     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+ from datasets import load_dataset
+ import torch
+
+ from modelscope import AutoModelForCausalLM
+ from transformers import AutoTokenizer
+ from trl import DPOConfig, DPOTrainer
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model_name",
+         default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix() if debug_mode else "qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
+         type=str
+     )
+     parser.add_argument(
+         "--dataset_path",
+         default="HuggingFaceH4/ultrafeedback_binarized",
+         # default="miyuki2026/tutorials" if debug_mode else "HuggingFaceH4/ultrachat_200k",
+         type=str
+     )
+     parser.add_argument(
+         "--dataset_cache_dir",
+         default=(temp_directory / "hub_datasets").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--model_cache_dir",
+         default=(temp_directory / "hub_models").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--output_model_dir",
+         default=(temp_directory / "trained_models/qwen2_5-0_5B-ultrafeedback-dpo-single-gpu").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--num_workers",
+         default=None if debug_mode else os.cpu_count() // 2,
+         type=int
+     )
+     args = parser.parse_args()
+     return args
+
+
+ def format_func(examples, tokenizer):
+     chosen = examples["chosen"]
+     rejected = examples["rejected"]
+
+     chosen_prompt = chosen[:-1]
+     chosen_response = chosen[-1]
+
+     rejected_prompt = rejected[:-1]
+     rejected_response = rejected[-1]
+
+     chosen_prompt_text = tokenizer.apply_chat_template(
+         conversation=chosen_prompt,
+         tokenize=False,
+         add_generation_prompt=True,  # DPO needs the generation prompt so the model knows where to start generating
+     )
+     rejected_prompt_text = tokenizer.apply_chat_template(
+         conversation=rejected_prompt,
+         tokenize=False,
+         add_generation_prompt=True,  # DPO needs the generation prompt so the model knows where to start generating
+     )
+     if chosen_prompt_text != rejected_prompt_text:
+         raise AssertionError()
+
+     chosen_response_role = chosen_response["role"]
+     chosen_response_text = chosen_response["content"]
+     if chosen_response_role != "assistant":
+         raise AssertionError()
+
+     rejected_response_role = rejected_response["role"]
+     rejected_response_text = rejected_response["content"]
+     if rejected_response_role != "assistant":
+         raise AssertionError()
+
+     result = {
+         "prompt": chosen_prompt_text,
+         "chosen": chosen_response_text,
+         "rejected": rejected_response_text,
+     }
+     return result
+
+
+ def main():
+     args = get_args()
+
+     os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     model = AutoModelForCausalLM.from_pretrained(
+         args.model_name,
+         cache_dir=args.model_cache_dir,
+         trust_remote_code=True,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     )
+     ref_model = AutoModelForCausalLM.from_pretrained(
+         args.model_name,
+         cache_dir=args.model_cache_dir,
+         trust_remote_code=True,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.model_name,
+         cache_dir=args.model_cache_dir,
+         trust_remote_code=True,
+     )
+     model = model.to(device)
+     ref_model = ref_model.to(device)
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     print(model)
+     print(ref_model)
+     print(tokenizer)
+
+     dataset_dict = load_dataset(
+         path=args.dataset_path,
+         cache_dir=args.dataset_cache_dir,
+     )
+     train_dataset = dataset_dict["train_prefs"]
+     # test_dataset = dataset_dict["test_prefs"]
+
+     train_dataset = train_dataset.map(
+         lambda x: format_func(x, tokenizer),
+         batched=False,
+         num_proc=args.num_workers,
+         remove_columns=train_dataset.column_names,
+     )
+
+     dpo_config = DPOConfig(
+         output_dir=args.output_model_dir,
+         num_train_epochs=1,
+         per_device_train_batch_size=1 if debug_mode else 2,
+         gradient_accumulation_steps=1 if debug_mode else 8,
+         save_strategy="steps",
+         save_steps=100,
+         save_total_limit=2,
+         logging_steps=10,
+         learning_rate=2e-5,
+         warmup_steps=100,
+         lr_scheduler_type="cosine",
+         fp16=torch.cuda.is_available(),
+         gradient_checkpointing=False,  # set to True if memory is tight
+         optim="adamw_torch",
+         report_to="none",
+         max_length=1024 if debug_mode else 2048,  # maximum length of prompt + chosen
+         max_prompt_length=512 if debug_mode else 1024,  # maximum length of the prompt
+         # DPO-specific parameters
+         beta=0.1,  # DPO temperature; controls how strongly the preferences are trusted
+         remove_unused_columns=False,
+         dataloader_pin_memory=False,
+     )
+
+     trainer = DPOTrainer(
+         model=model,
+         ref_model=ref_model,  # provide the reference model
+         args=dpo_config,
+         train_dataset=train_dataset,
+         processing_class=tokenizer,  # tokenizer for the pairs
+         # DPOTrainer handles the data itself; no data_collator is needed
+     )
+
+     # start training
+     print("Starting DPO training...")
+     trainer.train()
+
+     # save the model
+     print(f"Saving the model to: {args.output_model_dir}")
+     trainer.save_model()
+     tokenizer.save_pretrained(args.output_model_dir)
+
+     print("DPO training finished!")
+     return
+
+
+ if __name__ == "__main__":
+     main()
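
format_func above reduces each ultrafeedback_binarized record to the {prompt, chosen, rejected} triple that DPOTrainer expects, after checking that both conversations share the same prompt and that both final turns come from the assistant. A minimal sketch of that mapping on a toy record (field values invented for illustration; only the shape follows HuggingFaceH4/ultrafeedback_binarized):

# toy input record, values made up
example = {
    "chosen": [
        {"role": "user", "content": "What is 2+2?"},
        {"role": "assistant", "content": "4."},
    ],
    "rejected": [
        {"role": "user", "content": "What is 2+2?"},
        {"role": "assistant", "content": "5."},
    ],
}
# format_func(example, tokenizer) then returns roughly:
# {
#     "prompt": "<the user turn rendered through the chat template, ending with the assistant generation prompt>",
#     "chosen": "4.",
#     "rejected": "5.",
# }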
examples/tutorials/rlhf/gpt2_sst2/step_5_ppo_rlhf.py CHANGED
@@ -1,12 +1,11 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 """
- https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer
+ PPO Training with TRL on the SST-2 dataset.
+ Based on https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer
 """
 import argparse
- import copy
 import os
- import random
 from pathlib import Path
 import platform
 from typing import Optional, Tuple, List, Dict, Union
@@ -14,12 +13,12 @@ from typing import Optional, Tuple, List, Dict, Union
 import numpy as np
 import torch
 import torch.nn as nn
- import torch.nn.functional as F
 from torch.utils.data import DataLoader
 from datasets import load_dataset
 from transformers import (
-     AutoTokenizer, AutoModelForCausalLM, GPT2PreTrainedModel,
-     GPT2Config, GPT2Model, GPT2LMHeadModel, DataCollatorWithPadding
+     AutoTokenizer,
+     GPT2LMHeadModel,
+     DataCollatorWithPadding
 )

 # path configuration
@@ -29,6 +28,8 @@ else:
     project_path = Path(os.path.abspath("../../../"))
     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

+ from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
+ from trl.core import LengthSampler  # used below; lives in trl.core in the TRL versions that have this PPOTrainer API
+

 def get_args():
     parser = argparse.ArgumentParser()
@@ -44,7 +45,7 @@ def get_args():
     parser.add_argument("--valid_dataset_size", default=1000, type=int)

     # training parameters
-     parser.add_argument("--batch_size", default=16, type=int)  # use a smaller batch on CPU
+     parser.add_argument("--batch_size", default=16, type=int)
     parser.add_argument("--ppo_epochs", default=4, type=int)
     parser.add_argument("--mini_batch_size", default=4, type=int)
     parser.add_argument("--kl_beta", default=0.2, type=float)
@@ -63,371 +64,184 @@ def get_args():

     # other
     parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
-     parser.add_argument("--device", default="cpu", type=str)  # force CPU
+     parser.add_argument("--device", default="cpu", type=str)

     return parser.parse_args()


- class ValueHead(nn.Module):
-     """Value head that predicts one value per token."""
-
-     def __init__(self, hidden_size: int):
-         super().__init__()
-         self.linear = nn.Linear(hidden_size, 1)
-         self._init_weights()
-
-     def _init_weights(self):
-         nn.init.normal_(self.linear.weight, std=1.0 / np.sqrt(self.linear.in_features + 1))
-         nn.init.zeros_(self.linear.bias)
-
-     def forward(self, hidden_states):
-         return self.linear(hidden_states).squeeze(-1)
-
-
- class GPT2ActorCritic(GPT2PreTrainedModel):
-     """Actor-critic model that outputs both logits and values."""
-
-     def __init__(self, config: GPT2Config):
-         super().__init__(config)
-         self.lm = GPT2LMHeadModel(config)
-         self.value_head = ValueHead(config.hidden_size)
-         self.post_init()
-
-     def forward(self, input_ids, attention_mask=None):
-         outputs = self.lm(
-             input_ids,
-             attention_mask=attention_mask,
-             output_hidden_states=True
-         )
-         # the values come from the last layer's hidden states
-         values = self.value_head(outputs.hidden_states[-1])
-         return outputs.logits, values
-
-     def generate(self, *args, **kwargs):
-         return self.lm.generate(*args, **kwargs)
-
-     @classmethod
-     def from_pretrained(cls, pretrained_model_name):
-         """Load from a pretrained GPT2LMHeadModel."""
-         config = GPT2Config.from_pretrained(pretrained_model_name)
-         model = cls(config)
-         pretrained = GPT2LMHeadModel.from_pretrained(pretrained_model_name)
-         model.lm.load_state_dict(pretrained.state_dict(), strict=False)
-         return model
-
-
- class GPT2RewardModel(GPT2PreTrainedModel):
-     """Reward model that predicts one reward per token."""
-
-     def __init__(self, config: GPT2Config):
-         super().__init__(config)
-         self.transformer = GPT2Model(config)
-         self.reward_head = nn.Linear(config.hidden_size, 1)
-         self.post_init()
-
-     def forward(self, input_ids, attention_mask=None):
-         outputs = self.transformer(
-             input_ids,
-             attention_mask=attention_mask,
-             output_hidden_states=True
-         )
-         rewards = self.reward_head(outputs.hidden_states[-1]).squeeze(-1)
-         return torch.sigmoid(rewards)  # [batch, seq_len]
-
-
- class PPOAgent:
-     """PPO training agent that wraps all of the training logic."""
-
-     def __init__(self, args):
-         self.args = args
-         self.device = torch.device(args.device)
-
-         # load the tokenizer
-         self.tokenizer = AutoTokenizer.from_pretrained(args.sft_model_name)
-         self.tokenizer.pad_token = self.tokenizer.eos_token
-         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-
-         # load the models
-         print("Loading models...")
-         self.actor_critic = GPT2ActorCritic.from_pretrained(args.sft_model_name).to(self.device)
-         self.reward_model = GPT2RewardModel.from_pretrained(args.reward_model_name).to(self.device)
-         self.reward_model.eval()
-
-         # reference model (frozen)
-         self.ref_model = copy.deepcopy(self.actor_critic).to(self.device)
-         self.ref_model.eval()
-
-         # optimizer
-         self.optimizer = torch.optim.Adam(self.actor_critic.parameters(), lr=args.lr)
-
-         # training state
-         self.training_step = 0
-
-     def prepare_dataset(self):
-         """Prepare the training dataset."""
-         print("Loading dataset...")
-         dataset = load_dataset(
-             path=self.args.dataset_path,
-             cache_dir=self.args.dataset_cache_dir,
-             split="train"
-         )
-
-         def filter_and_truncate(example):
-             # keep only sentences that are long enough
-             tokens = self.tokenizer(example["sentence"])["input_ids"]
-             if len(tokens) <= 8:
-                 return False
-
-             # randomly keep the first 2-6 tokens as the query
-             example["query_ids"] = tokens[:random.randint(2, 6)]
-             return True
-
-         dataset = dataset.filter(filter_and_truncate)
-         dataset = dataset.select(range(min(len(dataset), 5000)))  # use a small dataset on CPU
-
-         return dataset
-
-     def collect_rollouts(self, batch):
-         """Collect one round of interaction data."""
-         query_ids_list = []
-         response_ids_list = []
-         rewards_list = []
-
-         for i in range(len(batch["query_ids"])):
-             query_ids = torch.tensor(batch["query_ids"][i]).to(self.device)
-             query_ids_list.append(query_ids)
-
-             # generate a response
-             with torch.no_grad():
-                 response_len = random.randint(
-                     self.args.min_response_len,
-                     self.args.max_response_len
-                 )
-                 full_ids = self.actor_critic.generate(
-                     input_ids=query_ids.unsqueeze(0),
-                     max_new_tokens=response_len,
-                     do_sample=True,
-                     top_p=self.args.top_p,
-                     temperature=self.args.temperature,
-                     pad_token_id=self.tokenizer.pad_token_id,
-                     eos_token_id=self.tokenizer.eos_token_id,
-                 )[0]
-
-                 response_ids = full_ids[len(query_ids):]
-                 response_ids_list.append(response_ids)
-
-                 # compute the reward (only the last token's reward is used)
-                 reward = self.reward_model(
-                     full_ids.unsqueeze(0),
-                     attention_mask=torch.ones_like(full_ids).unsqueeze(0)
-                 )[0, -1]
-                 # scale to [-1, 1]
-                 rewards_list.append(2 * (reward - 0.5))
-
-         return query_ids_list, response_ids_list, rewards_list
-
-     def compute_advantages_and_returns(self, log_probs, values, rewards, masks):
-         """Compute GAE advantages and returns."""
-         seq_len = rewards.shape[1]
-         advantages = torch.zeros_like(rewards)
-         returns = torch.zeros_like(rewards)
-
-         gae = 0
-         for t in reversed(range(seq_len)):
-             if t == seq_len - 1:
-                 next_value = 0
-             else:
-                 next_value = values[:, t + 1]
-
-             delta = rewards[:, t] + self.args.gamma * next_value - values[:, t]
-             gae = delta + self.args.gamma * self.args.lam * gae
-             advantages[:, t] = gae
-             returns[:, t] = advantages[:, t] + values[:, t]
-
-         # whiten only over the valid positions
-         advantages = self.masked_whiten(advantages, masks)
-         return advantages, returns
-
-     def masked_whiten(self, values, mask):
-         """Whitening with a mask."""
-         mask = mask.float()
-         mean = (values * mask).sum() / mask.sum()
-         var = (((values - mean) * mask) ** 2).sum() / mask.sum()
-         whitened = (values - mean) * torch.rsqrt(var + 1e-8)
-         return whitened * mask
-
-     def ppo_step(self, batch_data):
-         """A single PPO update."""
-         (query_ids_list, response_ids_list, old_log_probs,
-          advantages, returns, masks) = batch_data
-
-         # concatenate the full query+response
-         full_ids_list = []
-         for q, r in zip(query_ids_list, response_ids_list):
-             full_ids_list.append(torch.cat([q, r]))
-
-         # padding
-         padded = self.tokenizer.pad(
-             {"input_ids": full_ids_list},
-             padding=True,
-             return_tensors="pt"
-         )
-         input_ids = padded["input_ids"].to(self.device)
-         attention_mask = padded["attention_mask"].to(self.device)
-
-         # forward pass
-         logits, values = self.actor_critic(input_ids, attention_mask)
-
-         # compute the new log_probs
-         log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
-         log_probs = torch.gather(
-             log_probs, 2,
-             input_ids[:, 1:].unsqueeze(-1)
-         ).squeeze(-1)
-
-         # keep only the response part of the log_probs
-         response_start = [len(q) for q in query_ids_list]
-         new_log_probs = []
-         for i, start in enumerate(response_start):
-             new_log_probs.append(log_probs[i, start - 1:start - 1 + len(response_ids_list[i])])
-         new_log_probs = torch.cat(new_log_probs)
-
-         # compute the ratio and the PPO loss
-         old_log_probs = old_log_probs.detach()
-         ratio = torch.exp(new_log_probs - old_log_probs)
-
-         # clipped policy loss
-         surr1 = ratio * advantages
-         surr2 = torch.clamp(ratio, 1 - self.args.clip_epsilon,
-                             1 + self.args.clip_epsilon) * advantages
-         policy_loss = -torch.min(surr1, surr2).mean()
-
-         # value loss
-         value_pred = []
-         for i, start in enumerate(response_start):
-             value_pred.append(values[i, start - 1:start - 1 + len(response_ids_list[i])])
-         value_pred = torch.cat(value_pred)
-         value_loss = F.mse_loss(value_pred, returns)
-
-         # total loss
-         loss = policy_loss + 0.5 * value_loss
-
-         return loss, policy_loss, value_loss
-
-     def train_epoch(self, dataset):
-         """Train for one epoch."""
-         total_policy_loss = 0
-         total_value_loss = 0
-         num_batches = 0
-
-         for batch_idx in range(0, len(dataset), self.args.batch_size):
-             # 1. collect data
-             batch = dataset[batch_idx:batch_idx + self.args.batch_size]
-             query_ids_list, response_ids_list, rewards_list = self.collect_rollouts(batch)
-
-             # 2. compute the old log_probs and values
-             old_log_probs_list = []
-             values_list = []
-             masks_list = []
-
-             with torch.no_grad():
-                 for q_ids, r_ids in zip(query_ids_list, response_ids_list):
-                     full_ids = torch.cat([q_ids, r_ids]).unsqueeze(0).to(self.device)
-                     attn_mask = torch.ones_like(full_ids)
-
-                     logits, values = self.actor_critic(full_ids, attn_mask)
-
-                     # compute the response part of the log_probs
-                     log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
-                     log_probs = torch.gather(
-                         log_probs, 2,
-                         full_ids[:, 1:].unsqueeze(-1)
-                     ).squeeze(-1)
-
-                     start = len(q_ids) - 1
-                     end = start + len(r_ids)
-                     old_log_probs_list.append(log_probs[0, start:end])
-                     values_list.append(values[0, start:end])
-
-                     # create the mask
-                     mask = torch.zeros(len(r_ids))
-                     mask[-1] = 1  # only the last token has a real reward
-                     masks_list.append(mask)
-
-             # convert to tensors
-             old_log_probs = torch.cat(old_log_probs_list).to(self.device)
-             values = torch.cat(values_list).to(self.device)
-             masks = torch.cat(masks_list).to(self.device)
-             rewards = torch.zeros_like(values).to(self.device)
-
-             # set the rewards (the environment reward is added only on the last token)
-             for i, (r, mask) in enumerate(zip(rewards_list, masks_list)):
-                 if mask[-1] > 0:
-                     # KL penalty
-                     kl = old_log_probs[i] - old_log_probs[i]  # simplified here; the ref_model should really be used
-                     kl_penalty = -self.args.kl_beta * kl
-                     rewards[i] = kl_penalty + r
-
-             # 3. compute advantages and returns
-             advantages, returns = self.compute_advantages_and_returns(
-                 old_log_probs.unsqueeze(0),
-                 values.unsqueeze(0),
-                 rewards.unsqueeze(0),
-                 masks.unsqueeze(0)
-             )
-
-             # 4. multiple PPO updates
-             batch_data = (query_ids_list, response_ids_list, old_log_probs,
-                           advantages.squeeze(0), returns.squeeze(0), masks)
-
-             for _ in range(self.args.ppo_epochs):
-                 loss, policy_loss, value_loss = self.ppo_step(batch_data)
-
-                 self.optimizer.zero_grad()
-                 loss.backward()
-                 torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1.0)
-                 self.optimizer.step()
-
-                 total_policy_loss += policy_loss.item()
-                 total_value_loss += value_loss.item()
-                 num_batches += 1
-                 self.training_step += 1
-
-             if batch_idx % 100 == 0:
-                 print(f"Batch {batch_idx}/{len(dataset)}: "
-                       f"policy_loss={total_policy_loss / num_batches:.4f}, "
-                       f"value_loss={total_value_loss / num_batches:.4f}")
-
-         return total_policy_loss / num_batches, total_value_loss / num_batches
-
-     def train(self):
-         """Main training loop."""
-         dataset = self.prepare_dataset()
-         print(f"Dataset size: {len(dataset)}")
-
-         for epoch in range(self.args.max_epochs):
-             print(f"\n=== Epoch {epoch + 1}/{self.args.max_epochs} ===")
-             policy_loss, value_loss = self.train_epoch(dataset)
-             print(f"Epoch {epoch + 1} finished: "
-                   f"policy_loss={policy_loss:.4f}, value_loss={value_loss:.4f}")
+ def build_dataset(tokenizer, dataset_path, dataset_cache_dir, valid_dataset_size):
+     """
+     Build the SST-2 dataset and return the tokenized queries.
+     """
+     dataset = load_dataset(
+         dataset_path,
+         cache_dir=dataset_cache_dir,
+         split="train"
+     )
+
+     # keep only the first valid_dataset_size examples for the demo
+     dataset = dataset.select(range(min(valid_dataset_size, len(dataset))))
+
+     def tokenize_function(examples):
+         return tokenizer(examples["sentence"], truncation=True, max_length=128)
+
+     dataset = dataset.map(tokenize_function, batched=True)
+     dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+
+     return dataset
+
+
+ class RewardModelWrapper:
+     """
+     Reward model wrapper used to score the generated text.
+     """
+
+     def __init__(self, reward_model_name, tokenizer, device):
+         self.device = device
+         self.tokenizer = tokenizer
+         # load your GPT2RewardModel or a standard classifier
+         from transformers import GPT2ForSequenceClassification
+         self.model = GPT2ForSequenceClassification.from_pretrained(reward_model_name).to(device)
+         self.model.eval()
+
+     def get_reward(self, texts: List[str]) -> List[float]:
+         """
+         Compute reward scores for the texts (SST-2 sentiment classification).
+         """
+         inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             # SST-2 is binary classification; use the positive-sentiment probability as the reward
+             logits = outputs.logits
+             probs = torch.softmax(logits, dim=-1)
+             # label 1 is assumed to be the positive class
+             rewards = probs[:, 1].cpu().tolist()
+         return rewards


 def main():
     args = get_args()
     print("PPO Training with CPU")
-     print(f"Arguments: {args}")

-     # create the agent and start training
-     agent = PPOAgent(args)
-     agent.train()
-
-     # save the model
-     output_dir = Path(args.sft_model_name) / "ppo_trained"
-     output_dir.mkdir(exist_ok=True, parents=True)
-     agent.actor_critic.save_pretrained(output_dir)
-     agent.tokenizer.save_pretrained(output_dir)
-     print(f"Model saved to {output_dir}")
+     # device setup
+     device = torch.device(args.device)
+
+     # 1. load the tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(args.sft_model_name, cache_dir=args.model_cache_dir)
+     tokenizer.pad_token = tokenizer.eos_token
+
+     # 2. build the dataset
+     dataset = build_dataset(
+         tokenizer,
+         args.dataset_path,
+         args.dataset_cache_dir,
+         args.valid_dataset_size
+     )
+
+     # 3. load the model (TRL's AutoModelForCausalLMWithValueHead)
+     # this automatically adds a value head on top of the original LM
+     model = AutoModelForCausalLMWithValueHead.from_pretrained(args.sft_model_name)
+     model.to(device)
+
+     # 4. load the reference model (used for the KL-divergence penalty)
+     ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(args.sft_model_name)
+     ref_model.to(device)
+
+     # 5. load the reward model
+     reward_model = RewardModelWrapper(args.reward_model_name, tokenizer, device)
+
+     # 6. configure PPO (field names follow the pre-0.12 TRL PPOConfig)
+     ppo_config = PPOConfig(
+         model_name=args.sft_model_name,
+         learning_rate=args.lr,
+         batch_size=args.batch_size,
+         mini_batch_size=args.mini_batch_size,
+         ppo_epochs=args.ppo_epochs,
+         cliprange=args.clip_epsilon,
+         gamma=args.gamma,
+         lam=args.lam,
+         init_kl_coef=args.kl_beta,
+         log_with=None,  # can be set to "wandb" etc.
+         project_kwargs={"logging_dir": "./logs"},
+     )
+
+     # 7. initialize the PPO trainer
+     ppo_trainer = PPOTrainer(
+         config=ppo_config,
+         model=model,
+         ref_model=ref_model,
+         tokenizer=tokenizer,
+         dataset=dataset,
+         data_collator=DataCollatorWithPadding(tokenizer)
+     )
+
+     # 8. define the response length sampler (random length between min and max)
+     response_length_sampler = LengthSampler(args.min_response_len, args.max_response_len)
+
+     # 9. training loop
+     generation_kwargs = {
+         "min_length": -1,  # do not force a minimum length
+         "top_p": args.top_p,
+         "temperature": args.temperature,
+         "do_sample": True,
+         "pad_token_id": tokenizer.eos_token_id,
+         "max_new_tokens": args.max_new_tokens,
+     }
+
+     for epoch in range(args.max_epochs):
+         print(f"Epoch {epoch + 1}/{args.max_epochs}")
+
+         for batch_idx, batch in enumerate(ppo_trainer.dataloader):
+             # get the queries' input_ids
+             query_tensors = batch["input_ids"]
+
+             # generate a response for every query
+             response_tensors = []
+             for query in query_tensors:
+                 # each query is already a 1-d tensor of token ids
+                 query = query.to(device)
+
+                 # generate a response; length_sampler draws the target length
+                 response = ppo_trainer.generate(
+                     query,
+                     return_prompt=False,
+                     length_sampler=response_length_sampler,
+                     **generation_kwargs
+                 )
+                 response_tensors.append(response.squeeze())
+
+             # decode the generated text
+             responses = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]
+             queries = [tokenizer.decode(q, skip_special_tokens=True) for q in query_tensors]
+
+             # compute rewards with the reward model;
+             # query and response are joined into the full text for the sentiment analysis
+             full_texts = [q + " " + r for q, r in zip(queries, responses)]
+             rewards = reward_model.get_reward(full_texts)
+
+             # convert to tensors
+             rewards = [torch.tensor(r, device=device) for r in rewards]
+
+             # run one PPO update step
+             stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+
+             # print statistics
+             if batch_idx % 10 == 0:
+                 print(f"Batch {batch_idx}, mean reward: {np.mean([r.item() for r in rewards]):.4f}")
+                 print(f"Stats: {stats}")
+                 print(f"Example query: {queries[0]}")
+                 print(f"Example response: {responses[0]}")
+                 print(f"Reward: {rewards[0].item():.4f}")
+                 print("-" * 50)
+
+         # save the model once per epoch
+         save_path = Path("ppo_models") / f"epoch_{epoch}"
+         ppo_trainer.save_pretrained(save_path)
+         tokenizer.save_pretrained(save_path)
+         print(f"Model saved to {save_path}")


 if __name__ == "__main__":
+     main()
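
The compute_advantages_and_returns method removed above implemented standard GAE: delta_t = r_t + gamma * V_{t+1} - V_t and A_t = delta_t + gamma * lambda * A_{t+1}, accumulated backwards over the sequence, with returns = advantages + values. A self-contained numeric sketch of the same recursion (toy values, not from this repo; gamma and lam match the defaults used elsewhere in these scripts):

import torch

gamma, lam = 1.0, 0.95
rewards = torch.tensor([0.0, 0.0, 1.0])  # reward only on the last token, as in the removed code
values = torch.tensor([0.2, 0.3, 0.4])   # value-head outputs per token

advantages = torch.zeros(3)
gae = 0.0
for t in reversed(range(3)):
    next_value = values[t + 1] if t < 2 else 0.0
    delta = rewards[t] + gamma * next_value - values[t]
    gae = delta + gamma * lam * gae
    advantages[t] = gae
returns = advantages + values
# advantages ≈ [0.7365, 0.6700, 0.6000]
# returns    ≈ [0.9365, 0.9700, 1.0000]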
examples/tutorials/rlhf/gpt2_sst2_generation/step_2_train_model.py ADDED
@@ -0,0 +1,172 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ """
+ Train a GPT-2 model on the SST-2 sentences so that it can generate random review snippets.
+ """
+ import argparse
+ import os
+ from pathlib import Path
+ import platform
+
+ if platform.system() in ("Windows", "Darwin"):
+     from project_settings import project_path, temp_directory
+ else:
+     project_path = os.path.abspath("../../../")
+     project_path = Path(project_path)
+     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+ from datasets import load_dataset
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+ from transformers import GPT2LMHeadModel
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model_name",
+         # default="openai-community/gpt2",
+         default=(project_path / "pretrained_models/openai-community/gpt2").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--dataset_path",
+         default="stanfordnlp/sst2",
+         type=str
+     )
+     parser.add_argument("--dataset_name", default=None, type=str)
+     parser.add_argument("--dataset_split", default=None, type=str)
+     parser.add_argument(
+         "--dataset_cache_dir",
+         default=(temp_directory / "hub_datasets").as_posix(),
+         type=str
+     )
+     parser.add_argument(
+         "--model_cache_dir",
+         default=(temp_directory / "hub_models").as_posix(),
+         type=str
+     )
+     parser.add_argument("--dataset_streaming", default=None, type=str)
+     parser.add_argument("--valid_dataset_size", default=1000, type=int)
+     parser.add_argument("--shuffle_buffer_size", default=5000, type=int)
+
+     parser.add_argument(
+         "--output_model_dir",
+         default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix(),
+         type=str
+     )
+
+     parser.add_argument(
+         "--num_workers",
+         default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
+         type=int
+     )
+     parser.add_argument(
+         "--device",
+         default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+         type=str
+     )
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     model = AutoModelForCausalLM.from_pretrained(args.model_name)
+     model = model.to(args.device)
+     tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+     tokenizer.pad_token = tokenizer.eos_token
+
+     dataset_dict = load_dataset(
+         path=args.dataset_path,
+         name=args.dataset_name,
+         split=args.dataset_split,
+         cache_dir=args.dataset_cache_dir,
+         # num_proc=args.num_workers if not args.dataset_streaming else None,
+         streaming=args.dataset_streaming,
+     )
+     train_dataset = dataset_dict["train"]
+     valid_dataset = dataset_dict["validation"]
+     # test_dataset = dataset_dict["test"]
+
+     def format_func(example):
+         sentence = example["sentence"]
+         sentence += tokenizer.eos_token
+         tokenized = tokenizer(sentence)
+         input_ids = tokenized["input_ids"]
+         attention_mask = tokenized["attention_mask"]
+         # print(input_ids)
+         # print(attention_mask)
+         result = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+         }
+         return result
+
+     train_dataset = train_dataset.map(
+         format_func,
+         batched=False,
+         remove_columns=train_dataset.column_names,
+     )
+     valid_dataset = valid_dataset.map(
+         format_func,
+         batched=False,
+         remove_columns=valid_dataset.column_names,
+     )
+     print(f"train_dataset size: {len(train_dataset)}")
+     print(f"valid_dataset size: {len(valid_dataset)}")
+
+     train_dataset = train_dataset.filter(
+         function=lambda x: 5 < len(x["input_ids"]) < 1024
+     )
+     valid_dataset = valid_dataset.filter(
+         function=lambda x: 5 < len(x["input_ids"]) < 1024
+     )
+     print(f"train_dataset size: {len(train_dataset)}")
+     print(f"valid_dataset size: {len(valid_dataset)}")
+
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer,
+         mlm=False
+     )
+
+     training_args = TrainingArguments(
+         output_dir=args.output_model_dir,
+         # overwrite_output_dir=True,
+         num_train_epochs=3,
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=16,
+         eval_strategy="steps",
+         eval_steps=100,
+         save_strategy="steps",
+         save_steps=100,
+         save_total_limit=2,
+         logging_steps=100,
+         learning_rate=5e-5,
+         warmup_steps=500,
+         weight_decay=0.01,
+         fp16=torch.cuda.is_available(),
+         dataloader_num_workers=args.num_workers or 0,
+         remove_unused_columns=False,
+         load_best_model_at_end=False,
+         # metric_for_best_model="eval_loss",
+         # greater_is_better=False,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         data_collator=data_collator,
+         train_dataset=train_dataset,
+         eval_dataset=valid_dataset,
+         tokenizer=tokenizer,
+     )
+
+     trainer.train()
+     trainer.save_model()
+     return
+
+
+ if __name__ == "__main__":
+     main()
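
With mlm=False, DataCollatorForLanguageModeling pads each batch and builds causal-LM labels from the input ids, setting padded positions to -100 so the loss ignores them. A minimal sketch of that behavior (gpt2 tokenizer assumed, as in the script):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([
    {"input_ids": [10, 11, 12]},
    {"input_ids": [10, 11]},       # shorter row gets padded
])
# batch["input_ids"]: the shorter row is padded with eos (id 50256)
# batch["labels"]:    a copy of input_ids, except pad positions become -100

Note one subtlety: because pad_token is set to eos_token here, the genuine eos appended in format_func is also masked to -100 in the labels, a known quirk of this collator setup.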
examples/tutorials/rlhf/gpt2_sst2_generation/step_3_generation.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import time
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
+
+ from project_settings import project_path
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3"),
+         # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
+         # default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3/checkpoint-5400"),
+         type=str
+     )
+     parser.add_argument(
+         "--max_new_tokens",
+         default=1024,  # 8192, 128
+         type=int, help="maximum number of generated tokens (note: not the model's actual long-text capability)"
+     )
+     parser.add_argument("--top_p", default=0.85, type=float, help="nucleus sampling threshold (0-1)")
+     parser.add_argument("--temperature", default=0.85, type=float, help="sampling temperature; controls randomness (0-1, higher is more random)")
+
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     if torch.cuda.is_available():
+         device = "cuda"
+     elif torch.backends.mps.is_available():
+         # device = "mps"
+         device = "cpu"
+     else:
+         device = "cpu"
+     print(f"device: {device}")
+
+     tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
+     model = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path)
+     model = model.eval().to(device)
+
+     tokenized = tokenizer(
+         # "this",
+         # "this is ",
+         "it 's ",
+         # "please listen ",
+         # "eldom has a movie",
+         # "thanks to scott 's charismatic",
+         return_tensors="pt"
+     )
+
+     streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+
+     generated_ids = model.generate(
+         inputs=tokenized["input_ids"], attention_mask=tokenized["attention_mask"],
+         max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
+         pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
+         top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
+         early_stopping=True,
+     )
+     # response = tokenizer.decode(generated_ids[0][len(tokenized["input_ids"][0]):], skip_special_tokens=True)
+     response = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+     print(response)
+     # print(generated_ids)
+     print(f"count: {generated_ids.shape}")
+
+     return
+
+
+ if __name__ == "__main__":
+     main()
examples/tutorials/rlhf/gpt2_sst2_ppo/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ trl==0.16.1
+ transformers==4.50.2
examples/tutorials/rlhf/gpt2_sst2_ppo/step_1_prepare_data.py ADDED
@@ -0,0 +1,58 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ """
+ Or use the command line:
+ pip install modelscope
+ modelscope download \
+     --model 'qgyd2021/gpt2-for-sequence-classification-sst2-reward' \
+     --local_dir '/root/autodl-tmp/trained_models/Qwen3-8B-sft-deepspeed'
+
+ python3 step_1_prepare_data.py \
+     --repo_id qgyd2021/gpt2-for-sequence-classification-sst2-reward \
+     --local_dir /root/autodl-tmp/OpenMiniMind/trained_models/gpt2-for-sequence-classification-sst2-reward
+
+ python3 step_1_prepare_data.py \
+     --repo_id qgyd2021/gpt2-sst2-generation-epoch-3 \
+     --local_dir /root/autodl-tmp/OpenMiniMind/trained_models/gpt2-sst2-generation-epoch-3
+
+ """
+ import argparse
+ import os
+ from pathlib import Path
+ import platform
+
+ if platform.system() in ("Windows", "Darwin"):
+     from project_settings import project_path, temp_directory
+ else:
+     project_path = os.path.abspath("../../../")
+     project_path = Path(project_path)
+     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+ from modelscope import snapshot_download
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--repo_id", default="qgyd2021/Qwen3-8B-sft-deepspeed", type=str)
+     parser.add_argument(
+         "--local_dir",
+         default=(temp_directory / "../trained_models/Qwen3-8B-sft-deepspeed").as_posix(),
+         type=str
+     )
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     snapshot_download(
+         model_id=args.repo_id,
+         local_dir=args.local_dir,
+     )
+     return
+
+
+ if __name__ == "__main__":
+     main()
examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_on_cpu.py ADDED
@@ -0,0 +1,217 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ """
+ PPO Training with TRL on the SST-2 dataset.
+ Based on https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer
+
+ (1) The policy model generates query_response and logits from the queries.
+ (2) Compute the logprob from the logits, then index out the probabilities of the response tokens.
+ (3) The reference model ref_policy computes ref_logits for query_response.
+ (4) Compute the ref_logprob from the ref_logits, then index out the probabilities of the response tokens.
+ (5) In query_response, every token from the first generated eos_token onward is replaced with pad_token.
+ (6) The value model value_model computes a value for every token of the response part of query_response.
+ (7) The reward model reward_model scores the last non-pad token of postprocessed_query_response.
+ (8) This yields:
+     kl = logprobs - ref_logprobs
+     non_score_reward = -args.kl_coef * kl
+     advantages
+     returns = advantages + values
+
+ """
+ import argparse
+ import os
+ from pathlib import Path
+ import platform
+
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification,
+     GPT2LMHeadModel, GPT2ForSequenceClassification,
+     DataCollatorWithPadding
+ )
+
+ # path configuration
+ if platform.system() in ("Windows", "Darwin"):
+     from project_settings import project_path, temp_directory
+ else:
+     project_path = Path(os.path.abspath("../../../"))
+     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
+
+ from trl import PPOTrainer, PPOConfig
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--sft_model_name", type=str,
+                         default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix())
+     parser.add_argument("--reward_model_name", type=str,
+                         default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix())
+     parser.add_argument("--dataset_path", default="stanfordnlp/sst2", type=str)
+     parser.add_argument("--dataset_cache_dir",
+                         default=(temp_directory / "hub_datasets").as_posix(), type=str)
+     parser.add_argument("--model_cache_dir",
+                         default=(temp_directory / "hub_models").as_posix(), type=str)
+
+     # training parameters
+
+     # generation parameters
+
+     parser.add_argument(
+         "--output_model_dir",
+         default=(project_path / "trained_models/gpt2-sst2-ppo").as_posix(),
+         type=str
+     )
+
+     # other
+     parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
+     parser.add_argument("--device", default="cpu", type=str)
+
+     return parser.parse_args()
+
+
+ def format_func(example, tokenizer):
+     sentence: str = example["sentence"]
+     # score: float = float(example["label"])
+     tokenized = tokenizer(sentence)
+     input_ids = tokenized["input_ids"]
+     attention_mask = tokenized["attention_mask"]
+     result = {
+         "input_ids": input_ids,
+         "attention_mask": attention_mask,
+     }
+     return result
+
+
+ def token_truncate(example, tokenizer):
+     input_ids = example["input_ids"]
+     attention_mask = example["attention_mask"]
+     input_ids = input_ids[:3]
+     attention_mask = attention_mask[:3]
+     # text = tokenizer.decode(input_ids)
+     result = {
+         "input_ids": input_ids,
+         "attention_mask": attention_mask,
+         # "text": text,
+     }
+     return result
+
+
+ def main():
+     args = get_args()
+
+     # device setup
+     device = torch.device(args.device)
+
+     # 1. load the tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.sft_model_name,
+         padding_side="left",  # important for generation tasks
+         cache_dir=args.model_cache_dir,
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+     print(f"eos_token: {tokenizer.eos_token}")
+     print(f"pad_token: {tokenizer.pad_token}")
+
+     model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
+     value_model = AutoModelForSequenceClassification.from_pretrained(
+         args.sft_model_name,
+         num_labels=1
+     )
+     value_model.transformer = model.transformer
+
+     ref_model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
+
+     reward_model = AutoModelForSequenceClassification.from_pretrained(
+         args.reward_model_name,
+         num_labels=1
+     )
+
+     dataset_dict = load_dataset(
+         path=args.dataset_path,
+         cache_dir=args.dataset_cache_dir,
+     )
+     train_dataset = dataset_dict["train"]
+     valid_dataset = dataset_dict["validation"]
+     # test_dataset = dataset_dict["test"]
+
+     train_dataset = train_dataset.map(
+         lambda example: format_func(example, tokenizer),
+         batched=False,
+         remove_columns=train_dataset.column_names,
+     )
+     valid_dataset = valid_dataset.map(
+         lambda example: format_func(example, tokenizer),
+         batched=False,
+         remove_columns=valid_dataset.column_names,
+     )
+     train_dataset = train_dataset.filter(
+         function=lambda x: len(x["input_ids"]) > 8
+     )
+     valid_dataset = valid_dataset.filter(
+         function=lambda x: len(x["input_ids"]) > 8
+     )
+     train_dataset = train_dataset.map(
+         lambda example: token_truncate(example, tokenizer),
+         batched=False,
+         remove_columns=train_dataset.column_names,
+     )
+     valid_dataset = valid_dataset.map(
+         lambda example: token_truncate(example, tokenizer),
+         batched=False,
+         remove_columns=valid_dataset.column_names,
+     )
+
+     ppo_config = PPOConfig(
+         output_dir=args.output_model_dir,
+         num_train_epochs=1,
+         eval_strategy="steps",
+         eval_steps=50,
+         save_strategy="steps",
+         save_steps=50,
+         save_total_limit=2,
+         logging_steps=50,
+         learning_rate=1e-5,
+         warmup_steps=50,
+         per_device_eval_batch_size=10,
+
+         num_mini_batches=2,
+         num_sample_generations=100,
+         # total_episodes=100000,  # maximum number of samples to train on
+         response_length=64,
+         # stop_token=tokenizer.eos_token,  # set only one of stop_token and stop_token_id
+         stop_token_id=tokenizer.eos_token_id,
+         batch_size=16,
+
+         num_ppo_epochs=1,
+         whiten_rewards=True,
+         gamma=1.0,
+         lam=0.95,
+
+         dataset_num_proc=args.num_workers,
+     )
+
+     data_collator = DataCollatorWithPadding(tokenizer)
+
+     ppo_trainer = PPOTrainer(
+         args=ppo_config,
+         processing_class=tokenizer,
+         model=model,
+         ref_model=ref_model,
+         reward_model=reward_model,
+         train_dataset=train_dataset,
+         value_model=value_model,
+         data_collator=data_collator,
+         eval_dataset=valid_dataset,
+     )
+     ppo_trainer.train()
+     ppo_trainer.save_model()
+
+     return
+
+
+ if __name__ == "__main__":
+     main()
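
Steps (1)-(8) in the docstring describe TRL's internal reward shaping: a per-token KL penalty against the reference policy, plus the scalar reward-model score added on the final response token. A minimal sketch of that combination (toy numbers, not from this repo; the real implementation lives inside trl's PPOTrainer, and kl_coef is assumed for illustration):

import torch

kl_coef = 0.05                                    # assumed value; PPOConfig has its own default
logprobs = torch.tensor([-1.2, -0.8, -2.0])       # policy logprobs of the response tokens
ref_logprobs = torch.tensor([-1.0, -0.9, -1.5])   # reference-policy logprobs
score = torch.tensor(0.7)                         # reward-model score of the full sequence

kl = logprobs - ref_logprobs                      # per-token KL estimate: [-0.2, 0.1, -0.5]
non_score_reward = -kl_coef * kl                  # KL penalty as a per-token reward
rewards = non_score_reward.clone()
rewards[-1] += score                              # the scalar score lands on the last response token
# rewards ≈ [0.01, -0.005, 0.725]; these then feed GAE, and returns = advantages + values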
examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_two_gpu.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ PPO Training with TRL on SST-2 dataset
5
+ 基于 https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer 的实现
6
+
7
+
8
+ 双卡 V100
9
+
10
+ """
11
+ import argparse
12
+ import os
13
+ from pathlib import Path
14
+ import platform
15
+
16
+ import torch
17
+ from datasets import load_dataset
18
+ from transformers import (
19
+ AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification,
20
+ GPT2LMHeadModel, GPT2ForSequenceClassification,
21
+ DataCollatorWithPadding
22
+ )
23
+
24
+ # 路径配置
25
+ if platform.system() in ("Windows", "Darwin"):
26
+ from project_settings import project_path, temp_directory
27
+ else:
28
+ project_path = Path(os.path.abspath("../../../"))
29
+ temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
30
+
31
+ from trl import PPOTrainer, PPOConfig
32
+
33
+
34
+ def get_args():
35
+ parser = argparse.ArgumentParser()
36
+ parser.add_argument("--sft_model_name", type=str,
37
+ default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix())
38
+ parser.add_argument("--reward_model_name", type=str,
39
+ default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix())
40
+ parser.add_argument("--dataset_path", default="stanfordnlp/sst2", type=str)
41
+ parser.add_argument("--dataset_cache_dir",
42
+ default=(temp_directory / "hub_datasets").as_posix(), type=str)
43
+ parser.add_argument("--model_cache_dir",
44
+ default=(temp_directory / "hub_models").as_posix(), type=str)
45
+
46
+ # 训练参数
47
+
48
+ # 生成参数
49
+
50
+ parser.add_argument(
51
+ "--output_model_dir",
52
+ default=(project_path / "trained_models/gpt2-sst2-ppo").as_posix(),
53
+ type=str
54
+ ),
55
+
56
+ # 其他
57
+ parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
58
+
59
+ return parser.parse_args()
60
+
61
+
62
+ def format_func(example, tokenizer):
63
+ sentence: str = example["sentence"]
64
+ # score: float = float(example["label"])
65
+ tokenized = tokenizer(sentence)
66
+ input_ids = tokenized["input_ids"]
67
+ attention_mask = tokenized["attention_mask"]
68
+ result = {
69
+ "input_ids": input_ids,
70
+ "attention_mask": attention_mask,
71
+ }
72
+ return result
73
+
74
+
75
+ def token_truncate(example, tokenizer):
76
+ input_ids = example["input_ids"]
77
+ attention_mask = example["attention_mask"]
78
+ input_ids = input_ids[:3]
79
+ attention_mask = attention_mask[:3]
80
+ # text = tokenizer.decode(input_ids)
81
+ result = {
82
+ "input_ids": input_ids,
83
+ "attention_mask": attention_mask,
84
+ # "text": text,
85
+ }
86
+ return result
87
+
88
+
89
+ def main():
90
+ args = get_args()
91
+
92
+ tokenizer = AutoTokenizer.from_pretrained(
93
+ args.sft_model_name,
94
+ padding_side="left", # 对于生成任务很重要
95
+ cache_dir=args.model_cache_dir,
96
+ )
97
+ if tokenizer.pad_token is None:
98
+ tokenizer.pad_token = tokenizer.eos_token
99
+ tokenizer.pad_token_id = tokenizer.eos_token_id
100
+ print(f"eos_token: {tokenizer.eos_token}")
101
+ print(f"pad_token: {tokenizer.pad_token}")
102
+
103
+ model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
104
+ value_model = AutoModelForSequenceClassification.from_pretrained(
105
+ args.sft_model_name,
106
+ num_labels=1
107
+ )
108
+ value_model.transformer = model.transformer
109
+
110
+ ref_model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
111
+ reward_model = AutoModelForSequenceClassification.from_pretrained(
112
+ args.reward_model_name,
113
+ num_labels=1
114
+ )
115
+
116
+ dataset_dict = load_dataset(
117
+ path=args.dataset_path,
118
+ cache_dir=args.dataset_cache_dir,
119
+ )
120
+ train_dataset = dataset_dict["train"]
121
+ valid_dataset = dataset_dict["validation"]
122
+ # test_dataset = dataset_dict["test"]
123
+
124
+ train_dataset = train_dataset.map(
125
+ lambda example: format_func(example, tokenizer),
126
+ batched=False,
127
+ remove_columns=train_dataset.column_names,
128
+ )
129
+ valid_dataset = valid_dataset.map(
130
+ lambda example: format_func(example, tokenizer),
131
+ batched=False,
132
+ remove_columns=valid_dataset.column_names,
133
+ )
134
+ train_dataset = train_dataset.filter(
135
+ function=lambda x: len(x["input_ids"]) > 8
136
+ )
137
+ valid_dataset = valid_dataset.filter(
138
+ function=lambda x: len(x["input_ids"]) > 8
139
+ )
140
+ train_dataset = train_dataset.map(
141
+ lambda example: token_truncate(example, tokenizer),
142
+ batched=False,
143
+ remove_columns=train_dataset.column_names,
144
+ )
145
+ valid_dataset = valid_dataset.map(
146
+ lambda example: token_truncate(example, tokenizer),
147
+ batched=False,
148
+ remove_columns=valid_dataset.column_names,
149
+ )
150
+
151
+ ppo_config = PPOConfig(
152
+ output_dir=args.output_model_dir,
153
+ num_train_epochs=1,
154
+ eval_strategy="steps",
155
+ eval_steps=50,
156
+ save_strategy="steps",
157
+ save_steps=50,
158
+ save_total_limit=2,
159
+ logging_steps=50,
160
+ learning_rate=1e-5,
161
+ warmup_steps=50,
162
+ per_device_eval_batch_size=10,
163
+
164
+ num_mini_batches=2,
165
+ num_sample_generations=100,
166
+ # total_episodes=100000, # maximum number of samples to train on.
167
+ response_length=1024,
168
+ # stop_token=tokenizer.eos_token, # set only one of stop_token and stop_token_id.
169
+ stop_token_id=tokenizer.eos_token_id,
170
+ batch_size=16,
171
+
172
+ num_ppo_epochs=1,
173
+ whiten_rewards=True,
174
+ gamma=1.0,
175
+ lam=0.95,
176
+
177
+ dataset_num_proc=args.num_workers,
178
+ )
179
+
180
+ data_collator = DataCollatorWithPadding(tokenizer)
181
+
182
+ ppo_trainer = PPOTrainer(
183
+ args=ppo_config,
184
+ processing_class=tokenizer,
185
+ model=model,
186
+ ref_model=ref_model,
187
+ reward_model=reward_model,
188
+ train_dataset=train_dataset,
189
+ value_model=value_model,
190
+ data_collator=data_collator,
191
+ eval_dataset=valid_dataset,
192
+ )
193
+
194
+ ppo_trainer.train()
195
+ ppo_trainer.save_model()
196
+
197
+ return
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
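
A minimal sketch (not part of the commit, model name illustrative) of what the `value_model.transformer = model.transformer` assignment in the script above achieves: the value head and the policy end up sharing a single GPT-2 backbone, so backbone gradients from the value loss also reach the policy's transformer.

import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification

policy = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
value = AutoModelForSequenceClassification.from_pretrained(
    "openai-community/gpt2", num_labels=1
)
value.transformer = policy.transformer  # tie the backbones
assert value.transformer is policy.transformer  # one module object, two heads
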
examples/tutorials/rlhf/gpt2_sst2_ppo/step_3_generation.py ADDED
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import time
5
+
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
8
+
9
+ from project_settings import project_path
10
+
11
+
12
+ def get_args():
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument(
15
+ "--pretrained_model_name_or_path",
16
+ default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-1250"),
17
+ # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
18
+ # default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3/checkpoint-5400"),
19
+ type=str
20
+ )
21
+ parser.add_argument(
22
+ "--max_new_tokens",
23
+ default=512, # 8192, 128
24
+ type=int, help="最大生成长度(注意:并非模型实际长文本能力)"
25
+ )
26
+ parser.add_argument("--top_p", default=0.85, type=float, help="nucleus采样阈值(0-1)")
27
+ parser.add_argument("--temperature", default=0.85, type=float, help="生成温度,控制随机性(0-1,越大越随机)")
28
+
29
+ args = parser.parse_args()
30
+ return args
31
+
32
+
33
+ def main():
34
+ args = get_args()
35
+
36
+ if torch.cuda.is_available():
37
+ device = "cuda"
38
+ elif torch.backends.mps.is_available():
39
+ # device = "mps"
40
+ device = "cpu"
41
+ else:
42
+ device = "cpu"
43
+ print(f"device: {device}")
44
+
45
+ tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
46
+ model = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path)
47
+ model = model.eval().to(device)
48
+
49
+ tokenized = tokenizer(
50
+ # "this",
51
+ "this is ",
52
+ # "please listen ",
53
+ # "eldom has a movie",
54
+ # "thanks to scott 's charismatic",
55
+ return_tensors="pt"
56
+ )
57
+
58
+ streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
59
+
60
+ generated_ids = model.generate(
61
+ inputs=tokenized["input_ids"], attention_mask=tokenized["attention_mask"],
62
+ max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
63
+ pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
64
+ top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
65
+ early_stopping=True,
66
+ )
67
+ # response = tokenizer.decode(generated_ids[0][len(tokenized["input_ids"][0]):], skip_special_tokens=True)
68
+ response = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
69
+ print(response)
70
+ # print(generated_ids)
71
+ print(f"count: {generated_ids.shape}")
72
+
73
+ return
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
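
A quick way to sanity-check the PPO checkpoint loaded above is to score its continuations with the step_2 reward model; a sketch assuming that checkpoint exists locally (path as used elsewhere in this commit, sample text illustrative):

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

reward_dir = "trained_models/gpt2-for-sequence-classification-sst2-reward"
tokenizer = AutoTokenizer.from_pretrained(reward_dir)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_dir).eval()

text = "this is a warm, funny and altogether winning film"  # e.g. a PPO continuation
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    reward = reward_model(**inputs).logits[0, 0].item()  # single regression logit
print(f"reward: {reward:.3f}")  # should trend upward as PPO optimizes for positivity
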
examples/tutorials/{dpo/ultrachat-sft/step_2_train_sft_model2.py → rlhf/gpt2_sst2_reward/step_2_train_model.py} RENAMED
@@ -1,3 +1,5 @@
 
 
1
  import argparse
2
  import os
3
  from pathlib import Path
@@ -11,26 +13,25 @@ else:
11
  temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
12
 
13
  from datasets import load_dataset
 
14
  import torch
15
- from torch.utils.data import DataLoader
16
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling
17
- from transformers.models.llama.modeling_llama import LlamaModel
18
- from transformers.models.llama.tokenization_llama import LlamaTokenizer
19
- from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
20
 
21
 
22
  def get_args():
23
  parser = argparse.ArgumentParser()
24
  parser.add_argument(
25
  "--model_name",
26
- default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix() if platform.system() in ("Windows", "Darwin") else "jingyaogong/MiniMind2",
27
- # default=(project_path / "pretrained_models/Qwen/Qwen2.5-0.5B").as_posix() if platform.system() in ("Windows", "Darwin") else "Qwen/Qwen2.5-0.5B",
28
  type=str
29
  ),
30
  parser.add_argument(
31
  "--dataset_path",
32
- # default="HuggingFaceH4/ultrachat_200k",
33
- default="miyuki2026/tutorials",
34
  type=str
35
  ),
36
  parser.add_argument("--dataset_name", default=None, type=str),
@@ -45,15 +46,13 @@ def get_args():
45
  default=(temp_directory / "hub_models").as_posix(),
46
  type=str
47
  ),
48
- parser.add_argument("--dataset_streaming", action="store_true"),
49
  parser.add_argument("--valid_dataset_size", default=1000, type=int),
50
  parser.add_argument("--shuffle_buffer_size", default=5000, type=int),
51
 
52
- parser.add_argument("--max_seq_length", default=2048, type=int)
53
-
54
  parser.add_argument(
55
  "--output_model_dir",
56
- default=(project_path / "trained_models/qwen2_5-0_5B-ultrachat-sft").as_posix(),
57
  type=str
58
  ),
59
  parser.add_argument(
@@ -61,105 +60,110 @@ def get_args():
61
  default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
62
  type=int
63
  ),
64
  args = parser.parse_args()
65
  return args
66
 
67
68
  def main():
69
  args = get_args()
70
 
71
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
72
-
73
- model = AutoModelForCausalLM.from_pretrained(args.model_name)
74
- model = model.to(device)
75
  tokenizer = AutoTokenizer.from_pretrained(args.model_name)
76
  if tokenizer.pad_token is None:
77
  tokenizer.pad_token = tokenizer.eos_token
78
  tokenizer.pad_token_id = tokenizer.eos_token_id
79
 
80
  dataset_dict = load_dataset(
81
  path=args.dataset_path,
82
  name=args.dataset_name,
83
- data_dir="finetome-100k",
84
- # split="train_sft",
85
- # split="test_sft",
86
  cache_dir=args.dataset_cache_dir,
87
- # streaming=True,
 
88
  )
89
- dataset = dataset_dict["train"]
90
-
91
- if args.dataset_streaming:
92
- valid_dataset = dataset.take(args.valid_dataset_size)
93
- train_dataset = dataset.skip(args.valid_dataset_size)
94
- train_dataset = train_dataset.shuffle(buffer_size=args.shuffle_buffer_size, seed=None)
95
- else:
96
- dataset = dataset.train_test_split(test_size=args.valid_dataset_size, seed=None)
97
- train_dataset = dataset["train"]
98
- valid_dataset = dataset["test"]
99
- train_dataset = valid_dataset
100
-
101
- response_template = "<|im_end|>\n<|im_start|>assistant"
102
- instruction_template = "<|im_end|>\n<|im_start|>user"
103
- data_collator = DataCollatorForCompletionOnlyLM(
104
- response_template=response_template,
105
- instruction_template=instruction_template,
106
- tokenizer=tokenizer,
107
- mlm=False,
108
  )
109
 
110
- def formatting_prompts_func(examples):
111
- # print(examples)
112
- formated_text: str = tokenizer.apply_chat_template(
113
- conversation=examples["conversations"],
114
- tokenize=False,
115
- add_generation_prompt=False,
116
- )
117
- return formated_text
118
-
119
- sft_config = SFTConfig(
120
  output_dir=args.output_model_dir,
 
121
  num_train_epochs=1,
122
- # per_device_train_batch_size=8,
123
- # gradient_accumulation_steps=4,
 
 
124
  save_strategy="steps",
125
- save_steps=100,
126
  save_total_limit=2,
127
- logging_steps=100,
128
- learning_rate=2e-5,
129
- warmup_steps=100,
130
- lr_scheduler_type="cosine",
131
- fp16=True if torch.cuda.is_available() else False,
132
- gradient_checkpointing=False,
133
- optim="adamw_torch",
134
- report_to="none",
135
- max_length=1024,
136
- dataset_kwargs=dict(
137
- skip_prepare_dataset=True
138
- ),
139
  )
140
 
141
- trainer = SFTTrainer(
142
  model=model,
143
- args=sft_config,
144
  data_collator=data_collator,
145
  train_dataset=train_dataset,
146
- formatting_func=formatting_prompts_func,
147
  )
148
 
149
- # start training
150
- print("开始训练...")
151
  trainer.train()
152
-
153
- # save the model
154
- print(f"保存模型到: {args.output_model_dir}")
155
  trainer.save_model()
156
- tokenizer.save_pretrained(args.output_model_dir)
157
-
158
- print("训练完成!")
159
-
160
  return
161
 
162
 
163
  if __name__ == "__main__":
164
  main()
165
-
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
  import argparse
4
  import os
5
  from pathlib import Path
 
13
  temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
14
 
15
  from datasets import load_dataset
16
+ import numpy as np
17
  import torch
18
+ from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
19
+ DataCollatorWithPadding,
20
+ Trainer, TrainingArguments
21
+ )
 
22
 
23
 
24
  def get_args():
25
  parser = argparse.ArgumentParser()
26
  parser.add_argument(
27
  "--model_name",
28
+ # default="openai-community/gpt2",
29
+ default=(project_path / "pretrained_models/openai-community/gpt2").as_posix(),
30
  type=str
31
  ),
32
  parser.add_argument(
33
  "--dataset_path",
34
+ default="stanfordnlp/sst2",
 
35
  type=str
36
  ),
37
  parser.add_argument("--dataset_name", default=None, type=str),
 
46
  default=(temp_directory / "hub_models").as_posix(),
47
  type=str
48
  ),
49
+ parser.add_argument("--dataset_streaming", default=None, type=str),
50
  parser.add_argument("--valid_dataset_size", default=1000, type=int),
51
  parser.add_argument("--shuffle_buffer_size", default=5000, type=int),
52
 
 
 
53
  parser.add_argument(
54
  "--output_model_dir",
55
+ default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
56
  type=str
57
  ),
58
  parser.add_argument(
 
60
  default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
61
  type=int
62
  ),
63
+ parser.add_argument(
64
+ "--device",
65
+ default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
66
+ type=str
67
+ ),
68
  args = parser.parse_args()
69
  return args
70
 
71
 
72
+ def format_func(example, tokenizer):
73
+ sentence: str = example["sentence"]
74
+ labels: float = float(example["label"])
75
+ tokenized = tokenizer(sentence)
76
+ input_ids = tokenized["input_ids"]
77
+ attention_mask = tokenized["attention_mask"]
78
+ result = {
79
+ "input_ids": input_ids,
80
+ "attention_mask": attention_mask,
81
+ "labels": labels,
82
+ }
83
+ return result
84
+
85
+
86
  def main():
87
  args = get_args()
88
 
 
 
 
 
89
  tokenizer = AutoTokenizer.from_pretrained(args.model_name)
90
  if tokenizer.pad_token is None:
91
  tokenizer.pad_token = tokenizer.eos_token
92
  tokenizer.pad_token_id = tokenizer.eos_token_id
93
 
94
+ model = AutoModelForSequenceClassification.from_pretrained(
95
+ args.model_name,
96
+ num_labels=1,
97
+ pad_token_id=tokenizer.pad_token_id
98
+ )
99
+ print(f"model.num_labels: {model.num_labels}")
100
+
101
  dataset_dict = load_dataset(
102
  path=args.dataset_path,
103
  name=args.dataset_name,
104
+ split=args.dataset_split,
 
 
105
  cache_dir=args.dataset_cache_dir,
106
+ # num_proc=args.num_workers if not args.dataset_streaming else None,
107
+ streaming=args.dataset_streaming,
108
  )
109
+ train_dataset = dataset_dict["train"]
110
+ valid_dataset = dataset_dict["validation"]
111
+ # test_dataset = dataset_dict["test"]
112
+
113
+ train_dataset = train_dataset.map(
114
+ lambda example: format_func(example, tokenizer),
115
+ batched=False,
116
+ remove_columns=train_dataset.column_names,
 
 
 
 
 
 
 
 
 
 
 
117
  )
118
+ valid_dataset = valid_dataset.map(
119
+ lambda example: format_func(example, tokenizer),
120
+ batched=False,
121
+ remove_columns=valid_dataset.column_names,
122
+ )
123
+ train_dataset = train_dataset.filter(
124
+ function=lambda x: len(x["input_ids"]) > 6
125
+ )
126
+ valid_dataset = valid_dataset.filter(
127
+ function=lambda x: len(x["input_ids"]) > 6
128
+ )
129
+ data_collator = DataCollatorWithPadding(tokenizer)
130
 
131
+ training_args = TrainingArguments(
 
 
 
 
 
 
 
 
 
132
  output_dir=args.output_model_dir,
133
+ # overwrite_output_dir=True,
134
  num_train_epochs=1,
135
+ per_device_train_batch_size=16,
136
+ per_device_eval_batch_size=16,
137
+ eval_strategy="steps",
138
+ eval_steps=200,
139
  save_strategy="steps",
140
+ save_steps=200,
141
  save_total_limit=2,
142
+ logging_steps=200,
143
+ learning_rate=5e-5,
144
+ warmup_steps=200,
145
+ weight_decay=0.01,
146
+ fp16=torch.cuda.is_available(),
147
+ dataloader_num_workers=args.num_workers or 0,
148
+ remove_unused_columns=False,
149
+ load_best_model_at_end=True,
150
+ metric_for_best_model="eval_loss",
151
+ greater_is_better=False,
152
+ logging_dir=(Path(args.output_model_dir) / "logs").as_posix(),
 
153
  )
154
 
155
+ trainer = Trainer(
156
  model=model,
157
+ args=training_args,
158
  data_collator=data_collator,
159
  train_dataset=train_dataset,
160
+ eval_dataset=valid_dataset,
161
  )
162
 
 
 
163
  trainer.train()
 
 
 
164
  trainer.save_model()
 
 
 
 
165
  return
166
 
167
 
168
  if __name__ == "__main__":
169
  main()
 
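One detail of the reward training above is worth spelling out: with num_labels=1 and float labels, transformers treats GPT2ForSequenceClassification as a regression problem and applies an MSE loss, which is why format_func casts the SST-2 label to float. A self-contained sketch (untrained base model, sample sentence illustrative):

import torch
from transformers import AutoTokenizer, GPT2ForSequenceClassification

tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
tok.pad_token = tok.eos_token
model = GPT2ForSequenceClassification.from_pretrained(
    "openai-community/gpt2", num_labels=1, pad_token_id=tok.pad_token_id
)
batch = tok("a touching and funny film", return_tensors="pt")
out = model(**batch, labels=torch.tensor([1.0]))  # float target -> regression branch
print(out.loss)  # MSE between the scalar pooled logit and the label
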
examples/tutorials/rlhf/gpt2_sst2_reward/step_3_test_model.py ADDED
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ import platform
7
+ from typing import Any, Dict, List, Optional, Union, Tuple
8
+
9
+ if platform.system() in ("Windows", "Darwin"):
10
+ from project_settings import project_path, temp_directory
11
+ else:
12
+ project_path = os.path.abspath("../../../")
13
+ project_path = Path(project_path)
14
+ temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
15
+
16
+ from datasets import load_dataset
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn as nn
20
+ from transformers import (AutoTokenizer, AutoModelForSequenceClassification, GPT2ForSequenceClassification
21
+ )
22
+
23
+
24
+ def get_args():
25
+ parser = argparse.ArgumentParser()
26
+ parser.add_argument(
27
+ "--model_name",
28
+ default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
29
+ type=str
30
+ ),
31
+ parser.add_argument(
32
+ "--dataset_path",
33
+ default="stanfordnlp/sst2",
34
+ type=str
35
+ ),
36
+ parser.add_argument("--dataset_name", default=None, type=str),
37
+ parser.add_argument("--dataset_split", default=None, type=str),
38
+ parser.add_argument(
39
+ "--dataset_cache_dir",
40
+ default=(temp_directory / "hub_datasets").as_posix(),
41
+ type=str
42
+ ),
43
+ parser.add_argument(
44
+ "--model_cache_dir",
45
+ default=(temp_directory / "hub_models").as_posix(),
46
+ type=str
47
+ ),
48
+ parser.add_argument("--dataset_streaming", default=None, type=str),
49
+
50
+ parser.add_argument(
51
+ "--num_workers",
52
+ default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
53
+ type=int
54
+ ),
55
+ parser.add_argument(
56
+ "--device",
57
+ default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
58
+ type=str
59
+ ),
60
+ args = parser.parse_args()
61
+ return args
62
+
63
+
64
+ class RewardModelWrapper:
65
+ """
66
+ Reward-model wrapper used to score generated text.
67
+ """
68
+
69
+ def __init__(self, reward_model_name, tokenizer, device):
70
+ self.device = device
71
+ self.tokenizer = tokenizer
72
+ # load your GPT2RewardModel or a standard classification model
73
+ from transformers import GPT2ForSequenceClassification
74
+ self.model = GPT2ForSequenceClassification.from_pretrained(reward_model_name).to(device)
75
+ self.model.eval()
76
+
77
+ def get_reward(self, texts: List[str]) -> List[float]:
78
+ """
79
+ Compute reward scores for texts (SST-2 sentiment classification).
80
+ """
81
+ inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
82
+ with torch.no_grad():
83
+ outputs = self.model(**inputs)
84
+ # SST-2 is a binary classification task; use the positive-class logit as the reward (assumes a 2-label head, unlike the num_labels=1 model trained in step_2)
85
+ logits = outputs.logits
86
+ # softmax over the logits yields the positive-class probability
87
+ probs = torch.softmax(logits, dim=-1)
88
+ # assume label 1 is the positive class
89
+ rewards = probs[:, 1].cpu().tolist()
90
+ return rewards
91
+
92
+
93
+ def main():
94
+ args = get_args()
95
+
96
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
97
+
98
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name)
99
+ if tokenizer.pad_token is None:
100
+ tokenizer.pad_token = tokenizer.eos_token
101
+ tokenizer.pad_token_id = tokenizer.eos_token_id
102
+
103
+ reward_model = AutoModelForSequenceClassification.from_pretrained(
104
+ args.model_name,
105
+ )
106
+ print(f"reward_model.num_labels: {reward_model.num_labels}")
107
+
108
+ dataset_dict = load_dataset(
109
+ path=args.dataset_path,
110
+ name=args.dataset_name,
111
+ split=args.dataset_split,
112
+ cache_dir=args.dataset_cache_dir,
113
+ # num_proc=args.num_workers if not args.dataset_streaming else None,
114
+ streaming=args.dataset_streaming,
115
+ )
116
+ # dataset = dataset_dict["train"]
117
+ dataset = dataset_dict["validation"]
118
+ # dataset = dataset_dict["test"]
119
+
120
+
121
+ for example in dataset:
122
+ sentence: str = example["sentence"]
123
+ score: float = float(example["label"])
124
+
125
+ outputs = tokenizer(
126
+ sentence,
127
+ return_tensors="pt"
128
+ )
129
+ input_ids = outputs["input_ids"]
130
+
131
+ with torch.no_grad():
132
+ rewards = reward_model.forward(input_ids)
133
+ logits = rewards.logits
134
+ logits = logits.detach().cpu().numpy()
135
+ reward = logits[0][0]
136
+ msg = f"reward: {reward}\nscore: {score}\nsentence: {sentence}\n"
137
+ print(msg)
138
+ return
139
+
140
+
141
+ if __name__ == "__main__":
142
+ main()
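
For context on the single-sentence scoring loop above: GPT2ForSequenceClassification pools its output at each sequence's last non-padding token (located via pad_token_id), so batched scoring with right padding yields the same per-sentence logits. A sketch with an untrained base model and illustrative inputs:

import torch
from transformers import AutoTokenizer, GPT2ForSequenceClassification

tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
tok.pad_token = tok.eos_token
model = GPT2ForSequenceClassification.from_pretrained(
    "openai-community/gpt2", num_labels=1, pad_token_id=tok.pad_token_id
).eval()

batch = tok(["a gripping story", "dull and lifeless from start to finish"],
            padding=True, return_tensors="pt")
with torch.no_grad():
    scores = model(**batch).logits[:, 0]  # one pooled logit per sequence
print(scores)
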
examples/tutorials/rlhf/gpt2_sst2_reward/step_4_test_model.py ADDED
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ import platform
7
+ from typing import Any, Dict, List, Optional, Union, Tuple
8
+
9
+ if platform.system() in ("Windows", "Darwin"):
10
+ from project_settings import project_path, temp_directory
11
+ else:
12
+ project_path = os.path.abspath("../../../")
13
+ project_path = Path(project_path)
14
+ temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")
15
+
16
+ from datasets import load_dataset
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn as nn
20
+ from transformers import (AutoTokenizer, AutoModelForSequenceClassification
21
+ )
22
+ from trl.trainer.utils import get_reward
23
+ from transformers import GPT2ForSequenceClassification
24
+
25
+
26
+ def get_args():
27
+ parser = argparse.ArgumentParser()
28
+ parser.add_argument(
29
+ "--model_name",
30
+ default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
31
+ type=str
32
+ ),
33
+ parser.add_argument(
34
+ "--dataset_path",
35
+ default="stanfordnlp/sst2",
36
+ type=str
37
+ ),
38
+ parser.add_argument("--dataset_name", default=None, type=str),
39
+ parser.add_argument("--dataset_split", default=None, type=str),
40
+ parser.add_argument(
41
+ "--dataset_cache_dir",
42
+ default=(temp_directory / "hub_datasets").as_posix(),
43
+ type=str
44
+ ),
45
+ parser.add_argument(
46
+ "--model_cache_dir",
47
+ default=(temp_directory / "hub_models").as_posix(),
48
+ type=str
49
+ ),
50
+ parser.add_argument("--dataset_streaming", default=None, type=str),
51
+
52
+ parser.add_argument(
53
+ "--num_workers",
54
+ default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
55
+ type=int
56
+ ),
57
+ parser.add_argument(
58
+ "--device",
59
+ default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
60
+ type=str
61
+ ),
62
+ args = parser.parse_args()
63
+ return args
64
+
65
+
66
+ def main():
67
+ args = get_args()
68
+
69
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
70
+
71
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name)
72
+ if tokenizer.pad_token is None:
73
+ tokenizer.pad_token = tokenizer.eos_token
74
+ tokenizer.pad_token_id = tokenizer.eos_token_id
75
+
76
+ reward_model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
77
+
78
+ dataset_dict = load_dataset(
79
+ path=args.dataset_path,
80
+ name=args.dataset_name,
81
+ split=args.dataset_split,
82
+ cache_dir=args.dataset_cache_dir,
83
+ # num_proc=args.num_workers if not args.dataset_streaming else None,
84
+ streaming=args.dataset_streaming,
85
+ )
86
+ # dataset = dataset_dict["train"]
87
+ dataset = dataset_dict["validation"]
88
+ # dataset = dataset_dict["test"]
89
+
90
+ count = 0
91
+ batch_text = list()
92
+ for example in dataset:
93
+
94
+ sentence: str = example["sentence"]
95
+ # labels: int = int(example["label"])
96
+ batch_text.append(sentence)
97
+ count += 1
98
+ if count >= 4:
99
+ break
100
+
101
+ outputs = tokenizer(
102
+ batch_text,
103
+ padding=True,
104
+ truncation=True,
105
+ return_tensors="pt"
106
+ )
107
+ input_ids = outputs["input_ids"]
108
+ attention_mask = outputs["attention_mask"]
109
+
110
+ # last_token_idx = attention_mask.sum(dim=1) - 1
111
+ # print(last_token_idx)
112
+
113
+ reward_logits, score, sequence_lengths = get_reward(
114
+ model=reward_model,
115
+ query_responses=input_ids,
116
+ pad_token_id=tokenizer.pad_token_id,
117
+ context_length=0,
118
+ )
119
+ print(reward_logits)
120
+ print(score)
121
+ print(sequence_lengths)
122
+
123
+ return
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
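
A closing note on trl.trainer.utils.get_reward as called above: per TRL's implementation (subject to version drift), it returns the per-position reward logits, the logit pooled at each sequence's final non-padding position, and those positions themselves. A simplified sketch of the pooling step, assuming right padding and toy tensors:

import torch

def pool_last_token(reward_logits: torch.Tensor, input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    """Pick the reward logit at each sequence's last non-pad token."""
    last = (input_ids != pad_token_id).sum(dim=1) - 1  # (batch,) index of last real token
    return reward_logits[torch.arange(input_ids.size(0)), last, 0]

# toy example: batch of 2, seq_len 4, pad_token_id 0
logits = torch.randn(2, 4, 1)  # stand-in for per-position reward logits
ids = torch.tensor([[5, 6, 7, 0], [5, 6, 0, 0]])
print(pool_last_token(logits, ids, pad_token_id=0))  # logits at positions 2 and 1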