Commit · 8f7ca17
Parent(s): de47717
update
- examples/download/{download_hub.py → download_hub_hf.py} +0 -0
- examples/playground/{chat.py → chat_minimind.py} +0 -0
- examples/playground/chat_modelscope.py +144 -0
- examples/playground/generation.py +4 -4
- examples/tutorials/dpo/ultrafeedback-dpo/requirements.txt +6 -0
- examples/tutorials/dpo/{ultrachat-sft → ultrafeedback-dpo}/step_1_prepare_data.py +0 -0
- examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu.py +208 -0
- examples/tutorials/rlhf/gpt2_sst2/step_5_ppo_rlhf.py +175 -361
- examples/tutorials/rlhf/gpt2_sst2_generation/step_2_train_model.py +172 -0
- examples/tutorials/rlhf/gpt2_sst2_generation/step_3_generation.py +78 -0
- examples/tutorials/rlhf/gpt2_sst2_ppo/requirements.txt +2 -0
- examples/tutorials/rlhf/gpt2_sst2_ppo/step_1_prepare_data.py +58 -0
- examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_on_cpu.py +217 -0
- examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_two_gpu.py +201 -0
- examples/tutorials/rlhf/gpt2_sst2_ppo/step_3_generation.py +77 -0
- examples/tutorials/{dpo/ultrachat-sft/step_2_train_sft_model2.py → rlhf/gpt2_sst2_reward/step_2_train_model.py} +82 -78
- examples/tutorials/rlhf/gpt2_sst2_reward/step_3_test_model.py +142 -0
- examples/tutorials/rlhf/gpt2_sst2_reward/step_4_test_model.py +127 -0
examples/download/{download_hub.py → download_hub_hf.py}
RENAMED
File without changes
examples/playground/{chat.py → chat_minimind.py}
RENAMED
File without changes
examples/playground/chat_modelscope.py
ADDED
@@ -0,0 +1,144 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://github.com/jingyaogong/minimind/blob/master/eval_llm.py
"""
import argparse
import os
from pathlib import Path
import platform
import time

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

import torch
from modelscope import AutoTokenizer, AutoModelForCausalLM
from transformers import TextStreamer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        default="qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument(
        "--max_new_tokens",
        default=8192,  # 8192, 128
        type=int, help="maximum number of generated tokens (note: not the model's actual long-context capability)"
    )
    parser.add_argument("--top_p", default=0.85, type=float, help="nucleus sampling threshold (0-1)")
    parser.add_argument("--temperature", default=0.85, type=float, help="generation temperature controlling randomness (0-1, higher is more random)")

    parser.add_argument(
        "--show_speed",
        default=1,  # 1, 0
        type=int, help="show decode speed (tokens/s)"
    )

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        # device = "mps"
        device = "cpu"
    else:
        device = "cpu"
    print(f"device: {device}")

    model = AutoModelForCausalLM.from_pretrained(
        args.pretrained_model_name_or_path,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = model.eval().to(device)
    # print(tokenizer)
    # print(model)

    prompts = [
        "What are your special skills?",
        "Why is the sky blue?",
        "Write a Python function that computes the Fibonacci sequence",
        'Explain the basic process of "photosynthesis"',
        "If it rains tomorrow, how should I get around?",
        "Compare the pros and cons of cats and dogs as pets",
        "Explain what machine learning is",
        "Recommend some Chinese foods"
    ]
    input_mode = int(input("[0] automatic test\n[1] manual input\n"))

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # conversation = list()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"}
    ]
    while True:
        if input_mode == 0:
            if len(prompts) == 0:
                break
            user_input = prompts.pop(0)
            print(f"💬: {user_input}")
        else:
            user_input = input("💬: ")
        user_input = str(user_input).strip()
        conversation.append({"role": "user", "content": user_input})
        inputs = tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer.__call__(
            inputs,
            return_tensors="pt",
            truncation=True
        )
        inputs = inputs.to(device)
        # print(inputs)

        print("🤖: ", end="")
        st = time.time()
        generated_ids = model.generate(
            inputs=inputs["input_ids"], attention_mask=inputs["attention_mask"],
            max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
            pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
            top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
        )
        response = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        conversation.append({"role": "assistant", "content": response})
        gen_tokens = len(generated_ids[0]) - len(inputs["input_ids"][0])
        print(f"\n[Speed]: {gen_tokens / (time.time() - st):.2f} tokens/s\n\n") if args.show_speed else print("\n\n")

    return


if __name__ == "__main__":
    main()
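Note: the apply_chat_template call in this script flattens the conversation list into a single prompt string before tokenization. A minimal sketch of what that step produces, assuming a Qwen2.5-style ChatML template (the model name here is an illustrative assumption; the exact markers depend on the tokenizer's chat template):

# Hypothetical illustration, not part of the commit:
from modelscope import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Why is the sky blue?"},
]
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape of the output (ChatML-style):
# <|im_start|>system\nYou are a helpful assistant<|im_end|>\n
# <|im_start|>user\nWhy is the sky blue?<|im_end|>\n
# <|im_start|>assistant\n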
examples/playground/generation.py
CHANGED
@@ -16,8 +16,8 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--pretrained_model_name_or_path",
-
-        default=(project_path / "trained_models/gpt2-sst2-
+        default=(project_path / "trained_models/gpt2-sst2-generation"),
+        # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
         type=str
     )
     parser.add_argument(

@@ -50,9 +50,9 @@ def main():

     tokenized = tokenizer(
         # "this",
-
+        "this is ",
         # "who needs mind-bending",
-        "eldom has a movie",
+        # "eldom has a movie",
         # "thanks to scott 's charismatic",
         return_tensors="pt"
     )
examples/tutorials/dpo/ultrafeedback-dpo/requirements.txt
ADDED
@@ -0,0 +1,6 @@
transformers
torch
modelscope
datasets
trl
deepspeed
examples/tutorials/dpo/{ultrachat-sft → ultrafeedback-dpo}/step_1_prepare_data.py
RENAMED
File without changes
examples/tutorials/dpo/ultrafeedback-dpo/step_2_train_dpo_model_single_gpu.py
ADDED
@@ -0,0 +1,208 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://huggingface.co/docs/trl/v0.16.1/en/sft_trainer

Full-parameter fine-tuning on a single V100 32G GPU:
python3 step_2_train_dpo_model_single_gpu.py

"""
import argparse
import os
from pathlib import Path
import platform

# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

debug_mode = True if platform.system() in ("Windows", "Darwin") else False
print(f"debug_mode: {debug_mode}")

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from datasets import load_dataset
import torch

from modelscope import AutoModelForCausalLM
from transformers import AutoTokenizer
from trl import DPOConfig, DPOTrainer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default=(project_path / "pretrained_models/jingyaogong/MiniMind2").as_posix() if debug_mode else "qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
        type=str
    )
    parser.add_argument(
        "--dataset_path",
        default="HuggingFaceH4/ultrafeedback_binarized",
        # default="miyuki2026/tutorials" if debug_mode else "HuggingFaceH4/ultrachat_200k",
        type=str
    )
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument(
        "--output_model_dir",
        default=(temp_directory / "trained_models/qwen2_5-0_5B-ultrafeedback-dpo-single-gpu").as_posix(),
        type=str
    )
    parser.add_argument(
        "--num_workers",
        default=None if debug_mode else os.cpu_count() // 2,
        type=int
    )
    args = parser.parse_args()
    return args


def format_func(examples, tokenizer):
    chosen = examples["chosen"]
    rejected = examples["rejected"]

    chosen_prompt = chosen[:-1]
    chosen_response = chosen[-1]

    rejected_prompt = rejected[:-1]
    rejected_response = rejected[-1]

    chosen_prompt_text = tokenizer.apply_chat_template(
        conversation=chosen_prompt,
        tokenize=False,
        add_generation_prompt=True,  # DPO needs the generation prompt so the model knows where to start generating
    )
    rejected_prompt_text = tokenizer.apply_chat_template(
        conversation=rejected_prompt,
        tokenize=False,
        add_generation_prompt=True,  # DPO needs the generation prompt so the model knows where to start generating
    )
    if chosen_prompt_text != rejected_prompt_text:
        raise AssertionError()

    chosen_response_role = chosen_response["role"]
    chosen_response_text = chosen_response["content"]
    if chosen_response_role != "assistant":
        raise AssertionError()

    rejected_response_role = rejected_response["role"]
    rejected_response_text = rejected_response["content"]
    if rejected_response_role != "assistant":
        raise AssertionError()

    result = {
        "prompt": chosen_prompt_text,
        "chosen": chosen_response_text,
        "rejected": rejected_response_text,
    }
    return result


def main():
    args = get_args()

    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    ref_model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
    )
    model = model.to(device)
    ref_model = ref_model.to(device)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(model)
    print(ref_model)
    print(tokenizer)

    dataset_dict = load_dataset(
        path=args.dataset_path,
        cache_dir=args.dataset_cache_dir,
    )
    train_dataset = dataset_dict["train_prefs"]
    # test_dataset = dataset_dict["test_prefs"]

    train_dataset = train_dataset.map(
        lambda x: format_func(x, tokenizer),
        batched=False,
        num_proc=args.num_workers,
        remove_columns=train_dataset.column_names,
    )

    dpo_config = DPOConfig(
        output_dir=args.output_model_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1 if debug_mode else 2,
        gradient_accumulation_steps=1 if debug_mode else 8,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=2e-5,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        fp16=True if torch.cuda.is_available() else False,
        gradient_checkpointing=False,  # set to True if memory is tight
        optim="adamw_torch",
        report_to="none",
        max_length=1024 if debug_mode else 2048,  # maximum length of prompt + chosen
        max_prompt_length=512 if debug_mode else 1024,  # maximum length of the prompt
        # DPO-specific parameters
        beta=0.1,  # DPO temperature; controls confidence in the preference data
        remove_unused_columns=False,
        dataloader_pin_memory=False,
    )

    trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,  # provide the reference model
        args=dpo_config,
        train_dataset=train_dataset,
        # DPOTrainer handles the data automatically; no data_collator is needed
    )

    # start training
    print("Starting DPO training...")
    trainer.train()

    # save the model
    print(f"Saving model to: {args.output_model_dir}")
    trainer.save_model()
    tokenizer.save_pretrained(args.output_model_dir)

    print("DPO training finished!")
    return


if __name__ == "__main__":
    main()
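Note: format_func above flattens one ultrafeedback_binarized record into the prompt/chosen/rejected triple that DPOTrainer expects. A minimal sketch of that transformation on a hand-written toy record (hypothetical data, only shaped like the real dataset's rows):

# Hypothetical toy record shaped like a HuggingFaceH4/ultrafeedback_binarized row:
example = {
    "chosen": [
        {"role": "user", "content": "Name a prime number."},
        {"role": "assistant", "content": "7 is a prime number."},
    ],
    "rejected": [
        {"role": "user", "content": "Name a prime number."},
        {"role": "assistant", "content": "9 is a prime number."},
    ],
}
# After format_func(example, tokenizer), DPOTrainer receives:
# {
#     "prompt": "<chat-templated user turn, ending with the generation prompt>",
#     "chosen": "7 is a prime number.",
#     "rejected": "9 is a prime number.",
# }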
examples/tutorials/rlhf/gpt2_sst2/step_5_ppo_rlhf.py
CHANGED
@@ -1,12 +1,11 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 """
-
+PPO Training with TRL on the SST-2 dataset
+Based on the implementation at https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer
 """
 import argparse
-import copy
 import os
-import random
 from pathlib import Path
 import platform
 from typing import Optional, Tuple, List, Dict, Union

@@ -14,12 +13,12 @@ from typing import Optional, Tuple, List, Dict, Union
 import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
 from datasets import load_dataset
 from transformers import (
-    AutoTokenizer,
-    …
+    AutoTokenizer,
+    GPT2LMHeadModel,
+    DataCollatorWithPadding
 )

 # path configuration

@@ -29,6 +28,8 @@ else:
     project_path = Path(os.path.abspath("../../../"))
     temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

+from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
+from trl.core import LengthSampler  # added import: the loop below uses it; trl.core provides it in classic (pre-0.8) releases
+

 def get_args():
     parser = argparse.ArgumentParser()

@@ -44,7 +45,7 @@ def get_args():
     parser.add_argument("--valid_dataset_size", default=1000, type=int)

     # training parameters
-    parser.add_argument("--batch_size", default=16, type=int)
+    parser.add_argument("--batch_size", default=16, type=int)
     parser.add_argument("--ppo_epochs", default=4, type=int)
     parser.add_argument("--mini_batch_size", default=4, type=int)
     parser.add_argument("--kl_beta", default=0.2, type=float)

@@ -63,371 +64,184 @@ def get_args():

     # other
     parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
-    parser.add_argument("--device", default="cpu", type=str)
+    parser.add_argument("--device", default="cpu", type=str)

     return parser.parse_args()


-"""
-…
-def
-…
-    """Reward model that predicts a reward for every token"""
-
-    def __init__(self, config: GPT2Config):
-        super().__init__(config)
-        self.transformer = GPT2Model(config)
-        self.reward_head = nn.Linear(config.hidden_size, 1)
-        self.post_init()
-
-    def forward(self, input_ids, attention_mask=None):
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            output_hidden_states=True
-        )
-        rewards = self.reward_head(outputs.hidden_states[-1]).squeeze(-1)
-        return torch.sigmoid(rewards)  # [batch, seq_len]
-
-
-class PPOAgent:
-    """PPO training agent that wraps all of the training logic"""
-
-    def __init__(self, args):
-        self.args = args
-        self.device = torch.device(args.device)
-
-        # load the tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(args.sft_model_name)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-
-        # load the models
-        print("Loading models...")
-        self.actor_critic = GPT2ActorCritic.from_pretrained(args.sft_model_name).to(self.device)
-        self.reward_model = GPT2RewardModel.from_pretrained(args.reward_model_name).to(self.device)
-        self.reward_model.eval()
-
-        # reference model (frozen)
-        self.ref_model = copy.deepcopy(self.actor_critic).to(self.device)
-        self.ref_model.eval()
-
-        # optimizer
-        self.optimizer = torch.optim.Adam(self.actor_critic.parameters(), lr=args.lr)
-
-        # training state
-        self.training_step = 0
-
-    def prepare_dataset(self):
-        """Prepare the training dataset"""
-        print("Loading dataset...")
-        dataset = load_dataset(
-            path=self.args.dataset_path,
-            cache_dir=self.args.dataset_cache_dir,
-            split="train"
-        )
-
-        def filter_and_truncate(example):
-            # keep only sentences that are long enough
-            tokens = self.tokenizer(example["sentence"])["input_ids"]
-            if len(tokens) <= 8:
-                return False
-
-            # randomly keep the first 2-6 tokens as the query
-            example["query_ids"] = tokens[:random.randint(2, 6)]
-            return True
-
-        dataset = dataset.filter(filter_and_truncate)
-        dataset = dataset.select(range(min(len(dataset), 5000)))  # use a small dataset on CPU
-
-        return dataset
-
-    def collect_rollouts(self, batch):
-        """Collect one round of rollout data"""
-        query_ids_list = []
-        response_ids_list = []
-        rewards_list = []
-
-        for i in range(len(batch["query_ids"])):
-            query_ids = torch.tensor(batch["query_ids"][i]).to(self.device)
-            query_ids_list.append(query_ids)
-
-            # generate a response
-            with torch.no_grad():
-                response_len = random.randint(
-                    self.args.min_response_len,
-                    self.args.max_response_len
-                )
-                full_ids = self.actor_critic.generate(
-                    input_ids=query_ids.unsqueeze(0),
-                    max_new_tokens=response_len,
-                    do_sample=True,
-                    top_p=self.args.top_p,
-                    temperature=self.args.temperature,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                )[0]
-
-            response_ids = full_ids[len(query_ids):]
-            response_ids_list.append(response_ids)
-
-            # compute the reward (take only the last token's reward)
-            reward = self.reward_model(
-                full_ids.unsqueeze(0),
-                attention_mask=torch.ones_like(full_ids).unsqueeze(0)
-            )[0, -1]
-            # rescale to [-1, 1]
-            rewards_list.append(2 * (reward - 0.5))
-
-        return query_ids_list, response_ids_list, rewards_list
-
-    def compute_advantages_and_returns(self, log_probs, values, rewards, masks):
-        """Compute GAE advantages and returns"""
-        seq_len = rewards.shape[1]
-        advantages = torch.zeros_like(rewards)
-        returns = torch.zeros_like(rewards)
-
-        gae = 0
-        for t in reversed(range(seq_len)):
-            if t == seq_len - 1:
-                next_value = 0
-            else:
-                next_value = values[:, t + 1]
-
-            delta = rewards[:, t] + self.args.gamma * next_value - values[:, t]
-            gae = delta + self.args.gamma * self.args.lam * gae
-            advantages[:, t] = gae
-            returns[:, t] = advantages[:, t] + values[:, t]
-
-        # whiten only over the valid positions
-        advantages = self.masked_whiten(advantages, masks)
-        return advantages, returns
-
-    def masked_whiten(self, values, mask):
-        """Whitening with a mask"""
-        mask = mask.float()
-        mean = (values * mask).sum() / mask.sum()
-        var = (((values - mean) * mask) ** 2).sum() / mask.sum()
-        whitened = (values - mean) * torch.rsqrt(var + 1e-8)
-        return whitened * mask
-
-    def ppo_step(self, batch_data):
-        """One PPO update step"""
-        (query_ids_list, response_ids_list, old_log_probs,
-         advantages, returns, masks) = batch_data
-
-        # concatenate the full query + response
-        full_ids_list = []
-        for q, r in zip(query_ids_list, response_ids_list):
-            full_ids_list.append(torch.cat([q, r]))
-
-        # padding
-        padded = self.tokenizer.pad(
-            {"input_ids": full_ids_list},
-            padding=True,
-            return_tensors="pt"
-        )
-        input_ids = padded["input_ids"].to(self.device)
-        attention_mask = padded["attention_mask"].to(self.device)
-
-        # forward pass
-        logits, values = self.actor_critic(input_ids, attention_mask)
-
-        # compute the new log_probs
-        log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
-        log_probs = torch.gather(
-            log_probs, 2,
-            input_ids[:, 1:].unsqueeze(-1)
-        ).squeeze(-1)
-
-        # keep only the response part of the log_probs
-        response_start = [len(q) for q in query_ids_list]
-        new_log_probs = []
-        for i, start in enumerate(response_start):
-            new_log_probs.append(log_probs[i, start - 1:start - 1 + len(response_ids_list[i])])
-        new_log_probs = torch.cat(new_log_probs)
-
-        # compute the ratio and the PPO loss
-        old_log_probs = old_log_probs.detach()
-        ratio = torch.exp(new_log_probs - old_log_probs)
-
-        # clipped policy loss
-        surr1 = ratio * advantages
-        surr2 = torch.clamp(ratio, 1 - self.args.clip_epsilon,
-                            1 + self.args.clip_epsilon) * advantages
-        policy_loss = -torch.min(surr1, surr2).mean()
-
-        # value loss
-        value_pred = []
-        for i, start in enumerate(response_start):
-            value_pred.append(values[i, start - 1:start - 1 + len(response_ids_list[i])])
-        value_pred = torch.cat(value_pred)
-        value_loss = F.mse_loss(value_pred, returns)
-
-        # total loss
-        loss = policy_loss + 0.5 * value_loss
-
-        return loss, policy_loss, value_loss
-
-    def train_epoch(self, dataset):
-        """Train for one epoch"""
-        total_policy_loss = 0
-        total_value_loss = 0
-        num_batches = 0
-
-        for batch_idx in range(0, len(dataset), self.args.batch_size):
-            # 1. collect data
-            batch = dataset[batch_idx:batch_idx + self.args.batch_size]
-            query_ids_list, response_ids_list, rewards_list = self.collect_rollouts(batch)
-
-            # 2. compute the old log_probs and values
-            old_log_probs_list = []
-            values_list = []
-            masks_list = []
-
-            with torch.no_grad():
-                for q_ids, r_ids in zip(query_ids_list, response_ids_list):
-                    full_ids = torch.cat([q_ids, r_ids]).unsqueeze(0).to(self.device)
-                    attn_mask = torch.ones_like(full_ids)
-
-                    logits, values = self.actor_critic(full_ids, attn_mask)
-
-                    # compute the response part of the log_probs
-                    log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
-                    log_probs = torch.gather(
-                        log_probs, 2,
-                        full_ids[:, 1:].unsqueeze(-1)
-                    ).squeeze(-1)
-
-                    start = len(q_ids) - 1
-                    end = start + len(r_ids)
-                    old_log_probs_list.append(log_probs[0, start:end])
-                    values_list.append(values[0, start:end])
-
-                    # create the mask
-                    mask = torch.zeros(len(r_ids))
-                    mask[-1] = 1  # only the last token carries a real reward
-                    masks_list.append(mask)
-
-            # convert to tensors
-            old_log_probs = torch.cat(old_log_probs_list).to(self.device)
-            values = torch.cat(values_list).to(self.device)
-            masks = torch.cat(masks_list).to(self.device)
-            rewards = torch.zeros_like(values).to(self.device)
-
-            # set the rewards (the environment reward is added only at the last token)
-            for i, (r, mask) in enumerate(zip(rewards_list, masks_list)):
-                if mask[-1] > 0:
-                    # KL penalty
-                    kl = old_log_probs[i] - old_log_probs[i]  # simplified here; in practice use the ref_model
-                    kl_penalty = -self.args.kl_beta * kl
-                    rewards[i] = kl_penalty + r
-
-            # 3. compute advantages and returns
-            advantages, returns = self.compute_advantages_and_returns(
-                old_log_probs.unsqueeze(0),
-                values.unsqueeze(0),
-                rewards.unsqueeze(0),
-                masks.unsqueeze(0)
-            )
-
-            # 4. several PPO updates
-            batch_data = (query_ids_list, response_ids_list, old_log_probs,
-                          advantages.squeeze(0), returns.squeeze(0), masks)
-
-            for _ in range(self.args.ppo_epochs):
-                loss, policy_loss, value_loss = self.ppo_step(batch_data)
-
-                self.optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1.0)
-                self.optimizer.step()
-
-                total_policy_loss += policy_loss.item()
-                total_value_loss += value_loss.item()
-                num_batches += 1
-                self.training_step += 1
-
-            if batch_idx % 100 == 0:
-                print(f"Batch {batch_idx}/{len(dataset)}: "
-                      f"policy_loss={total_policy_loss / num_batches:.4f}, "
-                      f"value_loss={total_value_loss / num_batches:.4f}")
-
-        return total_policy_loss / num_batches, total_value_loss / num_batches
-
-    def train(self):
-        """Main training loop"""
-        dataset = self.prepare_dataset()
-        print(f"Dataset size: {len(dataset)}")
-
-        for epoch in range(self.args.max_epochs):
-            print(f"\n=== Epoch {epoch + 1}/{self.args.max_epochs} ===")
-            policy_loss, value_loss = self.train_epoch(dataset)
-            print(f"Epoch {epoch + 1} finished: "
-                  f"policy_loss={policy_loss:.4f}, value_loss={value_loss:.4f}")
+def build_dataset(tokenizer, dataset_path, dataset_cache_dir, valid_dataset_size):
+    """
+    Build the SST-2 dataset and return the tokenized queries
+    """
+    dataset = load_dataset(
+        dataset_path,
+        cache_dir=dataset_cache_dir,
+        split="train"
+    )
+
+    # take only the first valid_dataset_size rows for the demo
+    dataset = dataset.select(range(min(valid_dataset_size, len(dataset))))
+
+    def tokenize_function(examples):
+        return tokenizer(examples["sentence"], truncation=True, max_length=128)
+
+    dataset = dataset.map(tokenize_function, batched=True)
+    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+
+    return dataset
+
+
+class RewardModelWrapper:
+    """
+    Wrapper around the reward model, used to score the generated text
+    """
+
+    def __init__(self, reward_model_name, tokenizer, device):
+        self.device = device
+        self.tokenizer = tokenizer
+        # load your GPT2RewardModel or a standard classifier
+        from transformers import GPT2ForSequenceClassification
+        self.model = GPT2ForSequenceClassification.from_pretrained(reward_model_name).to(device)
+        self.model.eval()
+
+    def get_reward(self, texts: List[str]) -> List[float]:
+        """
+        Compute reward scores for the texts (SST-2 sentiment classification)
+        """
+        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            # SST-2 is binary classification; use the positive-sentiment logits as the reward
+            logits = outputs.logits
+            # softmax over the logits to get the positive-class probability
+            probs = torch.softmax(logits, dim=-1)
+            # assume label 1 is the positive class
+            rewards = probs[:, 1].cpu().tolist()
+        return rewards


 def main():
     args = get_args()
     print("PPO Training with CPU")
-    print(f"Arguments: {args}")
-
-    #
-    …
-    print(f"Model saved to {output_dir}")
+
+    # device setup
+    device = torch.device(args.device)
+
+    # 1. load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.sft_model_name, cache_dir=args.model_cache_dir)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # 2. build the dataset
+    dataset = build_dataset(
+        tokenizer,
+        args.dataset_path,
+        args.dataset_cache_dir,
+        args.valid_dataset_size
+    )
+
+    # 3. load the model (TRL's AutoModelForCausalLMWithValueHead)
+    # this automatically adds a value head on top of the base LM
+    model = AutoModelForCausalLMWithValueHead.from_pretrained(args.sft_model_name)
+    model.to(device)
+
+    # 4. load the reference model (used for the KL-divergence term)
+    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(args.sft_model_name)
+    ref_model.to(device)
+
+    # 5. load the reward model
+    reward_model = RewardModelWrapper(args.reward_model_name, tokenizer, device)
+
+    # 6. configure PPO
+    # note: the keyword names below follow the classic (pre-0.8) trl PPOConfig;
+    # cliprange and init_kl_coef replace the clip_epsilon/kl_penalty/device
+    # spellings of the original commit, which that PPOConfig does not accept
+    ppo_config = PPOConfig(
+        model_name=args.sft_model_name,
+        learning_rate=args.lr,
+        batch_size=args.batch_size,
+        mini_batch_size=args.mini_batch_size,
+        ppo_epochs=args.ppo_epochs,
+        cliprange=args.clip_epsilon,
+        gamma=args.gamma,
+        lam=args.lam,
+        init_kl_coef=args.kl_beta,
+        log_with=None,  # can be set to "wandb" etc.
+        project_kwargs={"logging_dir": "./logs"},
+    )
+
+    # 7. initialize the PPO trainer
+    ppo_trainer = PPOTrainer(
+        config=ppo_config,
+        model=model,
+        ref_model=ref_model,
+        tokenizer=tokenizer,
+        dataset=dataset,
+        data_collator=DataCollatorWithPadding(tokenizer)
+    )
+
+    # 8. response-length sampler (uniform between min and max)
+    response_length_sampler = LengthSampler(args.min_response_len, args.max_response_len)
+
+    # 9. training loop
+    generation_kwargs = {
+        "min_length": -1,  # do not force a minimum length
+        "top_p": args.top_p,
+        "temperature": args.temperature,
+        "do_sample": True,
+        "pad_token_id": tokenizer.eos_token_id,
+        "max_new_tokens": args.max_new_tokens,
+    }
+
+    for epoch in range(args.max_epochs):
+        print(f"Epoch {epoch + 1}/{args.max_epochs}")
+
+        for batch_idx, batch in enumerate(ppo_trainer.dataloader):
+            # get the query input_ids
+            query_tensors = batch["input_ids"]
+
+            # generate responses with the policy model
+            response_tensors = []
+            for query in query_tensors:
+                query = query.to(device)
+                # generate a response; in classic trl, ppo_trainer.generate takes a
+                # 1-D query tensor plus a length sampler and generation kwargs
+                # (the original commit called respond_to_batch here, whose signature
+                # in trl.core does not accept a length_sampler)
+                response = ppo_trainer.generate(
+                    query,
+                    length_sampler=response_length_sampler,
+                    return_prompt=False,
+                    **generation_kwargs
+                )
+                response_tensors.append(response.squeeze())
+
+            # decode the generated text
+            responses = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]
+            queries = [tokenizer.decode(q, skip_special_tokens=True) for q in query_tensors]
+
+            # score with the reward model; query and response are concatenated
+            # and the full text is run through the sentiment classifier
+            full_texts = [q + " " + r for q, r in zip(queries, responses)]
+            rewards = reward_model.get_reward(full_texts)
+
+            # convert to tensors
+            rewards = [torch.tensor(r, device=device) for r in rewards]
+
+            # run one PPO update step
+            stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+
+            # print statistics
+            if batch_idx % 10 == 0:
+                mean_reward = torch.stack(rewards).mean().item()
+                print(f"Batch {batch_idx}, mean reward: {mean_reward:.4f}")
+                print(f"Stats: {stats}")
+                print(f"Example query: {queries[0]}")
+                print(f"Example response: {responses[0]}")
+                print(f"Reward: {rewards[0].item():.4f}")
+                print("-" * 50)
+
+        # save the model once per epoch
+        save_path = Path("ppo_models") / f"epoch_{epoch}"
+        ppo_trainer.save_pretrained(save_path)
+        tokenizer.save_pretrained(save_path)
+        print(f"Model saved to {save_path}")


 if __name__ == "__main__":
     main()
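Note: the training loop above leans on trl.core.LengthSampler, which newer trl releases no longer ship. If it is missing from your installed version, an equivalent sampler is a few lines (a minimal stand-in, matching the classic behavior of sampling uniformly from range(min_value, max_value)):

import numpy as np

class UniformLengthSampler:
    """Drop-in stand-in for trl.core.LengthSampler: sample a response length uniformly."""
    def __init__(self, min_value, max_value):
        self.values = list(range(min_value, max_value))

    def __call__(self):
        return np.random.choice(self.values)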
examples/tutorials/rlhf/gpt2_sst2_generation/step_2_train_model.py
ADDED
@@ -0,0 +1,172 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Train a GPT-2 model on SST-2 sentences so that it can generate random review-like text.
"""
import argparse
import os
from pathlib import Path
import platform

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import GPT2LMHeadModel


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="openai-community/gpt2",
        default=(project_path / "pretrained_models/openai-community/gpt2").as_posix(),
        type=str
    )
    parser.add_argument(
        "--dataset_path",
        default="stanfordnlp/sst2",
        type=str
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument("--dataset_streaming", default=None, type=str)
    parser.add_argument("--valid_dataset_size", default=1000, type=int)
    parser.add_argument("--shuffle_buffer_size", default=5000, type=int)

    parser.add_argument(
        "--output_model_dir",
        default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix(),
        type=str
    )

    parser.add_argument(
        "--num_workers",
        default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
        type=int
    )
    parser.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",  # fixed: was a torch.device default with type=int
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    model = AutoModelForCausalLM.from_pretrained(args.model_name)
    model = model.to(args.device)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    train_dataset = dataset_dict["train"]
    valid_dataset = dataset_dict["validation"]
    # test_dataset = dataset_dict["test"]

    def format_func(example):
        sentence = example["sentence"]
        sentence += tokenizer.eos_token
        tokenized = tokenizer(sentence)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        # print(input_ids)
        # print(attention_mask)
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return result

    train_dataset = train_dataset.map(
        format_func,
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        format_func,
        batched=False,
        remove_columns=valid_dataset.column_names,
    )
    print(f"train_dataset size: {len(train_dataset)}")
    print(f"valid_dataset size: {len(valid_dataset)}")

    train_dataset = train_dataset.filter(
        function=lambda x: 5 < len(x["input_ids"]) < 1024
    )
    valid_dataset = valid_dataset.filter(
        function=lambda x: 5 < len(x["input_ids"]) < 1024
    )
    print(f"train_dataset size: {len(train_dataset)}")
    print(f"valid_dataset size: {len(valid_dataset)}")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer,
        mlm=False
    )

    training_args = TrainingArguments(
        output_dir=args.output_model_dir,
        # overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=args.num_workers or 0,
        remove_unused_columns=False,
        load_best_model_at_end=False,
        # metric_for_best_model="eval_loss",
        # greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model()
    return


if __name__ == "__main__":
    main()
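Note: in the script above, DataCollatorForLanguageModeling(mlm=False) is what turns padded input_ids into causal-LM labels: labels are a copy of input_ids with pad positions set to -100 so they are ignored by the loss. A minimal illustration (token ids are GPT-2's "hello" and " world"):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = collator([
    {"input_ids": [31373, 995]},  # "hello world"
    {"input_ids": [31373]},       # "hello" (padded to length 2)
])
print(batch["labels"])
# tensor([[31373,   995],
#         [31373,  -100]])
# caveat: because pad_token is the eos_token here, the eos appended by
# format_func is also masked to -100, so the model is never explicitly
# trained to emit eos at the end of a sentence.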
examples/tutorials/rlhf/gpt2_sst2_generation/step_3_generation.py
ADDED
@@ -0,0 +1,78 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

from project_settings import project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3"),
        # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
        # default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3/checkpoint-5400"),
        type=str
    )
    parser.add_argument(
        "--max_new_tokens",
        default=1024,  # 8192, 128
        type=int, help="maximum number of generated tokens (note: not the model's actual long-context capability)"
    )
    parser.add_argument("--top_p", default=0.85, type=float, help="nucleus sampling threshold (0-1)")
    parser.add_argument("--temperature", default=0.85, type=float, help="generation temperature controlling randomness (0-1, higher is more random)")

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        # device = "mps"
        device = "cpu"
    else:
        device = "cpu"
    print(f"device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path)
    model = model.eval().to(device)

    tokenized = tokenizer(
        # "this",
        # "this is ",
        "it 's ",
        # "please listen ",
        # "eldom has a movie",
        # "thanks to scott 's charismatic",
        return_tensors="pt"
    )
    tokenized = tokenized.to(device)  # fixed: inputs must live on the same device as the model

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

    generated_ids = model.generate(
        inputs=tokenized["input_ids"], attention_mask=tokenized["attention_mask"],
        max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
        pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
        top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
        early_stopping=True,
    )
    # response = tokenizer.decode(generated_ids[0][len(tokenized["input_ids"][0]):], skip_special_tokens=True)
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    print(response)
    # print(generated_ids)
    print(f"count: {generated_ids.shape}")

    return


if __name__ == "__main__":
    main()
examples/tutorials/rlhf/gpt2_sst2_ppo/requirements.txt
ADDED
@@ -0,0 +1,2 @@
trl==0.16.1
transformers==4.50.2
examples/tutorials/rlhf/gpt2_sst2_ppo/step_1_prepare_data.py
ADDED
@@ -0,0 +1,58 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Or use the command line:
pip install modelscope
modelscope download \
    --model 'qgyd2021/gpt2-for-sequence-classification-sst2-reward' \
    --local_dir '/root/autodl-tmp/trained_models/Qwen3-8B-sft-deepspeed'

python3 step_1_prepare_data.py \
    --repo_id qgyd2021/gpt2-for-sequence-classification-sst2-reward \
    --local_dir /root/autodl-tmp/OpenMiniMind/trained_models/gpt2-for-sequence-classification-sst2-reward

python3 step_1_prepare_data.py \
    --repo_id qgyd2021/gpt2-sst2-generation-epoch-3 \
    --local_dir /root/autodl-tmp/OpenMiniMind/trained_models/gpt2-sst2-generation-epoch-3


"""
import argparse
import os
from pathlib import Path
import platform

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from modelscope import snapshot_download


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo_id", default="qgyd2021/Qwen3-8B-sft-deepspeed", type=str)
    parser.add_argument(
        "--local_dir",
        default=(temp_directory / "../trained_models/Qwen3-8B-sft-deepspeed").as_posix(),
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    snapshot_download(
        model_id=args.repo_id,
        local_dir=args.local_dir,
    )
    return


if __name__ == "__main__":
    main()
examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_on_cpu.py
ADDED
@@ -0,0 +1,217 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
PPO Training with TRL on the SST-2 dataset.
Implementation based on https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer


(1) The policy model generates query_response and logits from the queries.
(2) Compute logprobs from the logits, then index out the probabilities of the response tokens.
(3) The reference model ref_policy computes ref_logits for query_response.
(4) Compute ref_logprobs from ref_logits, then index out the probabilities of the response tokens.
(5) In query_response, every token from the first generated eos_token onward is replaced with pad_token.
(6) The value model value_model computes a value for each token of the response part of query_response.
(7) The reward model reward_model scores the last non-pad token of postprocessed_query_response.
(8) This yields:
(9) kl = logprobs - ref_logprobs
    non_score_reward = -args.kl_coef * kl
    advantages
    returns = advantages + values

"""
import argparse
import os
from pathlib import Path
import platform

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification,
    GPT2LMHeadModel, GPT2ForSequenceClassification,
    DataCollatorWithPadding
)

# Path configuration
if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = Path(os.path.abspath("../../../"))
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from trl import PPOTrainer, PPOConfig


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sft_model_name", type=str,
                        default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix())
    parser.add_argument("--reward_model_name", type=str,
                        default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix())
    parser.add_argument("--dataset_path", default="stanfordnlp/sst2", type=str)
    parser.add_argument("--dataset_cache_dir",
                        default=(temp_directory / "hub_datasets").as_posix(), type=str)
    parser.add_argument("--model_cache_dir",
                        default=(temp_directory / "hub_models").as_posix(), type=str)

    # Training parameters

    # Generation parameters

    parser.add_argument(
        "--output_model_dir",
        default=(project_path / "trained_models/gpt2-sst2-ppo").as_posix(),
        type=str
    )

    # Misc
    parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)
    parser.add_argument("--device", default="cpu", type=str)

    return parser.parse_args()


def format_func(example, tokenizer):
    sentence: str = example["sentence"]
    # score: float = float(example["label"])
    tokenized = tokenizer(sentence)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    result = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }
    return result


def token_truncate(example, tokenizer):
    # Keep only the first 3 tokens as the query; PPO generates the rest.
    input_ids = example["input_ids"]
    attention_mask = example["attention_mask"]
    input_ids = input_ids[:3]
    attention_mask = attention_mask[:3]
    # text = tokenizer.decode(input_ids)
    result = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        # "text": text,
    }
    return result


def main():
    args = get_args()

    # Device setup
    device = torch.device(args.device)

    # 1. Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.sft_model_name,
        padding_side="left",  # important for generation tasks
        cache_dir=args.model_cache_dir,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print(f"eos_token: {tokenizer.eos_token}")
    print(f"pad_token: {tokenizer.pad_token}")

    model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
    value_model = AutoModelForSequenceClassification.from_pretrained(
        args.sft_model_name,
        num_labels=1
    )
    value_model.transformer = model.transformer

    ref_model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)

    reward_model = AutoModelForSequenceClassification.from_pretrained(
        args.reward_model_name,
        num_labels=1
    )

    dataset_dict = load_dataset(
        path=args.dataset_path,
        cache_dir=args.dataset_cache_dir,
    )
    train_dataset = dataset_dict["train"]
    valid_dataset = dataset_dict["validation"]
    # test_dataset = dataset_dict["test"]

    train_dataset = train_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=valid_dataset.column_names,
    )
    train_dataset = train_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 8
    )
    valid_dataset = valid_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 8
    )
    train_dataset = train_dataset.map(
        lambda example: token_truncate(example, tokenizer),
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        lambda example: token_truncate(example, tokenizer),
        batched=False,
        remove_columns=valid_dataset.column_names,
    )

    ppo_config = PPOConfig(
        output_dir=args.output_model_dir,
        num_train_epochs=1,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=2,
        logging_steps=50,
        learning_rate=1e-5,
        warmup_steps=50,
        per_device_eval_batch_size=10,

        num_mini_batches=2,
        num_sample_generations=100,
        # total_episodes=100000,  # maximum number of samples to train on.
        response_length=64,
        # stop_token=tokenizer.eos_token,  # set only one of stop_token and stop_token_id.
        stop_token_id=tokenizer.eos_token_id,
        batch_size=16,

        num_ppo_epochs=1,
        whiten_rewards=True,
        gamma=1.0,
        lam=0.95,

        dataset_num_proc=args.num_workers,
    )

    data_collator = DataCollatorWithPadding(tokenizer)

    ppo_trainer = PPOTrainer(
        args=ppo_config,
        processing_class=tokenizer,
        model=model,
        ref_model=ref_model,
        reward_model=reward_model,
        train_dataset=train_dataset,
        value_model=value_model,
        data_collator=data_collator,
        eval_dataset=valid_dataset,
    )
    ppo_trainer.train()
    ppo_trainer.save_model()

    return


if __name__ == "__main__":
    main()
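
The docstring above compresses PPO's reward shaping into steps (8)-(9). A minimal numeric sketch of that computation may make it concrete; this is the standard KL-penalized reward plus GAE, not a copy of trl's internals, and every number here (shapes, kl_coef, the reward score) is made up for illustration:

import torch

# One sequence with 4 response tokens.
logprobs = torch.tensor([[-1.2, -0.8, -1.5, -0.9]])      # policy, steps (1)-(2)
ref_logprobs = torch.tensor([[-1.0, -1.0, -1.0, -1.0]])  # reference, steps (3)-(4)
values = torch.tensor([[0.1, 0.2, 0.3, 0.4]])            # value model, step (6)
score = torch.tensor([0.7])                              # reward model, step (7)
kl_coef, gamma, lam = 0.05, 1.0, 0.95

# Step (9): per-token KL penalty; the scalar reward-model score is
# added only at the last response token.
kl = logprobs - ref_logprobs
rewards = -kl_coef * kl
rewards[:, -1] += score

# Generalized Advantage Estimation over the response tokens.
advantages = torch.zeros_like(rewards)
last_gae = 0.0
T = rewards.shape[1]
for t in reversed(range(T)):
    next_value = values[:, t + 1] if t < T - 1 else 0.0
    delta = rewards[:, t] + gamma * next_value - values[:, t]
    last_gae = delta + gamma * lam * last_gae
    advantages[:, t] = last_gae
returns = advantages + values  # the targets the value model is trained toward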
examples/tutorials/rlhf/gpt2_sst2_ppo/step_2_train_model_two_gpu.py
ADDED
@@ -0,0 +1,201 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
PPO Training with TRL on the SST-2 dataset.
Implementation based on https://huggingface.co/docs/trl/v0.16.1/en/ppo_trainer


Two V100 GPUs.

"""
import argparse
import os
from pathlib import Path
import platform

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification,
    GPT2LMHeadModel, GPT2ForSequenceClassification,
    DataCollatorWithPadding
)

# Path configuration
if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = Path(os.path.abspath("../../../"))
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from trl import PPOTrainer, PPOConfig


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sft_model_name", type=str,
                        default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3").as_posix())
    parser.add_argument("--reward_model_name", type=str,
                        default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix())
    parser.add_argument("--dataset_path", default="stanfordnlp/sst2", type=str)
    parser.add_argument("--dataset_cache_dir",
                        default=(temp_directory / "hub_datasets").as_posix(), type=str)
    parser.add_argument("--model_cache_dir",
                        default=(temp_directory / "hub_models").as_posix(), type=str)

    # Training parameters

    # Generation parameters

    parser.add_argument(
        "--output_model_dir",
        default=(project_path / "trained_models/gpt2-sst2-ppo").as_posix(),
        type=str
    )

    # Misc
    parser.add_argument("--num_workers", default=0 if platform.system() == "Windows" else 2, type=int)

    return parser.parse_args()


def format_func(example, tokenizer):
    sentence: str = example["sentence"]
    # score: float = float(example["label"])
    tokenized = tokenizer(sentence)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    result = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }
    return result


def token_truncate(example, tokenizer):
    # Keep only the first 3 tokens as the query; PPO generates the rest.
    input_ids = example["input_ids"]
    attention_mask = example["attention_mask"]
    input_ids = input_ids[:3]
    attention_mask = attention_mask[:3]
    # text = tokenizer.decode(input_ids)
    result = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        # "text": text,
    }
    return result


def main():
    args = get_args()

    tokenizer = AutoTokenizer.from_pretrained(
        args.sft_model_name,
        padding_side="left",  # important for generation tasks
        cache_dir=args.model_cache_dir,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print(f"eos_token: {tokenizer.eos_token}")
    print(f"pad_token: {tokenizer.pad_token}")

    model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
    value_model = AutoModelForSequenceClassification.from_pretrained(
        args.sft_model_name,
        num_labels=1
    )
    value_model.transformer = model.transformer

    ref_model = AutoModelForCausalLM.from_pretrained(args.sft_model_name)
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        args.reward_model_name,
        num_labels=1
    )

    dataset_dict = load_dataset(
        path=args.dataset_path,
        cache_dir=args.dataset_cache_dir,
    )
    train_dataset = dataset_dict["train"]
    valid_dataset = dataset_dict["validation"]
    # test_dataset = dataset_dict["test"]

    train_dataset = train_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=valid_dataset.column_names,
    )
    train_dataset = train_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 8
    )
    valid_dataset = valid_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 8
    )
    train_dataset = train_dataset.map(
        lambda example: token_truncate(example, tokenizer),
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        lambda example: token_truncate(example, tokenizer),
        batched=False,
        remove_columns=valid_dataset.column_names,
    )

    ppo_config = PPOConfig(
        output_dir=args.output_model_dir,
        num_train_epochs=1,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=2,
        logging_steps=50,
        learning_rate=1e-5,
        warmup_steps=50,
        per_device_eval_batch_size=10,

        num_mini_batches=2,
        num_sample_generations=100,
        # total_episodes=100000,  # maximum number of samples to train on.
        response_length=1024,
        # stop_token=tokenizer.eos_token,  # set only one of stop_token and stop_token_id.
        stop_token_id=tokenizer.eos_token_id,
        batch_size=16,

        num_ppo_epochs=1,
        whiten_rewards=True,
        gamma=1.0,
        lam=0.95,

        dataset_num_proc=args.num_workers,
    )

    data_collator = DataCollatorWithPadding(tokenizer)

    ppo_trainer = PPOTrainer(
        args=ppo_config,
        processing_class=tokenizer,
        model=model,
        ref_model=ref_model,
        reward_model=reward_model,
        train_dataset=train_dataset,
        value_model=value_model,
        data_collator=data_collator,
        eval_dataset=valid_dataset,
    )

    ppo_trainer.train()
    ppo_trainer.save_model()

    return


if __name__ == "__main__":
    main()
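
The two-GPU variant above differs from the CPU script mainly in dropping the --device argument and raising response_length from 64 to 1024; the models, dataset pipeline, and the rest of the PPOConfig are unchanged. It is meant to run under a distributed launcher, typically something like `accelerate launch --num_processes 2 step_2_train_model_two_gpu.py`; the exact launcher invocation is an assumption, since this commit does not include a launch script.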
examples/tutorials/rlhf/gpt2_sst2_ppo/step_3_generation.py
ADDED
@@ -0,0 +1,77 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

from project_settings import project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-1250"),
        # default=(project_path / "trained_models/gpt2-sst2-ppo/checkpoint-150"),
        # default=(project_path / "trained_models/gpt2-sst2-generation-epoch-3/checkpoint-5400"),
        type=str
    )
    parser.add_argument(
        "--max_new_tokens",
        default=512,  # 8192, 128
        type=int, help="maximum generation length (note: not the model's actual long-text capability)"
    )
    parser.add_argument("--top_p", default=0.85, type=float, help="nucleus sampling threshold (0-1)")
    parser.add_argument("--temperature", default=0.85, type=float, help="generation temperature, controls randomness (0-1, higher is more random)")

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        # device = "mps"
        device = "cpu"
    else:
        device = "cpu"
    print(f"device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path)
    model = model.eval().to(device)

    tokenized = tokenizer(
        # "this",
        "this is ",
        # "please listen ",
        # "seldom has a movie",
        # "thanks to scott 's charismatic",
        return_tensors="pt"
    )

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

    generated_ids = model.generate(
        inputs=tokenized["input_ids"], attention_mask=tokenized["attention_mask"],
        max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
        pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
        top_p=args.top_p, temperature=args.temperature, repetition_penalty=3.0,
        early_stopping=True,
    )
    # response = tokenizer.decode(generated_ids[0][len(tokenized["input_ids"][0]):], skip_special_tokens=True)
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    print(response)
    # print(generated_ids)
    print(f"count: {generated_ids.shape}")

    return


if __name__ == "__main__":
    main()
examples/tutorials/{dpo/ultrachat-sft/step_2_train_sft_model2.py → rlhf/gpt2_sst2_reward/step_2_train_model.py}
RENAMED
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import platform

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from datasets import load_dataset
import numpy as np
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding,
                          Trainer, TrainingArguments
                          )


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="openai-community/gpt2",
        default=(project_path / "pretrained_models/openai-community/gpt2").as_posix(),
        type=str
    )
    parser.add_argument(
        "--dataset_path",
        default="stanfordnlp/sst2",
        type=str
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument("--dataset_streaming", default=None, type=str)
    parser.add_argument("--valid_dataset_size", default=1000, type=int)
    parser.add_argument("--shuffle_buffer_size", default=5000, type=int)

    parser.add_argument(
        "--output_model_dir",
        default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
        type=str
    )
    parser.add_argument(
        "--num_workers",
        default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
        type=int
    )
    parser.add_argument(
        "--device",
        default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        type=str
    )
    args = parser.parse_args()
    return args


def format_func(example, tokenizer):
    sentence: str = example["sentence"]
    labels: float = float(example["label"])
    tokenized = tokenizer(sentence)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    result = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
    return result


def main():
    args = get_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=1,
        pad_token_id=tokenizer.pad_token_id
    )
    print(f"model.num_labels: {model.num_labels}")

    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    train_dataset = dataset_dict["train"]
    valid_dataset = dataset_dict["validation"]
    # test_dataset = dataset_dict["test"]

    train_dataset = train_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=train_dataset.column_names,
    )
    valid_dataset = valid_dataset.map(
        lambda example: format_func(example, tokenizer),
        batched=False,
        remove_columns=valid_dataset.column_names,
    )
    train_dataset = train_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 6
    )
    valid_dataset = valid_dataset.filter(
        function=lambda x: len(x["input_ids"]) > 6
    )
    data_collator = DataCollatorWithPadding(tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_model_dir,
        # overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        logging_steps=200,
        learning_rate=5e-5,
        warmup_steps=200,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=args.num_workers or 0,
        remove_unused_columns=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_dir=(Path(args.output_model_dir) / "logs").as_posix(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    trainer.train()
    trainer.save_model()
    return


if __name__ == "__main__":
    main()
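
A note on what this trainer actually optimizes: with num_labels=1 and float labels, transformers treats the task as regression, so the loss reduces to mean-squared error between the pooled logit and the 0/1 sentiment label, and GPT2ForSequenceClassification pools by taking the logit at the last non-pad position (which is why pad_token_id is passed to from_pretrained). A minimal sketch of the effective objective, with made-up numbers:

import torch
import torch.nn.functional as F

# logits: [batch, 1] -- the logit at each sequence's last non-pad token.
logits = torch.tensor([[0.3], [0.9]])
labels = torch.tensor([0.0, 1.0])  # SST-2 labels cast to float

# Effective loss when num_labels == 1 (the "regression" problem type).
loss = F.mse_loss(logits.squeeze(-1), labels)
print(loss)  # tensor(0.0500)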
examples/tutorials/rlhf/gpt2_sst2_reward/step_3_test_model.py
ADDED
@@ -0,0 +1,142 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import platform
from typing import Any, Dict, List, Optional, Union, Tuple

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, GPT2ForSequenceClassification
                          )


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
        type=str
    )
    parser.add_argument(
        "--dataset_path",
        default="stanfordnlp/sst2",
        type=str
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument("--dataset_streaming", default=None, type=str)

    parser.add_argument(
        "--num_workers",
        default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
        type=int
    )
    parser.add_argument(
        "--device",
        default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        type=str
    )
    args = parser.parse_args()
    return args


class RewardModelWrapper:
    """
    Wrapper around the reward model, used to score generated text.
    """

    def __init__(self, reward_model_name, tokenizer, device):
        self.device = device
        self.tokenizer = tokenizer
        # Load your GPT2RewardModel or a standard model
        from transformers import GPT2ForSequenceClassification
        self.model = GPT2ForSequenceClassification.from_pretrained(reward_model_name).to(device)
        self.model.eval()

    def get_reward(self, texts: List[str]) -> List[float]:
        """
        Compute reward scores for the texts (SST-2 sentiment classification).
        """
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            # SST-2 is binary classification; use the positive-class logits as the reward
            logits = outputs.logits
            # The model outputs logits; apply softmax to get the positive-class probability
            probs = torch.softmax(logits, dim=-1)
            # assume label 1 is the positive class
            rewards = probs[:, 1].cpu().tolist()
        return rewards


def main():
    args = get_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    reward_model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
    )
    print(f"reward_model.num_labels: {reward_model.num_labels}")

    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    # dataset = dataset_dict["train"]
    dataset = dataset_dict["validation"]
    # dataset = dataset_dict["test"]

    for example in dataset:
        sentence: str = example["sentence"]
        score: float = float(example["label"])

        outputs = tokenizer(
            sentence,
            return_tensors="pt"
        )
        input_ids = outputs["input_ids"]

        with torch.no_grad():
            rewards = reward_model.forward(input_ids)
            logits = rewards.logits
            logits = logits.detach().cpu().numpy()
            reward = logits[0][0]
        msg = f"reward: {reward}\nscore: {score}\nsentence: {sentence}\n"
        print(msg)
    return


if __name__ == "__main__":
    main()
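
One caveat on RewardModelWrapper above: its get_reward indexes probs[:, 1], which presumes a 2-label classification head, while the reward model trained in step 2 has num_labels=1 (and main() never actually calls the wrapper). For the 1-label head used here, the pooled logit itself is the scalar reward, so a scoring helper would look more like this sketch (score_texts is a hypothetical name):

import torch

def score_texts(model, tokenizer, texts, device="cpu"):
    # With num_labels == 1 the pooled logit itself is the reward.
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits  # shape [batch, 1]
    return logits[:, 0].cpu().tolist()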
examples/tutorials/rlhf/gpt2_sst2_reward/step_4_test_model.py
ADDED
@@ -0,0 +1,127 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import platform
from typing import Any, Dict, List, Optional, Union, Tuple

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from transformers import (AutoTokenizer, AutoModelForSequenceClassification
                          )
from trl.trainer.utils import get_reward
from transformers import GPT2ForSequenceClassification


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default=(project_path / "trained_models/gpt2-for-sequence-classification-sst2-reward").as_posix(),
        type=str
    )
    parser.add_argument(
        "--dataset_path",
        default="stanfordnlp/sst2",
        type=str
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(temp_directory / "hub_datasets").as_posix(),
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument("--dataset_streaming", default=None, type=str)

    parser.add_argument(
        "--num_workers",
        default=None if platform.system() in ("Windows", "Darwin") else os.cpu_count() // 2,
        type=int
    )
    parser.add_argument(
        "--device",
        default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    reward_model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    # dataset = dataset_dict["train"]
    dataset = dataset_dict["validation"]
    # dataset = dataset_dict["test"]

    count = 0
    batch_text = list()
    for example in dataset:

        sentence: str = example["sentence"]
        # labels: int = int(example["label"])
        batch_text.append(sentence)
        count += 1
        if count >= 4:
            break

    outputs = tokenizer(
        batch_text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    input_ids = outputs["input_ids"]
    attention_mask = outputs["attention_mask"]

    # last_token_idx = attention_mask.sum(dim=1) - 1
    # print(last_token_idx)

    reward_logits, score, sequence_lengths = get_reward(
        model=reward_model,
        query_responses=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        context_length=0,
    )
    print(reward_logits)
    print(score)
    print(sequence_lengths)

    return


if __name__ == "__main__":
    main()
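
For orientation, trl's get_reward scores every position of the padded batch and returns the per-position logits, the score taken at each sequence's last non-pad token (relative to context_length), and those sequence lengths. Since GPT2ForSequenceClassification already pools at the last non-pad position when config.pad_token_id is set, the score tensor printed above should agree with the model's own pooled logits, assuming right padding. A rough hand-rolled check (a sketch; manual_score is a hypothetical helper):

import torch

def manual_score(reward_model, input_ids, attention_mask):
    # The classification head pools at the last non-pad token, so its
    # [batch, 1] logits match the per-sequence score that get_reward extracts.
    with torch.no_grad():
        out = reward_model(input_ids=input_ids, attention_mask=attention_mask)
    return out.logits[:, 0]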