```python
from typing import List

import regex as re
import torch
from transformers import AutoTokenizer

from openrlhf.models.model import get_llm_for_sequence_regression

def strip_sequence(text, pad_token, eos_token):
    """Remove leading and trailing runs of pad/eos tokens from a rendered prompt."""
    pad_token_escaped = re.escape(pad_token)
    eos_token_escaped = re.escape(eos_token)
    pattern = f"^({eos_token_escaped}|{pad_token_escaped})+"
    text = re.sub(pattern, "", text)
    pattern = f"({eos_token_escaped}|{pad_token_escaped})+$"
    text = re.sub(pattern, "", text)
    return text
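# For example, with eos_token "<|endoftext|>" and pad_token "<|pad|>",
#     strip_sequence("<|endoftext|>Hello<|endoftext|>", "<|pad|>", "<|endoftext|>")
# returns "Hello": only the leading and trailing runs of special tokens are
# removed, never occurrences in the middle of the text.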

class RewardModelProxy:
    def __init__(
        self,
        reward_pretrain: str,
        max_len: int,
        batch_size: int,
        normalize_reward: bool = False,
        flash_attn: bool = True,
        bf16: bool = True,
        load_in_4bit: bool = False,
        value_head_prefix: str = "score",
        disable_fast_tokenizer: bool = False,
    ):
        # Load the pretrained reward model with a scalar value head.
        self.reward_model = get_llm_for_sequence_regression(
            reward_pretrain,
            "reward",
            normalize_reward=normalize_reward,
            use_flash_attention_2=flash_attn,
            bf16=bf16,
            load_in_4bit=load_in_4bit,
            value_head_prefix=value_head_prefix,
            device_map="cuda:5",  # pinned to one GPU here; adjust for your setup
        )
        self.reward_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            reward_pretrain, trust_remote_code=True, use_fast=not disable_fast_tokenizer
        )
        self.max_length = max_len
        self.batch_size = batch_size
    def get_reward(self, conversations: List[List[dict]]):
        # Fall back to scoring everything in a single batch if no batch size was set.
        if self.batch_size is None:
            batch_size = len(conversations)
        else:
            batch_size = self.batch_size

        # Render each conversation into the model's chat format.
        queries = []
        for conversation in conversations:
            query = self.tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
            queries.append(query)

        # Strip stray pad/eos tokens, then terminate each query with a single eos token.
        for i in range(len(queries)):
            queries[i] = (
                strip_sequence(queries[i], self.tokenizer.pad_token, self.tokenizer.eos_token)
                + self.tokenizer.eos_token
            )

        # Score the queries batch by batch; slicing past the end is safe in Python,
        # so no explicit min() is needed for the last (possibly short) batch.
        scores = []
        with torch.no_grad():
            for i in range(0, len(queries), batch_size):
                inputs = self.tokenize_fn(
                    queries[i : i + batch_size], device=self.reward_model.device
                )
                r = self.reward_model(inputs["input_ids"], inputs["attention_mask"])
                scores.extend(r.tolist())
        return scores
    def tokenize_fn(self, texts, device):
        """Tokenize a batch of rendered queries, padding to a common length and
        truncating anything longer than max_length."""
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            add_special_tokens=False,
            max_length=self.max_length,
            padding=True,
            truncation=True,
        )
        return {k: v.to(device) for k, v in batch.items()}
    def __call__(self, conversations: List[List[dict]]):
        return self.get_reward(conversations)

RM = RewardModelProxy(
    "CodeDPO/Qwen2.5-Coder-7B_with_margin_scalebt",
    max_len=2048,
    batch_size=8,
)

conversations = [
    [
        {"role": "system", "content": "Hello, how can I help you today?"},
        {"role": "user", "content": "I want to book a flight."},
    ],
]

# Prints one scalar reward per conversation.
scores = RM(conversations)
print(scores)
```
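Since the proxy just maps conversations to scalar rewards, a quick sanity check for a checkpoint is to score a better and a worse answer to the same prompt and confirm the ranking. A minimal sketch reusing the `RM` instance from above; the two candidate answers are made up for illustration:

```python
# Two candidate answers to the same prompt; a usable reward model should
# typically score the correct, helpful answer above the dismissive one.
prompt = {"role": "user", "content": "How do I reverse a list in Python?"}
chosen = [prompt, {"role": "assistant", "content": "Use my_list[::-1] for a reversed copy, or my_list.reverse() in place."}]
rejected = [prompt, {"role": "assistant", "content": "Lists cannot be reversed in Python."}]

chosen_score, rejected_score = RM([chosen, rejected])
print(f"chosen={chosen_score:.4f}  rejected={rejected_score:.4f}")
```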