GPT2 Reward Model
A GPT-2 based reward model fine-tuned with human preference data for RLHF.
Architecture
- Base: GPT-2
- Reward head: Linear layer on top of the last hidden state
- Output: Sigmoid-normalized reward score in (0, 1), one per token position of the input sequence
Usage
import torch
from torch import nn
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
class RewardHead(nn.Module):
    """Linear head that maps each hidden-state vector to a single reward logit.

    Args:
        config: Any object exposing an integer ``hidden_size`` attribute
            (here, the configuration of the wrapped language model).
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.reward = nn.Linear(self.hidden_size, 1)
        # Start from small weights (std = 1 / sqrt(hidden_size + 1)) and a
        # zero bias so the initial reward logits stay close to zero.
        init_std = 1.0 / np.sqrt(self.hidden_size + 1)
        nn.init.normal_(self.reward.weight, std=init_std)
        nn.init.zeros_(self.reward.bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project ``hidden_states`` over its last dimension.

        Args:
            hidden_states: Tensor whose last dimension equals ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1
            holding the raw (un-normalized) reward logits.
        """
        return self.reward(hidden_states)
class GPT2RewardHead(nn.Module):
    """Causal language model with a ``RewardHead`` on its last hidden state.

    The submodule names ``llm`` and ``reward_head`` are part of the
    ``state_dict`` key layout and must not be renamed, otherwise saved
    checkpoints will no longer load.

    Args:
        model_name: Identifier or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
    """

    def __init__(self, model_name="gpt2"):
        super().__init__()
        self.llm = AutoModelForCausalLM.from_pretrained(model_name)
        self.reward_head = RewardHead(self.llm.config)

    def forward(self, input_ids, attention_mask=None) -> torch.Tensor:
        """Score every position of the input sequence.

        Args:
            input_ids: Token ids forwarded to the language model.
            attention_mask: Optional mask forwarded to the language model.
                ``None`` is passed through unchanged, leaving the choice of
                default to the wrapped model.

        Returns:
            Sigmoid-normalized rewards in (0, 1). The head is applied to every
            position of the last hidden state and only the trailing size-1
            dimension is squeezed, so the result holds one score per position
            rather than a single scalar per sequence.
        """
        outputs = self.llm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # hidden_states is a tuple of per-layer outputs; the final entry is
        # the output of the last layer.
        last_hidden = outputs.hidden_states[-1]
        reward_logits = self.reward_head(last_hidden).squeeze(-1)
        return torch.sigmoid(reward_logits)
# hf_hub_download is used below but was never imported by this snippet.
from huggingface_hub import hf_hub_download

# The tokenizer is stored in the "tokenizer" subfolder of the model repository.
tokenizer = AutoTokenizer.from_pretrained("your-username/gpt2-reward-model", subfolder="tokenizer")

# Rebuild the architecture from the base GPT-2 weights, then overwrite every
# parameter with the fine-tuned reward-model checkpoint.
model = GPT2RewardHead("gpt2")
checkpoint_path = hf_hub_download("your-username/gpt2-reward-model", "reward_model.pt")
# NOTE(security): torch.load unpickles the file. Only load checkpoints from a
# repository you trust; on PyTorch versions that support it, consider passing
# weights_only=True to restrict unpickling to tensor data.
state_dict = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(state_dict)
# Switch to inference mode (disables dropout in the wrapped language model).
model.eval()
- Downloads last month
- -
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support