GPT2 Reward Model

A GPT-2-based reward model fine-tuned on human preference data for use in RLHF (reinforcement learning from human feedback).

Architecture

  • Base: GPT-2
  • Reward head: Linear layer (hidden_size → 1) applied to the hidden states of the final transformer layer
  • Output: Sigmoid-normalized reward score in (0, 1), produced for each token position

Usage

import numpy as np
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import cached_file

class RewardHead(nn.Module):
    """Linear projection that turns hidden-state vectors into raw reward values.

    The projection maps ``config.hidden_size`` features to a single output
    feature. No activation is applied here.
    """

    def __init__(self, config):
        """Create and initialise the projection layer.

        Args:
            config: Any object exposing a ``hidden_size`` attribute, which
                sets the input width of the linear layer.
        """
        super().__init__()
        self.hidden_size = config.hidden_size

        projection = nn.Linear(self.hidden_size, 1)
        # Weights are drawn from a zero-mean normal distribution with standard
        # deviation 1 / sqrt(hidden_size + 1); the bias starts at exactly zero.
        init_std = 1.0 / np.sqrt(self.hidden_size + 1)
        nn.init.normal_(projection.weight, std=init_std)
        nn.init.constant_(projection.bias, 0.0)
        self.reward = projection

    def forward(self, hidden_states):
        """Apply the linear projection over the last dimension.

        Args:
            hidden_states: Tensor whose last dimension equals ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        raw_reward = self.reward(hidden_states)
        return raw_reward

class GPT2RewardHead(nn.Module):
    """Pretrained causal language model with a ``RewardHead`` on top.

    The language model is stored as ``self.llm`` and the projection as
    ``self.reward_head``; these attribute names determine the keys of the
    module's ``state_dict``.
    """

    def __init__(self, model_name="gpt2"):
        """Load the backbone and attach a freshly initialised reward head.

        Args:
            model_name: Identifier or path handed to
                ``AutoModelForCausalLM.from_pretrained``.
        """
        super().__init__()
        backbone = AutoModelForCausalLM.from_pretrained(model_name)
        self.llm = backbone
        # The head reads its input width from the backbone's own config.
        self.reward_head = RewardHead(backbone.config)

    def forward(self, input_ids, attention_mask):
        """Compute sigmoid-squashed reward scores.

        Args:
            input_ids: Token ids forwarded to the language model.
            attention_mask: Attention mask forwarded to the language model.

        Returns:
            Sigmoid of the reward head's output with the trailing size-1
            dimension removed. The head is applied to every vector in the
            final hidden-state tensor, so the result keeps that tensor's
            leading dimensions (presumably ``(batch, seq_len)`` -- confirm
            against the backbone in use).
        """
        lm_output = self.llm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # hidden_states holds several entries; only the final one is used.
        final_hidden = lm_output.hidden_states[-1]
        raw_scores = self.reward_head(final_hidden)
        return raw_scores.squeeze(-1).sigmoid()

# Tokenizer files live in the "tokenizer" subfolder of the model repository.
tokenizer = AutoTokenizer.from_pretrained("your-username/gpt2-reward-model", subfolder="tokenizer")

# Build the architecture from the base GPT-2 checkpoint first; the fine-tuned
# weights (backbone and reward head) are loaded over it below.
model = GPT2RewardHead("gpt2")

# Fetch the checkpoint from the Hub (or reuse the locally cached copy) and get
# its local path. `cached_file` ships with transformers, so no extra import of
# a separate download helper is required.
weights_path = cached_file("your-username/gpt2-reward-model", "reward_model.pt")

# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict contains, so a downloaded file cannot execute arbitrary
# code while being loaded.
state_dict = torch.load(weights_path, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout).
model.eval()
Downloads last month
-
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support