argilla/ultrafeedback-binarized-preferences
Viewer • Updated • 63.6k • 1.43k • 84
Helpfulness reward model trained on argilla/ultrafeedback-binarized-preferences using pair-ranking loss with rating margin.
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
class RewardModel(nn.Module):
    """Scalar reward model: a backbone network topped with a linear head.

    The backbone's ``lm_head`` (when it has one) is replaced by
    ``nn.Identity`` and a bias-free ``nn.Linear(hidden_size, 1)`` maps the
    final-layer hidden state of the last attended token of every sequence
    to a single score.
    """

    def __init__(self, backbone):
        """Wrap ``backbone`` and attach the reward head.

        Args:
            backbone: Callable model exposing ``config.hidden_size`` whose
                output has a ``hidden_states`` sequence when called with
                ``output_hidden_states=True``. It is modified in place: an
                existing ``lm_head`` attribute is overwritten.
        """
        super().__init__()
        self.backbone = backbone
        self.config = backbone.config
        # The vocabulary projection is not needed for scoring.
        if hasattr(backbone, "lm_head"):
            backbone.lm_head = nn.Identity()
        self.reward_head = nn.Linear(backbone.config.hidden_size, 1, bias=False)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Score each sequence in the batch.

        Args:
            input_ids: Token ids, forwarded to the backbone unchanged.
            attention_mask: Mask indexed as ``(batch, seq_len)`` with non-zero
                entries marking real tokens. Padding may be on either side.
            **kwargs: Accepted and ignored, so tokenizer outputs carrying
                extra keys can be unpacked straight into the call.

        Returns:
            Tensor with one reward per sequence (the trailing size-1
            dimension of the head output is squeezed away).
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden_states = outputs.hidden_states[-1]

        # 1-based position of every attended token (0 where masked). The row
        # maximum minus one is the index of the last attended token whether
        # the padding sits on the right or on the left. ``mask.sum() - 1``
        # is only correct for right padding. A fully masked row yields -1,
        # i.e. the final position, exactly as the sum-based formula did.
        one_based = torch.arange(
            1, attention_mask.size(1) + 1, device=attention_mask.device
        )
        last_token_idx = (attention_mask.long() * one_based).max(dim=1).values - 1
        last_token_idx = last_token_idx.to(hidden_states.device)

        batch_idx = torch.arange(hidden_states.size(0), device=hidden_states.device)
        last_hidden = hidden_states[batch_idx, last_token_idx]
        return self.reward_head(last_hidden).squeeze(-1)
# Hub repository that hosts the fine-tuned reward-model weights.
REPO_ID = "Seungjun/llama3.2-1b-helpfulness-reward-model"
# Checkpoint that supplies both the tokenizer and the backbone architecture.
BASE_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
)
model = RewardModel(backbone=base_model)

# Fetch the trained weights and load them over the freshly built wrapper.
weights_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors")
state_dict = load_file(weights_path)
model.load_state_dict(state_dict)

# Inference mode, with every floating-point parameter (reward head included)
# cast to bfloat16.
model.eval()
model.bfloat16()
# Both example conversations ask the same question; only the assistant reply
# differs (one answers it directly, the other drifts off-topic).
_QUESTION = (
    "Do I need a visa to travel from the US to the UK for a one-week vacation?"
)

helpful_messages = [
    {"role": "user", "content": _QUESTION},
    {
        "role": "assistant",
        "content": (
            "No, US citizens traveling for tourism do not need a visa for "
            "stays in the UK for up to six months. "
            "You will simply need a valid passport that covers the duration "
            "of your stay."
        ),
    },
]

unhelpful_messages = [
    {"role": "user", "content": _QUESTION},
    {
        "role": "assistant",
        "content": (
            "The UK is a very popular destination for American tourists, "
            "especially during the summer months. "
            "Many travelers enjoy visiting historic landmarks like the Tower "
            "of London or exploring the Scottish Highlands. "
            "It is always a good idea to pack a raincoat and check your "
            "flight status before heading to the airport."
        ),
    },
]
def _score_messages(messages, max_length=1024):
    """Return the scalar reward the model assigns to one chat conversation.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, rendered
            with the tokenizer's chat template (no generation prompt).
        max_length: Length the encoded conversation is padded/truncated to.

    Returns:
        The reward as a Python float (higher = more helpful).
    """
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    # NOTE(review): the rendered template may already start with the BOS
    # token, in which case tokenizing with default settings prepends a second
    # one. Left unchanged because the released weights were presumably trained
    # with this exact preprocessing. Confirm before passing
    # add_special_tokens=False.
    enc = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        return model(**enc).item()


# Higher = more helpful
print("====Helpful response reward:")
reward = _score_messages(helpful_messages)
print(f"Reward: {reward}")

print("\n====Unhelpful response reward:")
reward = _score_messages(unhelpful_messages)
print(f"Reward: {reward}")
Standard LLaMA 3.2 1B Instruct with the language modeling head (lm_head) replaced by a scalar reward head (nn.Linear(hidden_size, 1)). The reward is computed from the hidden state of the last non-padding token.
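The model is trained with the pair-ranking loss $\mathcal{L} = -\log \sigma\big(r_\theta(x, y_c) - r_\theta(x, y_r) - m(r)\big)$, where $r_\theta(x, y)$ is the scalar reward for prompt $x$ and response $y$, $y_c$ and $y_r$ are the chosen and rejected responses, and m(r) = (chosen_avg_rating - rejected_avg_rating) from the ultrafeedback dataset ratings (scale 1-5).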
Base model
meta-llama/Llama-3.2-1B-Instruct