Vidushee/BT_Preference_Dataset
Viewer • Updated • 31.1k • 34
A Bradley-Terry reward model, fine-tuned from Qwen/Qwen3-32B, for scoring the quality of questions asked about research papers.
import re
import torch
from transformers import AutoTokenizer, pipeline
# Hub repository that holds both the tokenizer and the reward-model weights.
model_path = "Vidushee/Qwen3-32B-BT-RewardModel"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Classification pipeline used as the reward scorer. It runs on device 0,
# loads the model in bfloat16 with FlashAttention-2, and truncates inputs
# to max_length=12288.
rm_pipe = pipeline(
    task="sentiment-analysis",
    model=model_path,
    tokenizer=tokenizer,
    device=0,
    model_kwargs=dict(
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    ),
    truncation=True,
    max_length=12288,
)

# Per-call options shared by every scoring call below:
#   return_all_scores -- ask for the score of every label, not only the top one
#   function_to_apply -- "none" requests the model output with no activation
#   batch_size        -- score one sequence at a time
pipe_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=1,
)
# Example 1: score a single question.
# The user turn carries the paper context; the assistant turn carries the
# question whose quality is being scored.
chat = [
    dict(role="user", content="Your paper context here"),
    dict(role="assistant", content="Question to score"),
]

# Render the chat with the tokenizer's template, then:
#   * remove the empty think block that Qwen3 inserts even with
#     enable_thinking=False;
#   * strip trailing newlines so the reward pools from <|im_end|>.
text = re.sub(
    r"<think>\s*</think>\s*",
    "",
    tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=False, enable_thinking=False
    ),
).rstrip("\n")

# One input goes in; the score is read from the first entry of the first result.
outputs = rm_pipe([text], **pipe_kwargs)
reward = outputs[0][0]["score"]
print(f"Reward: {reward}")
# Example 2: compare a preferred ("chosen") and a dispreferred ("rejected")
# question written for the same paper context.
chosen_chat = [
    dict(role="user", content="Paper context..."),
    dict(role="assistant", content="Good question about the paper"),
]
rejected_chat = [
    dict(role="user", content="Paper context..."),
    dict(role="assistant", content="Bad question about the paper"),
]
def format_text(messages) -> str:
    """Render a chat into the plain-text input scored by the reward model.

    Args:
        messages: Chat turns (dicts with ``"role"`` and ``"content"`` keys),
            passed straight to ``tokenizer.apply_chat_template``.

    Returns:
        The templated conversation with empty ``<think>`` blocks removed and
        trailing newlines stripped.
    """
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False, enable_thinking=False
    )
    # Qwen3 inserts an empty think block even with enable_thinking=False.
    text = re.sub(r"<think>\s*</think>\s*", "", text)
    # Strip trailing newlines so the reward pools from <|im_end|>.
    return text.rstrip("\n")
# Score both candidates in one pipeline call; results come back in input
# order, so index 0 is the chosen chat and index 1 the rejected one.
pair_texts = [format_text(candidate) for candidate in (chosen_chat, rejected_chat)]
outputs = rm_pipe(pair_texts, **pipe_kwargs)

chosen_reward = outputs[0][0]["score"]
rejected_reward = outputs[1][0]["score"]
chosen_wins = chosen_reward > rejected_reward

print(f"Chosen reward: {chosen_reward:.4f}")
print(f"Rejected reward: {rejected_reward:.4f}")
print(f"Chosen is better: {chosen_wins}")
Base model
Qwen/Qwen3-32B