# rm_code/reward.py — uploaded with huggingface_hub (revision d8a76be)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import pandas as pd
import wandb
# === Initialize a wandb run for logging scores ===
wandb.init(
    project="reward_model_scoring",  # project name (customizable)
    name="fomatted_5e-6_1500",       # name of this run
)

# === Model path (directory where the training checkpoint was saved) ===
rm_path = "/home/ckpt/5e-6/global_step180_hf"  # reward-model checkpoint dir

# === Load the tokenizer (includes any special tokens saved with the checkpoint) ===
tokenizer = AutoTokenizer.from_pretrained(rm_path)

# === Load the config and force a single output head (num_labels=1) ===
config = AutoConfig.from_pretrained(rm_path)
config.num_labels = 1  # one scalar reward per input sequence

# === Load the reward model ===
model = AutoModelForSequenceClassification.from_pretrained(
    rm_path,
    config=config,
    # "auto" lets HF/accelerate place weights on the available device(s)
    device_map="auto"
)
model.eval()  # inference mode: disable dropout etc. for deterministic scoring
# === 套壳函数:输入一批文本 → 输出一批 reward 分数 ===
def get_reward_score(texts, *, max_length=8192):
    """Score a batch of texts with the global reward model.

    Args:
        texts: A list of strings (or a single string) to score; passed
            straight to the tokenizer with padding and truncation.
        max_length: Truncation limit in tokens. Keyword-only with the
            original hard-coded default (8192), so existing callers are
            unaffected.

    Returns:
        list[float]: one scalar reward per input text.
    """
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(model.device)
    with torch.no_grad():  # scoring only — no gradients needed
        outputs = model(**inputs)
    # num_labels == 1, so logits are [batch, 1]; squeeze the label axis
    # to get one float per text. .float() guards against bf16/fp16 logits.
    scores = outputs.logits.squeeze(-1).float().cpu().tolist()
    return scores
# === Load the dataset to score ===
# Sample a fixed 1500 rows (seeded for reproducibility) and reset the index
# so the positional df.loc[idx, ...] lookups below line up with the score lists.
df = pd.read_parquet("/home/data/formatted_test.parquet").sample(n=1500, random_state=42).reset_index(drop=True)
# Required columns: chosen_prompt, chosen, reject
def format_input(prompt, answer):
    """Join a prompt and an answer into one scoring text.

    Plain concatenation with no separator; swap in e.g.
    prompt + "\n\n" + answer if a delimiter is wanted.
    """
    return "".join((prompt, answer))
# Build the full scoring texts: same prompt paired with the chosen answer
# and with the rejected answer.
chosen_texts = [format_input(p, a) for p, a in zip(df["chosen_prompt"], df["chosen"])]
rejected_texts = [format_input(p, a) for p, a in zip(df["chosen_prompt"], df["reject"])]

# === Accumulators for per-sample results ===
chosen_scores, rejected_scores, accs = [], [], []

# === wandb table for per-sample visualization ===
sample_table = wandb.Table(columns=[
    "index", "prompt", "chosen", "rejected",
    "chosen_score", "rejected_score", "delta_score", "acc"
])
# === Score in batches, print progress live, and append rows to the wandb table ===
batch_size = 16
for i in range(0, len(chosen_texts), batch_size):
    chosen_batch = chosen_texts[i:i+batch_size]
    rejected_batch = rejected_texts[i:i+batch_size]
    chosen_batch_scores = get_reward_score(chosen_batch)
    rejected_batch_scores = get_reward_score(rejected_batch)
    for j in range(len(chosen_batch_scores)):
        idx = i + j  # global row index into df
        c_score = chosen_batch_scores[j]
        r_score = rejected_batch_scores[j]
        delta = c_score - r_score
        # A pair counts as correct when the chosen answer outscores the rejected one.
        acc = int(delta > 0)
        # Record global results
        chosen_scores.append(c_score)
        rejected_scores.append(r_score)
        accs.append(acc)
        current_accuracy = sum(accs) / len(accs)  # running mean accuracy so far
        print(f"[{idx}] acc={acc}, chosen_reward={c_score:.3f}, reject_reward={r_score:.3f} | 当前平均准确率: {current_accuracy:.3f}")
        # Add one row to the wandb table
        sample_table.add_data(
            idx,
            df.loc[idx, "chosen_prompt"],
            df.loc[idx, "chosen"],
            df.loc[idx, "reject"],
            c_score,
            r_score,
            delta,
            acc
        )
# === Write per-sample scores back into the DataFrame ===
df["chosen_score"] = chosen_scores
df["rejected_score"] = rejected_scores
df["delta_score"] = df["chosen_score"] - df["rejected_score"]
df["acc"] = accs

# === Compute and print aggregate metrics ===
accuracy = df["acc"].mean()
mean_chosen = df["chosen_score"].mean()
mean_rejected = df["rejected_score"].mean()
mean_delta = df["delta_score"].mean()
print(f"\n✅ Reward Model Accuracy = {accuracy:.3f}")
print(f"📊 mean_chosen = {mean_chosen:.3f}, mean_rejected = {mean_rejected:.3f}, mean_delta = {mean_delta:.3f}")

# === Log summary metrics and the per-sample table to wandb ===
wandb.log({
    "samples_table": sample_table,
    "final_accuracy": accuracy,
    "mean_chosen_score": mean_chosen,
    "mean_rejected_score": mean_rejected,
    "mean_delta_score": mean_delta,
})

# === Close the wandb run ===
wandb.finish()