from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import pandas as pd
import wandb

# === Initialize wandb ===
wandb.init(
    project="reward_model_scoring",  # project name is customizable
    name="fomatted_5e-6_1500",       # name of the current run
)

# === Model path (the directory where the trained checkpoint was saved) ===
rm_path = "/home/ckpt/5e-6/global_step180_hf"  # directory containing your reward model

# === Load tokenizer (including special tokens) ===
tokenizer = AutoTokenizer.from_pretrained(rm_path)

# === Load config and make sure num_labels=1 ===
config = AutoConfig.from_pretrained(rm_path)
config.num_labels = 1

# === Load the reward model ===
model = AutoModelForSequenceClassification.from_pretrained(
    rm_path,
    config=config,
    device_map="auto"
)
model.eval()

# Safeguard: batched scoring requires a pad token. If the saved tokenizer/config
# does not already define one, fall back to the eos token (assumes eos exists).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# === Wrapper function: a batch of texts -> a batch of reward scores ===
def get_reward_score(texts):
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=8192,
    ).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = outputs.logits.squeeze(-1).float().cpu().tolist()  # shape: [batch_size]
    return scores

# === Load the dataset to be scored ===
# Must contain the columns chosen_prompt, chosen, reject
df = pd.read_parquet("/home/data/formatted_test.parquet").sample(n=1500, random_state=42).reset_index(drop=True)

def format_input(prompt, answer):
    return prompt + answer  # can be replaced with prompt + "\n\n" + answer

chosen_texts = [format_input(p, a) for p, a in zip(df["chosen_prompt"], df["chosen"])]
rejected_texts = [format_input(p, a) for p, a in zip(df["chosen_prompt"], df["reject"])]

# === Initialize result lists ===
chosen_scores, rejected_scores, accs = [], [], []

# === Create a wandb table for visualization ===
sample_table = wandb.Table(columns=[
    "index", "prompt", "chosen", "rejected",
    "chosen_score", "rejected_score", "delta_score", "acc"
])

# === Score in batches + print progress + write to the wandb table ===
batch_size = 16
for i in range(0, len(chosen_texts), batch_size):
    chosen_batch = chosen_texts[i:i+batch_size]
    rejected_batch = rejected_texts[i:i+batch_size]

    chosen_batch_scores = get_reward_score(chosen_batch)
    rejected_batch_scores = get_reward_score(rejected_batch)

    for j in range(len(chosen_batch_scores)):
        idx = i + j
        c_score = chosen_batch_scores[j]
        r_score = rejected_batch_scores[j]
        delta = c_score - r_score
        acc = int(delta > 0)

        # ✅ Append to the global results
        chosen_scores.append(c_score)
        rejected_scores.append(r_score)
        accs.append(acc)

        current_accuracy = sum(accs) / len(accs)
        print(f"[{idx}] acc={acc}, chosen_reward={c_score:.3f}, reject_reward={r_score:.3f} | running accuracy: {current_accuracy:.3f}")

        # ✅ Add to the wandb table
        sample_table.add_data(
            idx,
            df.loc[idx, "chosen_prompt"],
            df.loc[idx, "chosen"],
            df.loc[idx, "reject"],
            c_score,
            r_score,
            delta,
            acc
        )

# === Write scores back to the DataFrame ===
df["chosen_score"] = chosen_scores
df["rejected_score"] = rejected_scores
df["delta_score"] = df["chosen_score"] - df["rejected_score"]
df["acc"] = accs

# === Report aggregate metrics ===
accuracy = df["acc"].mean()
mean_chosen = df["chosen_score"].mean()
mean_rejected = df["rejected_score"].mean()
mean_delta = df["delta_score"].mean()

print(f"\n✅ Reward Model Accuracy = {accuracy:.3f}")
print(f"📊 mean_chosen = {mean_chosen:.3f}, mean_rejected = {mean_rejected:.3f}, mean_delta = {mean_delta:.3f}")

# === Log to wandb ===
wandb.log({
    "samples_table": sample_table,
    "final_accuracy": accuracy,
    "mean_chosen_score": mean_chosen,
    "mean_rejected_score": mean_rejected,
    "mean_delta_score": mean_delta,
})

# === Close the wandb run ===
wandb.finish()
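
# === (Optional) persist per-sample scores locally ===
# A minimal sketch for keeping the scored DataFrame for offline analysis in
# addition to the wandb table; the output path below is an assumption — adjust
# it to your environment.
df.to_parquet("/home/data/formatted_test_scored.parquet", index=False)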