# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pairwise accuracy evaluation of a reward model through vLLM.

For every row of a parquet dataset the script builds two inputs
(prompt + chosen answer, prompt + rejected answer), scores both with
``LLM.reward`` and counts the row as correct when the chosen input gets the
strictly higher score. Per-sample results and a final accuracy are printed.
"""
import argparse
import math
import os
import warnings
from typing import List, Tuple

import numpy as np
import pandas as pd
import wandb
from vllm import LLM


def _text_or_empty(value) -> str:
    """Return *value* with trailing whitespace removed, or "" if it is missing.

    ``None``, ``pd.NA`` and float NaN are all treated as missing, because
    pandas may use any of them for an empty cell depending on the column dtype.
    """
    if value is None or value is pd.NA:
        return ""
    if isinstance(value, float) and math.isnan(value):
        return ""
    return value.rstrip()


def build_pair_prompts(chosen_prompt: str, chosen: str, reject: str, joiner: str = "\n") -> Tuple[str, str]:
    """Build the two model inputs (chosen, reject) for one sample.

    Args:
        chosen_prompt: Shared prompt text; a missing value is treated as "".
        chosen: Preferred response; a missing value is treated as "".
        reject: Rejected response; a missing value is treated as "".
        joiner: String inserted between the prompt and each response.

    Returns:
        A ``(prompt_chosen, prompt_reject)`` tuple.
    """
    # Strip trailing whitespace everywhere so the two inputs differ only in
    # the response text, not in incidental padding.
    cp = _text_or_empty(chosen_prompt)
    ch = _text_or_empty(chosen)
    rj = _text_or_empty(reject)

    prompt_chosen = f"{cp}{joiner}{ch}"
    prompt_reject = f"{cp}{joiner}{rj}"
    return prompt_chosen, prompt_reject


# ---------------------------------------------------------------------------
# Load and validate the evaluation data.
# ---------------------------------------------------------------------------
data_path = "/home/data/raw/test/1159-L6_format_full_label_v5.0safe.parquet"
if not os.path.exists(data_path):
    raise FileNotFoundError(f"数据文件不存在:{data_path}")

df = pd.read_parquet(data_path)

required_cols = ["chosen_prompt", "chosen", "reject"]
for c in required_cols:
    if c not in df.columns:
        raise ValueError(
            f"数据缺少必要列 `{c}`,实际列为:{list(df.columns)}"
        )

num_samples = len(df)
print(f"Loaded {num_samples} samples from {data_path}")

# ---------------------------------------------------------------------------
# Load the reward model.
# ---------------------------------------------------------------------------
llm = LLM(model="/home/rm5.0_9e-6", runner="pooling", convert="reward")

results = []  # NOTE(review): never filled in this script; kept for compatibility.
correct_cnt = 0
total_cnt = 0
skipped_cnt = 0  # samples dropped because their batch failed in llm.reward
batch_size = 16

# Split the samples into batches.
num_batches = math.ceil(num_samples / batch_size)

print("\nStart pairwise reward evaluation...\n" + "-" * 70)

for b in range(num_batches):
    start = b * batch_size
    end = min((b + 1) * batch_size, num_samples)
    batch = df.iloc[start:end]

    # Two inputs per sample, interleaved: chosen first, reject second.
    pair_prompts = []
    indices = []  # dataframe index of each sample in this batch
    for idx, row in batch.iterrows():
        prompt_chosen, prompt_reject = build_pair_prompts(
            row["chosen_prompt"], row["chosen"], row["reject"], joiner=""
        )
        pair_prompts.append(prompt_chosen)
        pair_prompts.append(prompt_reject)
        indices.append(idx)

    # Score the whole batch; a failing batch is reported and skipped so one
    # bad batch does not abort the full evaluation.
    try:
        outputs = llm.reward(pair_prompts)
    except Exception as e:
        warnings.warn(f"llm.reward 执行失败(batch={b}):{e}")
        skipped_cnt += len(indices)
        continue

    # Reduce each reward output to one scalar. The outputs are consumed
    # assuming they line up one-to-one with pair_prompts:
    # [sample0_chosen, sample0_reject, sample1_chosen, sample1_reject, ...]
    scalar_scores = []
    for out in outputs:
        # Take the last entry of out.outputs.data and coerce it to a plain
        # float so the comparison and the ":.6f" formatting below work on
        # ordinary numbers.
        # NOTE(review): assumes that last entry holds exactly one value —
        # confirm against the model's pooler output shape.
        score = float(out.outputs.data[-1])
        # A NaN score would make the comparison meaningless; map it to a very
        # low value instead.
        if np.isnan(score):
            score = -1e30
        scalar_scores.append(score)

    # Compare the two scores of every sample and update the running accuracy.
    for i, idx in enumerate(indices):
        chosen_score = scalar_scores[2 * i]
        reject_score = scalar_scores[2 * i + 1]

        total_cnt += 1
        is_correct = chosen_score > reject_score
        correct_cnt += int(is_correct)
        running_acc = correct_cnt / total_cnt

        # Per-sample report.
        print(
            f"[{total_cnt:6d}] "
            f"Chosen={chosen_score:.6f} | Reject={reject_score:.6f} | "
            f"Correct={is_correct} | RunningAcc={running_acc*100:.2f}%"
        )

# =====================
# 4) Summary
# =====================
final_acc = (correct_cnt / total_cnt) if total_cnt > 0 else 0.0
print("\n" + "-" * 70)
print(f"Finished. Total={total_cnt}, Correct={correct_cnt}, "
      f"FinalAcc={final_acc*100:.2f}%")
if skipped_cnt:
    # Make it visible that the accuracy above excludes the failed batches.
    warnings.warn(
        f"{skipped_cnt} samples were skipped because llm.reward failed; "
        "FinalAcc covers only the scored samples."
    )