| | from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig |
| | import torch |
| | import pandas as pd |
| | import wandb |
| |
|
| | |
# Start a Weights & Biases run to log reward-model evaluation results.
wandb.init(
    project="reward_model_scoring",
    name="fomatted_5e-6_1500",  # NOTE(review): "fomatted" looks like a typo for "formatted" — confirm before renaming the run
)

# Path to the HF-format reward-model checkpoint (converted from a DeepSpeed step).
rm_path = "/home/ckpt/5e-6/global_step180_hf"

tokenizer = AutoTokenizer.from_pretrained(rm_path)

# Force a single output label so the classification head emits one scalar
# reward per sequence.
config = AutoConfig.from_pretrained(rm_path)
config.num_labels = 1

# device_map="auto" lets accelerate place/shard the model across available devices.
model = AutoModelForSequenceClassification.from_pretrained(
    rm_path,
    config=config,
    device_map="auto"
)
model.eval()
| |
|
| | |
def get_reward_score(texts):
    """Score a batch of texts with the reward model.

    Args:
        texts: a string, or list of strings, to be scored.

    Returns:
        A list of scalar reward scores (Python floats), one per input text.
    """
    # NOTE(review): padding=True assumes the tokenizer has a pad token set — confirm.
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=8192,
    )
    encoded = encoded.to(model.device)
    with torch.no_grad():
        logits = model(**encoded).logits
    # Single-label head: squeeze the trailing label dim to get one score per text.
    return logits.squeeze(-1).float().cpu().tolist()
| |
|
| | |
# Load the evaluation pairs and take a fixed, reproducible sample of 1500 rows.
eval_frame = pd.read_parquet("/home/data/formatted_test.parquet")
eval_frame = eval_frame.sample(n=1500, random_state=42)
df = eval_frame.reset_index(drop=True)
| |
|
def format_input(prompt, answer):
    """Concatenate a prompt and an answer into a single model input string."""
    full_text = prompt + answer
    return full_text
| |
|
# Build the full model inputs: the shared prompt followed by each candidate answer.
chosen_texts = []
rejected_texts = []
for prompt, good_answer, bad_answer in zip(df["chosen_prompt"], df["chosen"], df["reject"]):
    chosen_texts.append(format_input(prompt, good_answer))
    rejected_texts.append(format_input(prompt, bad_answer))
| |
|
| | |
# Per-sample accumulators for the final summary statistics.
chosen_scores = []
rejected_scores = []
accs = []

# W&B table collecting one row per evaluated pair.
table_columns = [
    "index", "prompt", "chosen", "rejected",
    "chosen_score", "rejected_score", "delta_score", "acc",
]
sample_table = wandb.Table(columns=table_columns)
| |
|
| | |
# Score chosen/rejected pairs in mini-batches and record per-sample results.
batch_size = 16
for start in range(0, len(chosen_texts), batch_size):
    batch_chosen_scores = get_reward_score(chosen_texts[start:start + batch_size])
    batch_rejected_scores = get_reward_score(rejected_texts[start:start + batch_size])

    for offset, (c_score, r_score) in enumerate(zip(batch_chosen_scores, batch_rejected_scores)):
        idx = start + offset
        delta = c_score - r_score
        # A pair counts as correct when the chosen answer outscores the rejected one.
        acc = int(delta > 0)

        chosen_scores.append(c_score)
        rejected_scores.append(r_score)
        accs.append(acc)
        current_accuracy = sum(accs) / len(accs)
        print(f"[{idx}] acc={acc}, chosen_reward={c_score:.3f}, reject_reward={r_score:.3f} | 当前平均准确率: {current_accuracy:.3f}")

        sample_table.add_data(
            idx,
            df.loc[idx, "chosen_prompt"],
            df.loc[idx, "chosen"],
            df.loc[idx, "reject"],
            c_score,
            r_score,
            delta,
            acc,
        )
| |
|
| | |
# Attach per-sample results to the frame for the final aggregate statistics.
df["chosen_score"] = chosen_scores
df["rejected_score"] = rejected_scores
df["delta_score"] = df["chosen_score"] - df["rejected_score"]
df["acc"] = accs

# Aggregate metrics, keyed by the names used for W&B logging.
summary = {
    "final_accuracy": df["acc"].mean(),
    "mean_chosen_score": df["chosen_score"].mean(),
    "mean_rejected_score": df["rejected_score"].mean(),
    "mean_delta_score": df["delta_score"].mean(),
}

print(f"\n✅ Reward Model Accuracy = {summary['final_accuracy']:.3f}")
print(
    f"📊 mean_chosen = {summary['mean_chosen_score']:.3f}, "
    f"mean_rejected = {summary['mean_rejected_score']:.3f}, "
    f"mean_delta = {summary['mean_delta_score']:.3f}"
)

wandb.log({
    "samples_table": sample_table,
    **summary,
})

wandb.finish()
| |
|