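# Requires: transformers>=4.51.0, torch, pandas, pyarrow, tqdm, datasets, sentence-transformers
# (flash-attn is additionally needed for the attn_implementation="flash_attention_2" model load below)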
import os
import math
import pandas as pd
from tqdm import tqdm
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import CrossEncoder
# Hugging Face model id used for both the tokenizer and the model.
MODEL_NAME = "deeppin/Qwen3-Reranker-8B-SequenceClassification"
# Parquet file that holds the evaluation rows.
DATA_PATH = "data/valid.parquet"
# NOTE(review): not referenced by the code visible in this file.
BATCH_SIZE = 8
# Token limit passed as max_length when tokenizing.
MAX_LENGTH = 8192

# Device that tensors are moved to: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Instruction substituted when the caller passes None.
_DEFAULT_INSTRUCTION = (
    "Given a roleplay prompt and recent context, score candidate replies higher "
    "when they stay in character, continue the scene coherently, and feel vivid "
    "and engaging."
)


def format_instruction(instruction, query, doc) -> str:
    """Build the reranker input text for one (query, document) pair.

    The result has three newline-separated fields::

        <Instruct>: {instruction}
        <Query>: {query}
        <Document>: {doc}

    No ChatML system/user wrapper is added around these fields.

    Args:
        instruction: Task description placed in the ``<Instruct>`` field.
            ``None`` selects ``_DEFAULT_INSTRUCTION`` instead of being
            rendered as the literal text ``None``.
        query: Text placed in the ``<Query>`` field.
        doc: Text placed in the ``<Document>`` field.

    Returns:
        The formatted prompt string.
    """
    if instruction is None:
        instruction = _DEFAULT_INSTRUCTION
    return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}"
import re
import re
# --- ChatML patterns ---------------------------------------------------------
# NOTE(review): the six patterns in the next two groups are not referenced by
# the code visible in this file; they are kept so the names stay defined.
_SYS_BLOCK = re.compile(
    r"<\|im_start\|\>\s*system\b.*?<\|im_end\|\>",
    re.I | re.S,
)
_TURN_BLOCK = re.compile(
    r"<\|im_start\|\>\s*(user|assistant)\b\s*(.*?)\s*<\|im_end\|\>",
    re.I | re.S,
)
# Matches leftover ChatML-style markers such as <|im_start|>.
_ANY_CHATML_TAG = re.compile(r"<\|[^|]+?\|>")

_SYS = re.compile(
    r"<\|im_start\|\>\s*system\b(.*?)<\|im_end\|\>",
    re.IGNORECASE | re.DOTALL,
)
_TURN = re.compile(
    r"<\|im_start\|\>\s*(user|assistant)\b(.*?)<\|im_end\|\>",
    re.IGNORECASE | re.DOTALL,
)
_TAG = re.compile(r"<\|[^|]+?\|>")

# Patterns consumed by flatten_chatml.
_START = re.compile(r"<\|im_start\|\>\s*(system|user|assistant)\s*", re.I)
_END = re.compile(r"<\|im_end\|\>", re.I)
_ANY = re.compile(r"<\|[^|>]+?\|>", re.I)
_THINK_BLOCK = re.compile(r"<think>.*?</think>", re.I | re.S)


def flatten_chatml(text: str, keep_think: bool = False, *, single_line: bool = False, sep: str = " ") -> str:
    """Strip ChatML markup from *text* and tidy its whitespace.

    Steps, in order:

    1. ``\\r\\n`` becomes ``\\n``.
    2. ``<think>...</think>`` blocks are dropped unless *keep_think* is true.
    3. ``<|im_start|>role`` openers are removed, every ``<|im_end|>`` becomes
       a newline, and any other ``<|...|>`` marker is removed.
    4. Spaces/tabs around newlines are trimmed, runs of three or more
       newlines shrink to two, and the result is stripped.
    5. When *single_line* is true, remaining line breaks (including a bare
       ``\\r``, U+2028 and U+2029) are replaced by *sep* and runs of
       whitespace collapse to a single space.

    Args:
        text: Raw text; anything that is not a ``str`` yields ``""``.
        keep_think: Keep ``<think>`` blocks instead of deleting them.
        single_line: Collapse the result onto one line.
        sep: Replacement for line breaks when *single_line* is true.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.replace("\r\n", "\n")  # unify line endings
    if not keep_think:
        cleaned = _THINK_BLOCK.sub("", cleaned)

    # Order matters: turn closers become newlines (so segment boundaries
    # survive) before the catch-all pattern removes every remaining marker.
    for pattern, replacement in ((_START, ""), (_END, "\n"), (_ANY, "")):
        cleaned = pattern.sub(replacement, cleaned)

    # Basic whitespace normalisation.
    cleaned = re.sub(r"[ \t]*\n[ \t]*", "\n", cleaned)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned).strip()

    if not single_line:
        return cleaned

    # 1) Every line break (including Unicode separators) -> sep.
    cleaned = re.sub(r"[\n\u2028\u2029]+", sep, cleaned.replace("\r", "\n"))
    # 2) Collapse redundant whitespace (tabs, non-breaking spaces, ...).
    cleaned = re.sub(r"[ \t\u00A0]{2,}", " ", cleaned)
    return re.sub(r"\s{2,}", " ", cleaned).strip()
# def format_instruction(instruction, query, doc):
#     prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
#     suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
#     if instruction is None:
#         instruction = (
#             "Given a roleplay prompt and recent context, score candidate replies higher when they stay in character, continue the scene coherently, and feel vivid and engaging."
#         )
#     output = f"{prefix}<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}{suffix}"
#     return output

# ===== Model and tokenizer =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left", use_fast=False, trust_remote_code=True)
# Truncate on the left as well, matching the padding side.
tokenizer.truncation_side = "left"

# Make sure a pad token exists: reuse EOS when the tokenizer has one,
# otherwise register "<|endoftext|>" as the pad token.
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is None:
        tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})
    else:
        tokenizer.pad_token = tokenizer.eos_token

# Two load paths:
#   * CUDA available -> fp16 + FlashAttention-2 (faster; needs a GPU with
#     flash-attn installed).
#   * otherwise      -> default precision (safer, less prone to NaN).
# The model is then placed on DEVICE, the same device the tokenized inputs are
# moved to, so a CPU-only host no longer fails on a hard-coded "cuda".
if DEVICE == "cuda":
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
    )
else:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
    )
model = model.to(DEVICE).eval()
# Keep the model config's pad id in sync with the tokenizer.
# NOTE(review): presumably the classification head uses it to locate the last
# non-pad token in a padded batch — confirm against the model implementation.
model.config.pad_token_id = tokenizer.pad_token_id
# Instruction text passed to format_instruction for every scored pair.
TASK = "Given a roleplay prompt and recent context, score candidate replies higher when they stay in character, continue the scene coherently, and feel vivid and engaging."

# ===== Load and clean the data =====
df = pd.read_parquet(DATA_PATH)
need_cols = ["chosen_prompt", "chosen", "reject"]

# Fail fast on the first required column that the file does not provide.
missing_col = next((name for name in need_cols if name not in df.columns), None)
if missing_col is not None:
    raise ValueError(f"缺少必要列：{missing_col}")

def norm_text(x) -> str:
    """Convert a raw cell value to stripped text, mapping missing values to "".

    Args:
        x: Any cell value. ``None``, ``pd.NA`` and float NaN count as missing.

    Returns:
        ``""`` for a missing value, otherwise ``str(x)`` with surrounding
        whitespace removed.
    """
    # pd.NA is not a float, so it needs its own check; without it the value
    # would be stringified to "<NA>" and treated as real text.
    if x is None or x is pd.NA:
        return ""
    if isinstance(x, float) and math.isnan(x):
        return ""
    return str(x).strip()

df = df[need_cols].copy()


def _clean_cell(value):
    """Normalise one raw cell: missing -> "", strip ChatML tags, single line."""
    # sep="" joins the lines with nothing in between; use sep=" " for a space.
    return flatten_chatml(norm_text(value), single_line=True, sep="")


for col in need_cols:
    df[col] = df[col].map(_clean_cell)

# Keep only rows where prompt, chosen and reject are all non-empty.
non_empty = [df[name].str.len() > 0 for name in ("chosen_prompt", "chosen", "reject")]
mask = non_empty[0] & non_empty[1] & non_empty[2]
df = df[mask].reset_index(drop=True)

total = len(df)
if not total:
    raise ValueError("过滤后无有效样本。请检查数据内容。")
print(f"[Info] 有效样本数: {total}")

# ---------- Inference: score the (chosen, reject) pair of each sample ----------
correct = 0    # samples where the chosen reply scored strictly higher
seen = 0       # samples scored so far
nan_pairs = 0  # samples where at least one logit was NaN

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring (per-sample)"):
    q_clean = row["chosen_prompt"]
    c_clean = row["chosen"]
    r_clean = row["reject"]

    p1 = format_instruction(TASK, q_clean, c_clean)  # chosen
    p2 = format_instruction(TASK, q_clean, r_clean)  # reject

    # Both candidates go through the model together as a batch of two.
    enc = tokenizer([p1, p2], padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    with torch.no_grad():
        # Expected shape [2]: one score per candidate (assumes a single-logit head).
        logits = model(**enc).logits.squeeze(-1)

    l1, l2 = float(logits[0]), float(logits[1])
    # Any comparison with NaN is False, so such a pair is counted as incorrect;
    # tally it so the failure does not go unnoticed.
    if math.isnan(l1) or math.isnan(l2):
        nan_pairs += 1
    is_correct = (l1 > l2)  # flip to (l1 < l2) if the score direction is reversed

    correct += int(is_correct)
    seen += 1
    # tqdm.write prints the line without corrupting the active progress bar.
    tqdm.write(f"[{idx}] logits={[l1, l2]} | first>second={is_correct} | running_acc={correct/seen:.2%} ({correct}/{seen})")

print(f"\n[Result] Total={seen} | Correct={correct} | Accuracy={correct/seen:.2%}")
if nan_pairs:
    print(f"[Warn] {nan_pairs} pair(s) produced NaN logits and were counted as incorrect.")