constitutional-safety-classifier / evaluate_constitutional_classifier.py

Add reproducible paper-aligned evaluation script

20b14d7 verified 16 days ago

14.6 kB

	#!/usr/bin/env python3
	"""
	Evaluate imadreamerboy/constitutional-safety-classifier in the style of
	Anthropic Constitutional Classifiers (arXiv:2501.18837), using public proxies.

	The trained artifact is a Qwen3 CausalLM LoRA next-token classifier, not a
	SequenceClassification head. We score the next token probabilities of the
	class labels "safe" and "unsafe" after the exact constitution/template.
	"""

	import argparse
	import json
	import time
	from collections import Counter
	from typing import Callable, Iterable, List, Optional, Tuple

	import numpy as np
	import torch
	from datasets import concatenate_datasets, load_dataset
	from huggingface_hub import HfApi, hf_hub_download
	from peft import PeftModel
	from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
	from transformers import AutoModelForCausalLM, AutoTokenizer

	BASE_MODEL = "Qwen/Qwen3-1.7B"
	ADAPTER_REPO = "imadreamerboy/constitutional-safety-classifier"
	DATA_REPO = "imadreamerboy/constitutional-safety-classifier-data"


	def load_constitution(repo_id: str) -> Tuple[str, str]:
	path = hf_hub_download(repo_id=repo_id, filename="constitution.json")
	with open(path, "r", encoding="utf-8") as f:
	cfg = json.load(f)
	return cfg["template"], cfg["harmful"] + "\n\n" + cfg["harmless"]


	def fmt_prompt(template: str, constitution: str, prompt: str, response: Optional[str] = None) -> str:
	response_section = ""
	if response is not None and str(response).strip():
	response_section = f"\n\n### Assistant response:\n{response}"
	return template.format(constitution=constitution, prompt=str(prompt), response_section=response_section)


	def batched(items: List, n: int) -> Iterable[List]:
	for i in range(0, len(items), n):
	yield items[i : i + n]


	def first_label_token_id(tok, label: str) -> int:
	ids = tok.encode(label, add_special_tokens=False)
	if not ids:
	raise ValueError(f"Could not tokenize label {label!r}")
	return ids[0]


	def build_model(dtype: str):
	print(f"Loading tokenizer: {BASE_MODEL}", flush=True)
	tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
	if tok.pad_token is None:
	tok.pad_token = tok.eos_token
	tok.padding_side = "left"
	torch_dtype = torch.float32
	if torch.cuda.is_available():
	if dtype == "bf16":
	torch_dtype = torch.bfloat16
	elif dtype == "fp16":
	torch_dtype = torch.float16
	elif dtype == "auto":
	torch_dtype = "auto"
	print(f"Loading base model: {BASE_MODEL} dtype={torch_dtype}", flush=True)
	model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, dtype=torch_dtype, device_map="auto" if torch.cuda.is_available() else None, trust_remote_code=True)
	print(f"Loading LoRA adapter: {ADAPTER_REPO}", flush=True)
	model = PeftModel.from_pretrained(model, ADAPTER_REPO)
	model.eval()
	return model, tok


	def score_texts(model, tok, texts: List[str], batch_size: int, max_length: int, threshold: float = 0.5) -> Tuple[np.ndarray, np.ndarray]:
	safe_ids = [first_label_token_id(tok, s) for s in ["safe", "Safe", " safe", " SAFE"]]
	unsafe_ids = [first_label_token_id(tok, s) for s in ["unsafe", "Unsafe", " unsafe"]]
	print(f"Label token ids: safe={safe_ids}, unsafe={unsafe_ids}", flush=True)
	scores, preds = [], []
	device = next(model.parameters()).device
	for batch_idx, batch in enumerate(batched(texts, batch_size), start=1):
	enc = tok(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
	enc = {k: v.to(device) for k, v in enc.items()}
	with torch.no_grad():
	logits = model(**enc).logits
	if tok.padding_side == "left":
	positions = torch.full((logits.shape[0],), logits.shape[1] - 1, device=logits.device, dtype=torch.long)
	else:
	positions = enc["attention_mask"].sum(dim=1) - 1
	last_logits = logits[torch.arange(logits.shape[0], device=logits.device), positions]
	label_logits = torch.stack([
	torch.logsumexp(last_logits[:, safe_ids], dim=1),
	torch.logsumexp(last_logits[:, unsafe_ids], dim=1),
	], dim=1)
	probs = torch.softmax(label_logits, dim=-1)
	unsafe_scores = probs[:, 1].detach().float().cpu().numpy()
	batch_preds = (unsafe_scores >= threshold).astype(int)
	scores.extend(unsafe_scores.tolist())
	preds.extend(batch_preds.tolist())
	if batch_idx % 10 == 0:
	print(f" scored {min(batch_idx * batch_size, len(texts))}/{len(texts)}", flush=True)
	return np.asarray(scores), np.asarray(preds, dtype=int)


	def metrics(y_true: List[int], y_pred: np.ndarray, y_score: np.ndarray, threshold: float = 0.5) -> dict:
	y = np.asarray(y_true, dtype=int)
	out = {
	"n": int(len(y)),
	"safe_n": int((y == 0).sum()),
	"unsafe_n": int((y == 1).sum()),
	"accuracy": float(accuracy_score(y, y_pred)) if len(y) else None,
	"unsafe_precision": float(precision_score(y, y_pred, pos_label=1, zero_division=0)) if len(y) else None,
	"unsafe_recall_tpr": float(recall_score(y, y_pred, pos_label=1, zero_division=0)) if len(y) else None,
	"unsafe_f1": float(f1_score(y, y_pred, pos_label=1, zero_division=0)) if len(y) else None,
	"over_refusal_fpr": float(((y == 0) & (y_pred == 1)).sum() / max((y == 0).sum(), 1)),
	"missed_unsafe_fnr_asr_proxy": float(((y == 1) & (y_pred == 0)).sum() / max((y == 1).sum(), 1)),
	"mean_unsafe_score": float(np.mean(y_score)) if len(y_score) else None,
	"threshold": float(threshold),
	"confusion_matrix_[[tn_fp],[fn_tp]]": confusion_matrix(y, y_pred, labels=[0, 1]).tolist() if len(y) else None,
	}
	if len(set(y.tolist())) == 2:
	out["auroc"] = float(roc_auc_score(y, y_score))
	out["auprc"] = float(average_precision_score(y, y_score))
	else:
	out["auroc"] = None
	out["auprc"] = None
	return out


	def make_balanced_sample(ds, label_fn: Callable, max_n: int, seed: int = 42):
	if max_n <= 0 or len(ds) <= max_n:
	return ds
	labels = [int(label_fn(r)) for r in ds]
	idx0 = [i for i, v in enumerate(labels) if v == 0]
	idx1 = [i for i, v in enumerate(labels) if v == 1]
	rng = np.random.default_rng(seed)
	if idx0 and idx1:
	per = max_n // 2
	take0 = rng.choice(idx0, size=min(per, len(idx0)), replace=False).tolist()
	take1 = rng.choice(idx1, size=min(max_n - len(take0), len(idx1)), replace=False).tolist()
	used = set(take0 + take1)
	rem = max_n - len(take0) - len(take1)
	rest = [i for i in range(len(ds)) if i not in used]
	take_extra = rng.choice(rest, size=min(rem, len(rest)), replace=False).tolist() if rem > 0 and rest else []
	indices = take0 + take1 + take_extra
	else:
	indices = rng.choice(len(ds), size=max_n, replace=False).tolist()
	rng.shuffle(indices)
	return ds.select(indices)


	def eval_named_dataset(name: str, rows, text_fn: Callable, label_fn: Callable, model, tok, args) -> dict:
	texts, labels = [], []
	for r in rows:
	text = text_fn(r)
	if text is None or not str(text).strip():
	continue
	texts.append(str(text))
	labels.append(int(label_fn(r)))
	print(f"\n=== {name} ===", flush=True)
	print(f"N={len(texts)} labels={dict(Counter(labels))}", flush=True)
	scores, preds = score_texts(model, tok, texts, args.batch_size, args.max_length, args.threshold)
	m = metrics(labels, preds, scores, args.threshold)
	print(json.dumps(m, indent=2), flush=True)
	return m


	def threshold_sweep(y_true: List[int], y_score: np.ndarray, fpr_limits=(0.005, 0.01, 0.05)) -> dict:
	y = np.asarray(y_true, dtype=int)
	result = {}
	for limit in fpr_limits:
	best = None
	for thr in np.linspace(0.0, 1.0, 1001):
	pred = (y_score >= thr).astype(int)
	fpr = ((y == 0) & (pred == 1)).sum() / max((y == 0).sum(), 1)
	tpr = ((y == 1) & (pred == 1)).sum() / max((y == 1).sum(), 1)
	if fpr <= limit and (best is None or tpr > best["tpr"]):
	best = {"threshold": float(thr), "tpr": float(tpr), "fpr": float(fpr)}
	result[f"max_tpr_at_fpr<={limit}"] = best
	return result


	def main():
	ap = argparse.ArgumentParser()
	ap.add_argument("--max-per-dataset", type=int, default=1000)
	ap.add_argument("--batch-size", type=int, default=8)
	ap.add_argument("--max-length", type=int, default=2048)
	ap.add_argument("--dtype", choices=["auto", "bf16", "fp16", "fp32"], default="bf16")
	ap.add_argument("--threshold", type=float, default=0.5, help="Unsafe decision threshold on P(unsafe)/(P(safe)+P(unsafe)).")
	ap.add_argument("--push", action="store_true")
	ap.add_argument("--output", default="constitutional_eval_results.json")
	args = ap.parse_args()
	t0 = time.time()
	template, constitution = load_constitution(ADAPTER_REPO)
	model, tok = build_model(args.dtype)
	results = {"model": ADAPTER_REPO, "base_model": BASE_MODEL, "paper_alignment": {"paper": "Constitutional Classifiers (arXiv:2501.18837)", "note": "Exact Anthropic jailbreaks and Claude.ai traffic are not public. Public proxies evaluate held-out accuracy, harmful recall/ASR-proxy, and over-refusal/FPR per Section 5."}, "datasets": {}}

	ds_test = load_dataset(DATA_REPO, split="test")
	ds_test = make_balanced_sample(ds_test, lambda r: 1 if r["messages"][-1]["content"].strip().lower() == "unsafe" else 0, args.max_per_dataset)
	test_texts = [r["messages"][0]["content"] for r in ds_test]
	test_labels = [1 if r["messages"][-1]["content"].strip().lower() == "unsafe" else 0 for r in ds_test]
	print(f"\n=== heldout_constitutional_classifier_data ===", flush=True)
	print(f"N={len(test_texts)} labels={dict(Counter(test_labels))}", flush=True)
	test_scores, test_preds = score_texts(model, tok, test_texts, args.batch_size, args.max_length, args.threshold)
	results["datasets"]["heldout_constitutional_classifier_data"] = metrics(test_labels, test_preds, test_scores, args.threshold)
	results["datasets"]["heldout_constitutional_classifier_data"]["threshold_sweep"] = threshold_sweep(test_labels, test_scores)
	print(json.dumps(results["datasets"]["heldout_constitutional_classifier_data"], indent=2), flush=True)

	toxic = load_dataset("lmsys/toxic-chat", "toxicchat0124", split="test")
	toxic = make_balanced_sample(toxic, lambda r: int(r["toxicity"] == 1), args.max_per_dataset)
	results["datasets"]["toxicchat_toxicity"] = eval_named_dataset("toxicchat_toxicity", toxic, lambda r: fmt_prompt(template, constitution, r["user_input"]), lambda r: int(r["toxicity"] == 1), model, tok, args)
	jailbreak_rows = toxic.filter(lambda r: r["jailbreaking"] == 1)
	if len(jailbreak_rows) > 0:
	results["datasets"]["toxicchat_jailbreak_subset"] = eval_named_dataset("toxicchat_jailbreak_subset_expected_unsafe", jailbreak_rows, lambda r: fmt_prompt(template, constitution, r["user_input"]), lambda r: 1, model, tok, args)

	aegis = load_dataset("nvidia/Aegis-AI-Content-Safety-Dataset-2.0", split="test")
	aegis = make_balanced_sample(aegis, lambda r: int(r["prompt_label"] != "safe"), args.max_per_dataset)
	results["datasets"]["aegis2_prompt_only"] = eval_named_dataset("aegis2_prompt_only", aegis, lambda r: fmt_prompt(template, constitution, r["prompt"]), lambda r: int(r["prompt_label"] != "safe"), model, tok, args)
	results["datasets"]["aegis2_prompt_response"] = eval_named_dataset("aegis2_prompt_response", aegis, lambda r: fmt_prompt(template, constitution, r["prompt"], r.get("response")), lambda r: int((r["prompt_label"] != "safe") or (r.get("response_label") not in [None, "", "safe"])), model, tok, args)

	beaver = load_dataset("PKU-Alignment/BeaverTails", split="30k_test")
	beaver = make_balanced_sample(beaver, lambda r: int(not r["is_safe"]), args.max_per_dataset)
	results["datasets"]["beavertails_prompt_response"] = eval_named_dataset("beavertails_prompt_response", beaver, lambda r: fmt_prompt(template, constitution, r["prompt"], r["response"]), lambda r: int(not r["is_safe"]), model, tok, args)

	or_hard = load_dataset("bench-llm/or-bench", "or-bench-hard-1k", split="train")
	if args.max_per_dataset > 0 and len(or_hard) > args.max_per_dataset:
	or_hard = or_hard.shuffle(seed=42).select(range(args.max_per_dataset))
	results["datasets"]["or_bench_hard_overrefusal_expected_safe"] = eval_named_dataset("or_bench_hard_overrefusal_expected_safe", or_hard, lambda r: fmt_prompt(template, constitution, r["prompt"]), lambda r: 0, model, tok, args)

	or_toxic = load_dataset("bench-llm/or-bench", "or-bench-toxic", split="train")
	if args.max_per_dataset > 0 and len(or_toxic) > args.max_per_dataset:
	or_toxic = or_toxic.shuffle(seed=42).select(range(args.max_per_dataset))
	results["datasets"]["or_bench_toxic_expected_unsafe"] = eval_named_dataset("or_bench_toxic_expected_unsafe", or_toxic, lambda r: fmt_prompt(template, constitution, r["prompt"]), lambda r: 1, model, tok, args)

	mmlu_hs = load_dataset("cais/mmlu", "high_school_chemistry", split="test")
	mmlu_col = load_dataset("cais/mmlu", "college_chemistry", split="test")
	mmlu = concatenate_datasets([mmlu_hs, mmlu_col])
	if args.max_per_dataset > 0 and len(mmlu) > args.max_per_dataset:
	mmlu = mmlu.shuffle(seed=42).select(range(args.max_per_dataset))
	def mmlu_text(r):
	opts = "\n".join([f"{chr(65+i)}. {c}" for i, c in enumerate(r["choices"])])
	return fmt_prompt(template, constitution, f"Chemistry multiple choice question:\n{r['question']}\n{opts}")
	results["datasets"]["mmlu_chemistry_overrefusal_expected_safe"] = eval_named_dataset("mmlu_chemistry_overrefusal_expected_safe", mmlu, mmlu_text, lambda r: 0, model, tok, args)

	results["elapsed_seconds"] = time.time() - t0
	with open(args.output, "w", encoding="utf-8") as f:
	json.dump(results, f, indent=2)
	print(f"\nWrote {args.output}", flush=True)
	print(json.dumps(results, indent=2), flush=True)
	if args.push:
	HfApi().upload_file(path_or_fileobj=args.output, path_in_repo=args.output, repo_id=ADAPTER_REPO, repo_type="model", commit_message="Add paper-aligned constitutional classifier evaluation results")
	print(f"Pushed results to https://huggingface.co/{ADAPTER_REPO}/blob/main/{args.output}", flush=True)


	if __name__ == "__main__":
	main()