# Source: obliteratus/scripts/benchmark_gpt_oss_20b.py
# (Hugging Face Hub page artifacts commented out so the file parses as Python:
#  "pliny-the-prompter's picture", "Upload 127 files", commit "45113e6 verified")
#!/usr/bin/env python3
"""OBLITERATUS GPT-OSS 20B Benchmark — Full Method Comparison.
Runs all abliteration methods on openai/gpt-oss-20b and produces a
comprehensive comparison table with:
- Refusal rate (primary metric)
- KL divergence / perplexity (capability preservation)
- Capability probes (knowledge, truthfulness, math reasoning)
- MoE-specific metrics (EGA expert directions, router stability)
- Timing and GPU memory usage
Usage:
python scripts/benchmark_gpt_oss_20b.py
python scripts/benchmark_gpt_oss_20b.py --methods basic surgical optimized nuclear
python scripts/benchmark_gpt_oss_20b.py --prompts 50 --output results.json
python scripts/benchmark_gpt_oss_20b.py --quick # fast mode: 20 prompts, skip slow methods
Designed for T4 16GB (auto 4-bit quantization) or A10G+ (float16).
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import shutil
import sys
import time
from pathlib import Path
# Opt into expandable CUDA segments BEFORE torch initializes its allocator —
# reduces fragmentation-driven OOMs on small GPUs (no-op if the user set it).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import torch
# Ensure the project root is on sys.path so `obliteratus` imports resolve
# when this script is run directly (e.g. `python scripts/benchmark_gpt_oss_20b.py`).
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
from obliteratus.abliterate import ( # noqa: E402
AbliterationPipeline,
METHODS,
HARMFUL_PROMPTS,
HARMLESS_PROMPTS,
)
from obliteratus.evaluation.benchmarks import BenchmarkRunner, format_benchmark_report # noqa: E402
def parse_args():
parser = argparse.ArgumentParser(description="OBLITERATUS GPT-OSS 20B Benchmark")
parser.add_argument(
"--model", default="openai/gpt-oss-20b",
help="Model to benchmark (default: openai/gpt-oss-20b)",
)
parser.add_argument(
"--methods", nargs="+",
default=["basic", "advanced", "surgical", "optimized", "inverted", "nuclear"],
help="Methods to compare",
)
parser.add_argument(
"--prompts", type=int, default=33,
help="Number of prompts per side (harmful/harmless)",
)
parser.add_argument(
"--output", type=str, default=None,
help="Save results JSON to this path",
)
parser.add_argument(
"--quick", action="store_true",
help="Quick mode: 20 prompts, skip aggressive/inverted",
)
parser.add_argument(
"--skip-benchmarks", action="store_true",
help="Skip capability benchmark probes (faster)",
)
parser.add_argument(
"--output-dir", default="/tmp/obliteratus_bench",
help="Directory for temporary model outputs",
)
parser.add_argument(
"--bayesian-trials", type=int, default=30,
help="Number of Bayesian optimization trials for 'optimized' method",
)
return parser.parse_args()
def gpu_info() -> dict:
    """Return the active GPU's name plus total/free memory in GB (CPU fallback)."""
    if not torch.cuda.is_available():
        return {"gpu": "CPU only", "total_gb": 0, "free_gb": 0}
    device_props = torch.cuda.get_device_properties(0)
    free_bytes = torch.cuda.mem_get_info(0)[0]
    return {
        "gpu": torch.cuda.get_device_name(0),
        "total_gb": round(device_props.total_memory / 1e9, 1),
        "free_gb": round(free_bytes / 1e9, 1),
    }
def cleanup():
    """Collect Python garbage and, when CUDA is present, release cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
def run_single_method(
    model_name: str,
    method: str,
    harmful: list[str],
    harmless: list[str],
    output_dir: str,
    run_benchmarks: bool = True,
    bayesian_trials: int = 30,
) -> dict:
    """Run a single abliteration method and collect metrics.

    Builds an AbliterationPipeline for ``method``, runs it, and folds quality
    metrics, MoE-specific statistics, peak GPU memory, and (optionally)
    capability-benchmark scores into one flat, JSON-serializable dict. On
    failure the dict carries ``"error"`` instead of metrics. The per-method
    output directory is always deleted afterwards to reclaim disk space.

    Args:
        model_name: HF model id passed through to the pipeline.
        method: Key into the project-level METHODS registry.
        harmful: Harmful prompt set (contrast side A).
        harmless: Harmless prompt set (contrast side B).
        output_dir: Parent directory; the model is written to ``{output_dir}/{method}``.
        run_benchmarks: When True, also run capability probes via BenchmarkRunner.
        bayesian_trials: Trial budget injected for the "optimized" method.

    Returns:
        dict with at least ``model``/``method``/``label``/``time_seconds``,
        plus metrics on success or ``error`` on failure.
    """
    cleanup()
    outdir = f"{output_dir}/{method}"
    t0 = time.time()
    pipeline = None
    result = {
        "model": model_name,
        "method": method,
        "label": METHODS.get(method, {}).get("label", method),
    }
    try:
        # For the optimized method, we might want to control trial count
        if method == "optimized":
            # Temporarily patch bayesian_trials in the method config
            # NOTE(review): this mutates the global METHODS dict and is never
            # restored — apparently relied upon by the pipeline; confirm.
            METHODS["optimized"]["bayesian_trials"] = bayesian_trials
        pipeline = AbliterationPipeline(
            model_name=model_name,
            output_dir=outdir,
            device="auto",
            dtype="float16",
            method=method,
            harmful_prompts=harmful,
            harmless_prompts=harmless,
            on_log=lambda msg: print(f" {msg}"),
        )
        pipeline.run()
        elapsed = time.time() - t0
        # Core metrics. These read private pipeline attributes — presumably
        # populated by run(); verify against AbliterationPipeline internals.
        result.update({
            "time_seconds": round(elapsed, 1),
            "quality": dict(pipeline._quality_metrics),
            "strong_layers": pipeline._strong_layers,
            "n_strong_layers": len(pipeline._strong_layers),
            "n_directions": pipeline.n_directions,
        })
        # MoE-specific metrics
        if pipeline._expert_directions:
            # Total expert-granular directions across all layers.
            n_expert_dirs = sum(len(d) for d in pipeline._expert_directions.values())
            result["ega_expert_dirs"] = n_expert_dirs
            result["ega_layers"] = len(pipeline._expert_directions)
        if pipeline._expert_safety_scores:
            result["expert_classified_layers"] = len(pipeline._expert_safety_scores)
        if pipeline._cot_preserve_directions:
            result["cot_preserved_layers"] = len(pipeline._cot_preserve_directions)
        if pipeline._float_layer_weights:
            # Keys stringified so the result dict stays JSON-serializable.
            result["float_layer_weights"] = {
                str(k): round(v, 3) for k, v in pipeline._float_layer_weights.items()
            }
        if pipeline._kl_contributions:
            result["kl_contributions"] = {
                str(k): round(v, 6) for k, v in pipeline._kl_contributions.items()
            }
        if pipeline._lora_adapters:
            result["lora_adapters"] = len(pipeline._lora_adapters)
        if pipeline._steering_hooks:
            result["steering_hooks"] = len(pipeline._steering_hooks)
        # GPU memory
        if torch.cuda.is_available():
            result["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)
        # Capability benchmarks (optional)
        if run_benchmarks:
            print("\n Running capability benchmarks...")
            try:
                runner = BenchmarkRunner(
                    pipeline.handle.model,
                    pipeline.handle.tokenizer,
                )
                bench_results = runner.run_all()
                result["benchmarks"] = {
                    name: {
                        "score": round(br.score, 3),
                        "n_correct": br.n_correct,
                        "n_total": br.n_total,
                        "per_category": {
                            k: round(v, 3) for k, v in br.per_category.items()
                        },
                    }
                    for name, br in bench_results.items()
                }
                report = format_benchmark_report(bench_results)
                print(f"\n{report}")
            except Exception as e:
                # Benchmarks are best-effort: record the failure but keep the
                # abliteration result itself.
                print(f" Benchmark probes failed: {e}")
                result["benchmarks"] = {"error": str(e)}
        print(f"\n === {method} complete in {elapsed:.1f}s ===")
        print(f" Quality: {json.dumps(pipeline._quality_metrics, default=str)}")
    except Exception as e:
        # Any pipeline failure is captured in the result so the overall
        # benchmark loop can continue with the remaining methods.
        elapsed = time.time() - t0
        result.update({
            "time_seconds": round(elapsed, 1),
            "error": str(e),
        })
        print(f"\n === {method} FAILED after {elapsed:.1f}s: {e} ===")
        import traceback
        traceback.print_exc()
    # Cleanup saved model to free disk
    shutil.rmtree(outdir, ignore_errors=True)
    if pipeline is not None:
        del pipeline
    cleanup()
    return result
def print_summary_table(results: list[dict]):
"""Print a formatted comparison table."""
print(f"\n{'='*90}")
print("BENCHMARK RESULTS SUMMARY")
print(f"{'='*90}")
# Header
header = (
f"{'Method':<12} {'Time':>7} {'PPL':>8} {'Coher':>7} "
f"{'Refusal':>8} {'Know':>6} {'Truth':>6} {'Math':>6} "
f"{'EGA':>5} {'CoT':>4} {'GPU MB':>7}"
)
print(header)
print("-" * len(header))
for r in results:
method = r["method"]
time_s = f"{r['time_seconds']:.0f}s" if "time_seconds" in r else "N/A"
if "error" in r:
print(f"{method:<12} {time_s:>7} {'FAILED':>8}")
continue
q = r.get("quality", {})
ppl = q.get("perplexity")
coh = q.get("coherence")
ref = q.get("refusal_rate")
gpu = r.get("peak_gpu_mb")
# Benchmark scores
bench = r.get("benchmarks", {})
know = bench.get("knowledge", {}).get("score")
truth = bench.get("truthfulness", {}).get("score")
math = bench.get("math_reasoning", {}).get("score")
# MoE metrics
ega = r.get("ega_expert_dirs", "")
cot = r.get("cot_preserved_layers", "")
ppl_s = f"{ppl:.1f}" if ppl is not None else "N/A"
coh_s = f"{coh:.0%}" if coh is not None else "N/A"
ref_s = f"{ref:.0%}" if ref is not None else "N/A"
know_s = f"{know:.0%}" if know is not None else "N/A"
truth_s = f"{truth:.0%}" if truth is not None else "N/A"
math_s = f"{math:.0%}" if math is not None else "N/A"
gpu_s = f"{gpu:.0f}" if gpu is not None else "N/A"
ega_s = str(ega) if ega else "-"
cot_s = str(cot) if cot else "-"
print(
f"{method:<12} {time_s:>7} {ppl_s:>8} {coh_s:>7} "
f"{ref_s:>8} {know_s:>6} {truth_s:>6} {math_s:>6} "
f"{ega_s:>5} {cot_s:>4} {gpu_s:>7}"
)
print(f"{'='*90}")
# Legend
print("\nLegend:")
print(" PPL = Perplexity (lower = better capability preservation)")
print(" Coher = Coherence score (higher = more coherent text)")
print(" Refusal = Refusal rate on harmful prompts (lower = more abliterated)")
print(" Know = MMLU-style knowledge probe")
print(" Truth = TruthfulQA-style truthfulness probe")
print(" Math = GSM8K-style math reasoning probe")
print(" EGA = Expert-Granular Abliteration directions computed")
print(" CoT = Layers where CoT reasoning was preserved")
print(" GPU MB = Peak GPU memory usage")
def main():
    """CLI entry point: benchmark every requested method and save a JSON report."""
    args = parse_args()
    if args.quick:
        # Quick mode trims the prompt set and trial budget and drops the
        # slowest method variants.
        args.prompts = 20
        args.methods = [m for m in args.methods if m not in ("aggressive", "inverted")]
        args.bayesian_trials = 15

    gpu = gpu_info()
    harmful = HARMFUL_PROMPTS[:args.prompts]
    harmless = HARMLESS_PROMPTS[:args.prompts]

    banner = "=" * 60
    print(banner)
    print(" OBLITERATUS GPT-OSS 20B BENCHMARK")
    print(banner)
    print(f" Model: {args.model}")
    print(f" Methods: {args.methods}")
    print(f" Prompts: {args.prompts} per side")
    print(f" GPU: {gpu['gpu']} ({gpu['total_gb']} GB total, {gpu['free_gb']} GB free)")
    print(f" Benchmarks: {'skip' if args.skip_benchmarks else 'enabled'}")
    if "optimized" in args.methods:
        print(f" Bayesian: {args.bayesian_trials} trials")
    print(banner)

    all_results = []
    for method in args.methods:
        if method not in METHODS:
            print(f"\nSKIP: unknown method '{method}'")
            continue
        rule = "━" * 60
        print(f"\n{rule}")
        print(f" METHOD: {method}{METHODS[method]['label']}")
        print(f"{rule}")
        all_results.append(
            run_single_method(
                model_name=args.model,
                method=method,
                harmful=harmful,
                harmless=harmless,
                output_dir=args.output_dir,
                run_benchmarks=not args.skip_benchmarks,
                bayesian_trials=args.bayesian_trials,
            )
        )

    # Summary table, then persist the raw results.
    print_summary_table(all_results)
    destination = args.output or f"benchmark_gpt_oss_{int(time.time())}.json"
    with open(destination, "w") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nFull results saved to: {destination}")
# Standard script guard: importing this module (e.g. for tests) does not
# trigger a benchmark run.
if __name__ == "__main__":
    main()