# Source: obliteratus/scripts/benchmark_gpt_oss_20b.py
# (Hugging Face Hub page artifacts commented out so the file parses as Python:
#  "pliny-the-prompter's picture", "Upload 127 files", commit "45113e6 verified")
#!/usr/bin/env python3
"""OBLITERATUS GPT-OSS 20B Benchmark — Full Method Comparison.
Runs all abliteration methods on openai/gpt-oss-20b and produces a
comprehensive comparison table with:
- Refusal rate (primary metric)
- KL divergence / perplexity (capability preservation)
- Capability probes (knowledge, truthfulness, math reasoning)
- MoE-specific metrics (EGA expert directions, router stability)
- Timing and GPU memory usage
Usage:
python scripts/benchmark_gpt_oss_20b.py
python scripts/benchmark_gpt_oss_20b.py --methods basic surgical optimized nuclear
python scripts/benchmark_gpt_oss_20b.py --prompts 50 --output results.json
python scripts/benchmark_gpt_oss_20b.py --quick # fast mode: 20 prompts, skip slow methods
Designed for T4 16GB (auto 4-bit quantization) or A10G+ (float16).
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import shutil
import sys
import time
from pathlib import Path
# Opt into expandable CUDA segments BEFORE torch initializes its allocator —
# reduces fragmentation-driven OOMs on small GPUs (no-op if the user set it).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import torch
# Ensure the project root is on sys.path so `obliteratus` imports resolve
# when this script is run directly (e.g. `python scripts/benchmark_gpt_oss_20b.py`).
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
from obliteratus.abliterate import ( # noqa: E402
AbliterationPipeline,
METHODS,
HARMFUL_PROMPTS,
HARMLESS_PROMPTS,
)
from obliteratus.evaluation.benchmarks import BenchmarkRunner, format_benchmark_report # noqa: E402
def parse_args():
parser = argparse.ArgumentParser(description="OBLITERATUS GPT-OSS 20B Benchmark")
parser.add_argument(
"--model", default="openai/gpt-oss-20b",
help="Model to benchmark (default: openai/gpt-oss-20b)",
)
parser.add_argument(
"--methods", nargs="+",
default=["basic", "advanced", "surgical", "optimized", "inverted", "nuclear"],
help="Methods to compare",
)
parser.add_argument(
"--prompts", type=int, default=33,
help="Number of prompts per side (harmful/harmless)",
)
parser.add_argument(
"--output", type=str, default=None,
help="Save results JSON to this path",
)
parser.add_argument(
"--quick", action="store_true",
help="Quick mode: 20 prompts, skip aggressive/inverted",
)
parser.add_argument(
"--skip-benchmarks", action="store_true",
help="Skip capability benchmark probes (faster)",
)
parser.add_argument(
"--output-dir", default="/tmp/obliteratus_bench",
help="Directory for temporary model outputs",
)
parser.add_argument(
"--bayesian-trials", type=int, default=30,
help="Number of Bayesian optimization trials for 'optimized' method",
)
return parser.parse_args()
def gpu_info() -> dict:
    """Return the active GPU's name plus total/free memory in GB (CPU fallback)."""
    if not torch.cuda.is_available():
        return {"gpu": "CPU only", "total_gb": 0, "free_gb": 0}
    device_props = torch.cuda.get_device_properties(0)
    free_bytes = torch.cuda.mem_get_info(0)[0]
    return {
        "gpu": torch.cuda.get_device_name(0),
        "total_gb": round(device_props.total_memory / 1e9, 1),
        "free_gb": round(free_bytes / 1e9, 1),
    }
def cleanup():
    """Collect Python garbage and, when CUDA is present, release cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
def run_single_method(
    model_name: str,
    method: str,
    harmful: list[str],
    harmless: list[str],
    output_dir: str,
    run_benchmarks: bool = True,
    bayesian_trials: int = 30,
) -> dict:
    """Run a single abliteration method and collect metrics.

    Builds an AbliterationPipeline for ``method``, runs it, and folds quality
    metrics, MoE-specific statistics, peak GPU memory, and (optionally)
    capability-benchmark scores into one flat, JSON-serializable dict. On
    failure the dict carries ``"error"`` instead of metrics. The per-method
    output directory is always deleted afterwards to reclaim disk space.

    Args:
        model_name: HF model id passed through to the pipeline.
        method: Key into the project-level METHODS registry.
        harmful: Harmful prompt set (contrast side A).
        harmless: Harmless prompt set (contrast side B).
        output_dir: Parent directory; the model is written to ``{output_dir}/{method}``.
        run_benchmarks: When True, also run capability probes via BenchmarkRunner.
        bayesian_trials: Trial budget injected for the "optimized" method.

    Returns:
        dict with at least ``model``/``method``/``label``/``time_seconds``,
        plus metrics on success or ``error`` on failure.
    """
    cleanup()
    outdir = f"{output_dir}/{method}"
    t0 = time.time()
    pipeline = None
    result = {
        "model": model_name,
        "method": method,
        "label": METHODS.get(method, {}).get("label", method),
    }
    try:
        # For the optimized method, we might want to control trial count
        if method == "optimized":
            # Temporarily patch bayesian_trials in the method config
            # NOTE(review): this mutates the global METHODS dict and is never
            # restored — apparently relied upon by the pipeline; confirm.
            METHODS["optimized"]["bayesian_trials"] = bayesian_trials
        pipeline = AbliterationPipeline(
            model_name=model_name,
            output_dir=outdir,
            device="auto",
            dtype="float16",
            method=method,
            harmful_prompts=harmful,
            harmless_prompts=harmless,
            on_log=lambda msg: print(f" {msg}"),
        )
        pipeline.run()
        elapsed = time.time() - t0
        # Core metrics. These read private pipeline attributes — presumably
        # populated by run(); verify against AbliterationPipeline internals.
        result.update({
            "time_seconds": round(elapsed, 1),
            "quality": dict(pipeline._quality_metrics),
            "strong_layers": pipeline._strong_layers,
            "n_strong_layers": len(pipeline._strong_layers),
            "n_directions": pipeline.n_directions,
        })
        # MoE-specific metrics
        if pipeline._expert_directions:
            # Total expert-granular directions across all layers.
            n_expert_dirs = sum(len(d) for d in pipeline._expert_directions.values())
            result["ega_expert_dirs"] = n_expert_dirs
            result["ega_layers"] = len(pipeline._expert_directions)
        if pipeline._expert_safety_scores:
            result["expert_classified_layers"] = len(pipeline._expert_safety_scores)
        if pipeline._cot_preserve_directions:
            result["cot_preserved_layers"] = len(pipeline._cot_preserve_directions)
        if pipeline._float_layer_weights:
            # Keys stringified so the result dict stays JSON-serializable.
            result["float_layer_weights"] = {
                str(k): round(v, 3) for k, v in pipeline._float_layer_weights.items()
            }
        if pipeline._kl_contributions:
            result["kl_contributions"] = {
                str(k): round(v, 6) for k, v in pipeline._kl_contributions.items()
            }
        if pipeline._lora_adapters:
            result["lora_adapters"] = len(pipeline._lora_adapters)
        if pipeline._steering_hooks:
            result["steering_hooks"] = len(pipeline._steering_hooks)
        # GPU memory
        if torch.cuda.is_available():
            result["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)
        # Capability benchmarks (optional)
        if run_benchmarks:
            print("\n Running capability benchmarks...")
            try:
                runner = BenchmarkRunner(
                    pipeline.handle.model,
                    pipeline.handle.tokenizer,
                )
                bench_results = runner.run_all()
                result["benchmarks"] = {
                    name: {
                        "score": round(br.score, 3),
                        "n_correct": br.n_correct,
                        "n_total": br.n_total,
                        "per_category": {
                            k: round(v, 3) for k, v in br.per_category.items()
                        },
                    }
                    for name, br in bench_results.items()
                }
                report = format_benchmark_report(bench_results)
                print(f"\n{report}")
            except Exception as e:
                # Benchmarks are best-effort: record the failure but keep the
                # abliteration result itself.
                print(f" Benchmark probes failed: {e}")
                result["benchmarks"] = {"error": str(e)}
        print(f"\n === {method} complete in {elapsed:.1f}s ===")
        print(f" Quality: {json.dumps(pipeline._quality_metrics, default=str)}")
    except Exception as e:
        # Any pipeline failure is captured in the result so the overall
        # benchmark loop can continue with the remaining methods.
        elapsed = time.time() - t0
        result.update({
            "time_seconds": round(elapsed, 1),
            "error": str(e),
        })
        print(f"\n === {method} FAILED after {elapsed:.1f}s: {e} ===")
        import traceback
        traceback.print_exc()
    # Cleanup saved model to free disk
    shutil.rmtree(outdir, ignore_errors=True)
    if pipeline is not None:
        del pipeline
    cleanup()
    return result
def print_summary_table(results: list[dict]):
"""Print a formatted comparison table."""
print(f"\n{'='*90}")
print("BENCHMARK RESULTS SUMMARY")
print(f"{'='*90}")
# Header
header = (
f"{'Method':<12} {'Time':>7} {'PPL':>8} {'Coher':>7} "
f"{'Refusal':>8} {'Know':>6} {'Truth':>6} {'Math':>6} "
f"{'EGA':>5} {'CoT':>4} {'GPU MB':>7}"
)
print(header)
print("-" * len(header))
for r in results:
method = r["method"]
time_s = f"{r['time_seconds']:.0f}s" if "time_seconds" in r else "N/A"
if "error" in r:
print(f"{method:<12} {time_s:>7} {'FAILED':>8}")
continue
q = r.get("quality", {})
ppl = q.get("perplexity")
coh = q.get("coherence")
ref = q.get("refusal_rate")
gpu = r.get("peak_gpu_mb")
# Benchmark scores
bench = r.get("benchmarks", {})
know = bench.get("knowledge", {}).get("score")
truth = bench.get("truthfulness", {}).get("score")
math = bench.get("math_reasoning", {}).get("score")
# MoE metrics
ega = r.get("ega_expert_dirs", "")
cot = r.get("cot_preserved_layers", "")
ppl_s = f"{ppl:.1f}" if ppl is not None else "N/A"
coh_s = f"{coh:.0%}" if coh is not None else "N/A"
ref_s = f"{ref:.0%}" if ref is not None else "N/A"
know_s = f"{know:.0%}" if know is not None else "N/A"
truth_s = f"{truth:.0%}" if truth is not None else "N/A"
math_s = f"{math:.0%}" if math is not None else "N/A"
gpu_s = f"{gpu:.0f}" if gpu is not None else "N/A"
ega_s = str(ega) if ega else "-"
cot_s = str(cot) if cot else "-"
print(
f"{method:<12} {time_s:>7} {ppl_s:>8} {coh_s:>7} "
f"{ref_s:>8} {know_s:>6} {truth_s:>6} {math_s:>6} "
f"{ega_s:>5} {cot_s:>4} {gpu_s:>7}"
)
print(f"{'='*90}")
# Legend
print("\nLegend:")
print(" PPL = Perplexity (lower = better capability preservation)")
print(" Coher = Coherence score (higher = more coherent text)")
print(" Refusal = Refusal rate on harmful prompts (lower = more abliterated)")
print(" Know = MMLU-style knowledge probe")
print(" Truth = TruthfulQA-style truthfulness probe")
print(" Math = GSM8K-style math reasoning probe")
print(" EGA = Expert-Granular Abliteration directions computed")
print(" CoT = Layers where CoT reasoning was preserved")
print(" GPU MB = Peak GPU memory usage")
def main():
    """CLI entry point: benchmark every requested method and save a JSON report."""
    args = parse_args()
    if args.quick:
        # Quick mode trims the prompt set and trial budget and drops the
        # slowest method variants.
        args.prompts = 20
        args.methods = [m for m in args.methods if m not in ("aggressive", "inverted")]
        args.bayesian_trials = 15

    gpu = gpu_info()
    harmful = HARMFUL_PROMPTS[:args.prompts]
    harmless = HARMLESS_PROMPTS[:args.prompts]

    banner = "=" * 60
    print(banner)
    print(" OBLITERATUS GPT-OSS 20B BENCHMARK")
    print(banner)
    print(f" Model: {args.model}")
    print(f" Methods: {args.methods}")
    print(f" Prompts: {args.prompts} per side")
    print(f" GPU: {gpu['gpu']} ({gpu['total_gb']} GB total, {gpu['free_gb']} GB free)")
    print(f" Benchmarks: {'skip' if args.skip_benchmarks else 'enabled'}")
    if "optimized" in args.methods:
        print(f" Bayesian: {args.bayesian_trials} trials")
    print(banner)

    all_results = []
    for method in args.methods:
        if method not in METHODS:
            print(f"\nSKIP: unknown method '{method}'")
            continue
        rule = "━" * 60
        print(f"\n{rule}")
        print(f" METHOD: {method}{METHODS[method]['label']}")
        print(f"{rule}")
        all_results.append(
            run_single_method(
                model_name=args.model,
                method=method,
                harmful=harmful,
                harmless=harmless,
                output_dir=args.output_dir,
                run_benchmarks=not args.skip_benchmarks,
                bayesian_trials=args.bayesian_trials,
            )
        )

    # Summary table, then persist the raw results.
    print_summary_table(all_results)
    destination = args.output or f"benchmark_gpt_oss_{int(time.time())}.json"
    with open(destination, "w") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nFull results saved to: {destination}")
# Standard script guard: importing this module (e.g. for tests) does not
# trigger a benchmark run.
if __name__ == "__main__":
    main()