obliteratus / scripts /run_benchmark_remote.sh
pliny-the-prompter's picture
Upload 129 files
4837177 verified
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# OBLITERATUS Remote Benchmark Runner
#
# One-command benchmark on your HuggingFace Space GPU.
#
# Usage:
# ./scripts/run_benchmark_remote.sh # defaults: Qwen 0.5B, all methods
# ./scripts/run_benchmark_remote.sh --model Qwen/Qwen2.5-1.5B-Instruct
# ./scripts/run_benchmark_remote.sh --model openai/gpt-oss-20b
# ./scripts/run_benchmark_remote.sh --models "Qwen/Qwen2.5-0.5B-Instruct openai/gpt-oss-20b"
# ./scripts/run_benchmark_remote.sh --methods "basic advanced surgical"
# ./scripts/run_benchmark_remote.sh --prompts 33 # use 33/66/99 prompts per side
# ./scripts/run_benchmark_remote.sh --dry-run # print the command, don't execute
# ./scripts/run_benchmark_remote.sh --verbose # show SSH debug output
# ─────────────────────────────────────────────────────────────────────────────
set -o errexit -o nounset -o pipefail
# ── Defaults ─────────────────────────────────────────────────────────────────
# Every OBLITERATUS_* environment variable, when set and non-empty, takes
# precedence over the built-in value; command-line flags override both.

# Connection settings.
SSH_KEY="${OBLITERATUS_SSH_KEY:-${HOME}/.ssh/hf_obliteratus}"
SSH_HOST="${OBLITERATUS_SSH_HOST:-}"

# Benchmark selection. MODELS starts empty and is filled from --models, or
# from MODEL once argument parsing is done.
MODEL="${OBLITERATUS_MODEL:-Qwen/Qwen2.5-0.5B-Instruct}"
MODELS=""
METHODS="${OBLITERATUS_METHODS:-basic advanced aggressive surgical inverted nuclear}"
PROMPTS="${OBLITERATUS_PROMPTS:-33}"

# Behaviour switches; they hold the literal command names `true` / `false`.
VERBOSE=false
DRY_RUN=false
# ── Parse args ───────────────────────────────────────────────────────────────
#######################################
# Abort with a readable message when an option that takes a value is the
# last word on the command line (otherwise `set -u` would report a cryptic
# "$2: unbound variable").
# Arguments: the remaining command-line words, option first.
# Outputs:   error message on stderr.
# Returns:   0 if a value follows the option; exits 1 otherwise.
#######################################
_require_value() {
  if [[ $# -lt 2 ]]; then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse command-line flags into the configuration globals.
# Globals:   MODEL, MODELS, METHODS, PROMPTS, SSH_KEY, SSH_HOST,
#            DRY_RUN, VERBOSE (written)
# Arguments: the script's command-line words.
# Outputs:   usage text on stdout for -h/--help; errors on stderr.
# Returns:   0 on success; exits 0 after --help, 1 on a bad argument.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      # --model resets MODELS so that the most recent of --model/--models wins.
      --model)   _require_value "$@"; MODEL="$2"; MODELS=""; shift 2 ;;
      --models)  _require_value "$@"; MODELS="$2"; shift 2 ;;
      --methods) _require_value "$@"; METHODS="$2"; shift 2 ;;
      --prompts) _require_value "$@"; PROMPTS="$2"; shift 2 ;;
      --key)     _require_value "$@"; SSH_KEY="$2"; shift 2 ;;
      --host)    _require_value "$@"; SSH_HOST="$2"; shift 2 ;;
      --dry-run) DRY_RUN=true; shift ;;
      --verbose|-v) VERBOSE=true; shift ;;
      -h|--help)
        # Prints a fixed line window of this file's header comment; the
        # numbers must be kept in sync with the header if it is edited.
        head -15 "$0" | tail -11
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        exit 1
        ;;
    esac
  done

  # If --models not set, use single --model
  if [[ -z "$MODELS" ]]; then
    MODELS="$MODEL"
  fi
}

parse_args "$@"
# ── Validate SSH host ──────────────────────────────────────────────────────
# Refuse to continue without a target host. Diagnostics go to stderr, like
# the other error paths in this script, so stdout stays clean for callers
# that capture it.
if [[ -z "$SSH_HOST" ]]; then
  {
    echo "ERROR: SSH_HOST not configured."
    echo ""
    echo "Set your HF Space SSH host:"
    echo " 1. export OBLITERATUS_SSH_HOST=your-username-spacename@ssh.hf.space"
    echo " 2. Or pass --host your-username-spacename@ssh.hf.space"
  } >&2
  exit 1
fi
# ── Validate SSH key ────────────────────────────────────────────────────────
# The key must exist as a regular file before ssh is attempted.
if [[ ! -f "$SSH_KEY" ]]; then
  {
    echo "ERROR: SSH key not found at $SSH_KEY"
    echo ""
    echo "Either:"
    echo " 1. Place your HF Space SSH key at ~/.ssh/hf_obliteratus"
    echo " 2. Set OBLITERATUS_SSH_KEY=/path/to/key"
    echo " 3. Pass --key /path/to/key"
  } >&2
  exit 1
fi
#######################################
# Print the run summary box.
# The box-drawing glyphs are generated/spelled as real UTF-8 characters so
# the frame renders as a frame instead of mis-decoded byte sequences.
# Globals:   SSH_HOST, MODELS, METHODS, PROMPTS, SSH_KEY (read)
# Arguments: none
# Outputs:   the banner followed by one blank line, on stdout.
#######################################
print_banner() {
  local rule pad
  # 62 columns of horizontal rule between the corner glyphs.
  printf -v rule '%62s' ''
  rule=${rule// /═}
  # The title is 34 characters wide; 14 + 34 + 14 = 62 centres it.
  printf -v pad '%14s' ''

  printf '╔%s╗\n' "$rule"
  printf '║%sOBLITERATUS — Remote GPU Benchmark%s║\n' "$pad" "$pad"
  printf '╠%s╣\n' "$rule"
  # Labels are ASCII, so %-9s pads them to a common column.
  printf '║  %-9s%s\n' "Host:" "$SSH_HOST"
  printf '║  %-9s%s\n' "Models:" "$MODELS"
  printf '║  %-9s%s\n' "Methods:" "$METHODS"
  printf '║  %-9s%s per side\n' "Prompts:" "$PROMPTS"
  printf '║  %-9s%s\n' "SSH key:" "$SSH_KEY"
  printf '╚%s╝\n' "$rule"
  printf '\n'
}

print_banner
# ── Build the Python benchmark script to run remotely ────────────────────────
# `read -d ''` reads up to a NUL byte, so it swallows the entire heredoc into
# REMOTE_SCRIPT and then returns non-zero at end of input; `|| true` keeps
# that from aborting the script under `set -e`. The quoted 'PYEOF' delimiter
# makes the body literal: nothing inside it is expanded by the shell.
read -r -d '' REMOTE_SCRIPT << 'PYEOF' || true
import json, sys, time, shutil, gc, os
# These two variables are set before `import torch` below; setdefault() leaves
# any value already present in the remote environment untouched.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
import torch
import torch.nn as nn
# Add app dir to path (HF Space layout: /home/user/app)
# The APP_DIR environment variable overrides that default location.
sys.path.insert(0, os.environ.get("APP_DIR", "/home/user/app"))
# ── Hotpatch: fix device detection for accelerate device_map="auto" ──────
# The deployed Space code uses next(model.parameters()).device which is
# unreliable when accelerate distributes params across devices.
import obliteratus.abliterate as _abl


@staticmethod
def _get_model_device(model):
    """Return the device that input tensors should be moved to.

    When the model carries an ``hf_device_map`` attribute, the device of the
    input-embedding parameters is used.  If the embeddings cannot be
    obtained or have no parameters, the first parameter that is not on the
    "meta" device is used instead, and CPU is the last resort.

    Without ``hf_device_map`` the device of the first model parameter is
    returned; a model with no parameters at all yields CPU rather than
    leaking a ``StopIteration`` into the caller.
    """
    if hasattr(model, "hf_device_map"):
        try:
            embed = model.get_input_embeddings()
            return next(embed.parameters()).device
        except (StopIteration, AttributeError):
            for p in model.parameters():
                if p.device.type != "meta":
                    return p.device
            return torch.device("cpu")
    # Same CPU fallback as the branch above when there is nothing to inspect.
    first_param = next(model.parameters(), None)
    if first_param is None:
        return torch.device("cpu")
    return first_param.device


# Installed as a staticmethod so it can be called as self._get_model_device(model).
_abl.AbliterationPipeline._get_model_device = _get_model_device
# Patch _collect_activations to use the fixed device detection
_orig_collect = _abl.AbliterationPipeline._collect_activations.__code__
import types


def _patched_collect(self, layer_modules, prompts, label):
    """Run each prompt through the model and record last-token activations.

    A forward hook on every entry of ``layer_modules`` takes the module
    output (its first element when the output is a tuple), keeps only the
    final sequence position and stores it as a float CPU tensor.  Inputs are
    moved to the device reported by ``self._get_model_device``.  All hooks
    are removed again even if a forward pass raises.

    Returns a dict mapping layer index to the list of captured tensors, one
    entry per forward pass.
    """
    n_layers = len(layer_modules)
    captured = {layer_idx: [] for layer_idx in range(n_layers)}

    def _recorder(layer_idx):
        def _on_forward(module, input, output):
            hidden = output[0] if isinstance(output, tuple) else output
            captured[layer_idx].append(hidden[:, -1, :].detach().cpu().float())
        return _on_forward

    handles = [
        layer_modules[layer_idx].register_forward_hook(_recorder(layer_idx))
        for layer_idx in range(n_layers)
    ]

    model = self.handle.model
    tokenizer = self.handle.tokenizer

    # Shrink the tokenizer truncation length when little GPU memory is free
    # (free memory is summed over all visible CUDA devices).
    max_length = 256
    if torch.cuda.is_available():
        bytes_per_gib = 1024 ** 3
        free_gb = sum(
            torch.cuda.mem_get_info(dev)[0] / bytes_per_gib
            for dev in range(torch.cuda.device_count())
        )
        if free_gb < 2.0:
            max_length = 64
            self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
        elif free_gb < 4.0:
            max_length = 128
            self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")

    device = self._get_model_device(model)
    try:
        for i, prompt in enumerate(prompts):
            self.log(f" [{label}] prompt {i + 1}/{len(prompts)}")
            encoded = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
            with torch.no_grad():
                model(**on_device)
            # Drop the input tensors before asking the pipeline to free memory.
            del encoded, on_device
            self._free_gpu_memory()
    finally:
        for handle in handles:
            handle.remove()
    return captured


_abl.AbliterationPipeline._collect_activations = _patched_collect
print("[hotpatch] Device detection fix applied")
# ── End hotpatch ─────────────────────────────────────────────────────────
# ── Hotpatch: nuclear mode tuning ─────────────────────────────────────────
# The deployed Space code has stale nuclear defaults. Override them here
# so the benchmark exercises the latest tuning without redeploying.
import math as _math

# 1. Updated method configs (read at __init__ time)
_abl.METHODS["nuclear"].update({
    "n_directions": 4,
    "reflection_strength": 1.25,
    "embed_regularization": 0.50,
    "steering_strength": 0.15,
    "safety_neuron_masking": False,
})
_abl.METHODS["inverted"]["safety_neuron_masking"] = False

# 2. Cap layers for inversion modes (40% of total) — post-distill
_orig_distill = _abl.AbliterationPipeline._distill_refusal_subspace


def _patched_distill(self):
    """Run the original distillation step, then trim its results.

    Only acts when ``self.invert_refusal`` is set and strong layers were
    found: the strong-layer list is cut to at most 40% of the model's layers
    (never fewer than 3), and each entry of ``self._sae_directions`` is cut
    to its first 4 rows (reflection strength below 2.0) or 6 rows otherwise.

    Whatever the original method returns is passed through unchanged.
    """
    result = _orig_distill(self)
    if self.invert_refusal and self._strong_layers:
        try:
            n_total = len(_abl.get_layer_modules(self.handle))
        except Exception:
            # Best-effort: fall back to a nominal layer count if the layer
            # lookup fails for this model.
            n_total = 24
        max_layers = max(3, int(n_total * 0.40))
        if len(self._strong_layers) > max_layers:
            old_count = len(self._strong_layers)
            self._strong_layers = self._strong_layers[:max_layers]
            self.log(f" [hotpatch] Capped {old_count} -> {max_layers} layers for inversion (40% of {n_total})")
        # Truncate SAE directions: 4 features for nuclear, 6 for inverted
        n_sae = 4 if self.reflection_strength < 2.0 else 6
        for idx in list(self._sae_directions.keys()):
            dirs = self._sae_directions[idx]
            if dirs.shape[0] > n_sae:
                self._sae_directions[idx] = dirs[:n_sae]
        if self._sae_directions:
            self.log(f" [hotpatch] SAE features capped to {n_sae} per layer")
    return result


_abl.AbliterationPipeline._distill_refusal_subspace = _patched_distill
# Plain string, not a %-format: a single "%" is what should be printed.
print("[hotpatch] Nuclear tuning: 4 dirs, 1.25x reflect, no neuron mask, 40% layer cap, 4 SAE features")
# ── End nuclear hotpatch ──────────────────────────────────────────────────
from obliteratus.abliterate import AbliterationPipeline, METHODS, HARMFUL_PROMPTS, HARMLESS_PROMPTS

# Run parameters are handed over by the launching shell as BENCH_* variables:
# whitespace-separated lists for models and methods, an integer for prompts.
MODELS_LIST = os.environ["BENCH_MODELS"].split()
METHODS_LIST = os.environ["BENCH_METHODS"].split()
N_PROMPTS = int(os.environ["BENCH_PROMPTS"])

# Header describing this run.
_RULE = "=" * 60
print("\n" + _RULE)
print("OBLITERATUS BENCHMARK")
print(_RULE)
print(f"Models: {MODELS_LIST}")
print(f"Methods: {METHODS_LIST}")
print(f"Prompts: {N_PROMPTS} per side")
if not torch.cuda.is_available():
    print("GPU: NONE (CPU only)")
else:
    # Only CUDA device 0 is reported here.
    gpu = torch.cuda.get_device_name(0)
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    free = torch.cuda.mem_get_info(0)[0] / 1e9
    print(f"GPU: {gpu} ({total:.1f} GB total, {free:.1f} GB free)")
print(_RULE + "\n")

# The first N_PROMPTS entries of each built-in prompt list are used.
harmful = HARMFUL_PROMPTS[:N_PROMPTS]
harmless = HARMLESS_PROMPTS[:N_PROMPTS]
# Results of every (model, method) run, in execution order.
all_results = []

for model_name in MODELS_LIST:
    print(f"\n{'═'*60}")
    print(f"MODEL: {model_name}")
    print(f"{'═'*60}")

    model_results = []
    for method in METHODS_LIST:
        if method not in METHODS:
            print(f"SKIP unknown method: {method}")
            continue

        print(f"\n{'─'*60}")
        print(f"METHOD: {method} — {METHODS[method]['label']}")
        print(f"{'─'*60}")

        # Clean slate
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        outdir = f"/tmp/obliteratus_bench_{method}"
        t0 = time.time()
        pipeline = None
        try:
            pipeline = AbliterationPipeline(
                model_name=model_name,
                output_dir=outdir,
                device="auto",
                dtype="float16",
                method=method,
                harmful_prompts=harmful,
                harmless_prompts=harmless,
                on_log=lambda msg: print(f" {msg}"),
            )
            # The returned path is not needed: outdir is deleted right after.
            pipeline.run()
            elapsed = time.time() - t0

            r = {
                "model": model_name,
                "method": method,
                "label": METHODS[method]["label"],
                "time_seconds": round(elapsed, 1),
                "quality": pipeline._quality_metrics,
                "strong_layers": pipeline._strong_layers,
                "n_strong_layers": len(pipeline._strong_layers),
                "n_directions": pipeline.n_directions,
            }
            if torch.cuda.is_available():
                r["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1e6, 1)
            model_results.append(r)

            print(f"\n ✓ {method} complete in {elapsed:.1f}s")
            print(f" Quality: {json.dumps(pipeline._quality_metrics, default=str)}")
        except Exception as e:
            # A failing method is recorded and reported; the benchmark then
            # carries on with the next method.
            elapsed = time.time() - t0
            model_results.append({
                "model": model_name,
                "method": method,
                "label": METHODS.get(method, {}).get("label", method),
                "time_seconds": round(elapsed, 1),
                "error": str(e),
            })
            print(f"\n ✗ {method} FAILED after {elapsed:.1f}s: {e}")
            import traceback
            traceback.print_exc()

        # Cleanup saved model to free disk
        shutil.rmtree(outdir, ignore_errors=True)

        # Force cleanup between runs
        if pipeline is not None:
            del pipeline
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    all_results.extend(model_results)

    # Summary table for this model
    print(f"\n{'='*60}")
    print(f"RESULTS: {model_name}")
    print(f"{'Method':<12} {'Time':>8} {'PPL':>10} {'Coher':>8} {'Refusal':>8} {'GPU MB':>8}")
    print(f"{'─'*12} {'─'*8} {'─'*10} {'─'*8} {'─'*8} {'─'*8}")
    for r in model_results:
        if "error" in r:
            print(f"{r['method']:<12} {r['time_seconds']:>7.1f}s {'FAILED':>10}")
            continue
        q = r.get("quality", {})
        ppl = q.get("perplexity")
        coh = q.get("coherence")
        ref = q.get("refusal_rate")
        gpu = r.get("peak_gpu_mb")
        # Missing metrics are shown as N/A instead of failing the format.
        ppl_str = f"{ppl:.2f}" if ppl is not None else "N/A"
        coh_str = f"{coh:.0%}" if coh is not None else "N/A"
        ref_str = f"{ref:.0%}" if ref is not None else "N/A"
        print(f"{r['method']:<12} {r['time_seconds']:>7.1f}s "
              f"{ppl_str:>10} "
              f"{coh_str:>8} "
              f"{ref_str:>8} "
              f"{gpu if gpu is not None else 'N/A':>8}")
    print(f"{'='*60}")

# Final JSON dump
print(f"\n\n{'='*60}")
print("ALL BENCHMARK RESULTS (JSON)")
print(f"{'='*60}")
print("```json")
print(json.dumps(all_results, indent=2, default=str))
print("```")
PYEOF
# ── SSH options ──────────────────────────────────────────────────────────────
# Start with the identity file, then append one `-o key=value` pair per
# setting. Host-key checking is switched off and no known_hosts file is kept;
# the keep-alive settings probe the server every 60 s, up to 10 times.
SSH_OPTS=(-i "$SSH_KEY")
for _ssh_opt in \
  StrictHostKeyChecking=no \
  UserKnownHostsFile=/dev/null \
  ConnectTimeout=30 \
  ServerAliveInterval=60 \
  ServerAliveCountMax=10; do
  SSH_OPTS+=(-o "$_ssh_opt")
done
unset _ssh_opt

# VERBOSE holds the command name `true` or `false`, so it is run directly.
if $VERBOSE; then
  SSH_OPTS=("${SSH_OPTS[@]}" -v)
fi
# ── Pre-flight: verify SSH connectivity ─────────────────────────────────────
echo "Checking SSH connectivity..."
# ssh's stderr is captured in a private temp file (created by mktemp rather
# than a fixed, guessable /tmp name) and only shown if the connection fails.
SSH_DEBUG_LOG=$(mktemp "${TMPDIR:-/tmp}/obliteratus_ssh_debug.XXXXXX")
if ! ssh "${SSH_OPTS[@]}" "$SSH_HOST" "echo 'SSH_OK'" 2>"$SSH_DEBUG_LOG"; then
  # The whole failure report goes to stderr.
  {
    echo ""
    echo "ERROR: SSH connection failed!"
    echo ""
    echo "Debug output:"
    cat "$SSH_DEBUG_LOG"
    echo ""
    echo "Troubleshooting checklist:"
    echo " 1. Is Dev Mode enabled on your HF Space?"
    echo " → Check your Space's Settings tab (Dev Mode must be ON)"
    echo " 2. Is the Space awake (not sleeping/building)?"
    echo " → Visit the Space URL and wait for the UI to load"
    echo " 3. Is your SSH public key added to your HF profile?"
    echo " → https://huggingface.co/settings/keys"
    echo " → Run: cat ${SSH_KEY}.pub"
    echo " 4. Are key permissions correct?"
    echo " → Run: chmod 600 $SSH_KEY"
    echo " 5. Try manually:"
    echo " → ssh -v -i $SSH_KEY $SSH_HOST echo hello"
    echo ""
  } >&2
  rm -f -- "$SSH_DEBUG_LOG"
  exit 1
fi
rm -f -- "$SSH_DEBUG_LOG"
echo "SSH connection verified ✓"
echo ""
# ── Build SSH command ────────────────────────────────────────────────────────
# Write the Python script to a temp file and pipe it, instead of passing
# via -c (avoids command-line length limits and shell escaping issues).
# Remove the temporary copy of the Python script.
cleanup_remote_script() {
  rm -f -- "$REMOTE_SCRIPT_FILE"
}

# The template ends in the X's: not every mktemp implementation substitutes
# them when a suffix such as ".py" follows. The file is only ever fed to
# python3 on stdin, so its name does not need an extension.
REMOTE_SCRIPT_FILE=$(mktemp "${TMPDIR:-/tmp}/obliteratus_bench_XXXXXX")
printf '%s\n' "$REMOTE_SCRIPT" > "$REMOTE_SCRIPT_FILE"
# A function-based trap avoids embedding the path in a quoted command string.
trap cleanup_remote_script EXIT

if [[ "$DRY_RUN" == true ]]; then
  # Keep the file: the message below tells the user where to find it, so the
  # EXIT cleanup must not delete it on the way out.
  trap - EXIT
  echo "[DRY RUN] Would execute:"
  echo " cat script.py | ssh ${SSH_OPTS[*]} $SSH_HOST 'BENCH_MODELS=... python3 -u'"
  echo ""
  echo "Script saved to: $REMOTE_SCRIPT_FILE"
  exit 0
fi
# Sanitize inputs: reject values containing shell metacharacters to prevent
# command injection on the remote host. Because quotes are rejected, wrapping
# each value in single quotes in the remote command line below is safe.
for _var_name in MODELS METHODS PROMPTS; do
  _val="${!_var_name}"
  if [[ "$_val" =~ [\'\"\;\&\|\`\$\(\)\{\}\<\>\\] ]]; then
    echo "ERROR: ${_var_name} contains unsafe characters: $_val" >&2
    exit 1
  fi
done

# The remote script converts BENCH_PROMPTS with int(); catch a bad value here
# instead of after the connection has been made.
if ! [[ "$PROMPTS" =~ ^[0-9]+$ ]]; then
  echo "ERROR: PROMPTS must be a non-negative integer, got: $PROMPTS" >&2
  exit 1
fi

echo "Running benchmark on Space..."
echo ""

# The Python script is fed to the remote `python3 -u -` on stdin; the run
# parameters travel as environment assignments in the remote command line.
# This is the last command, so its exit status becomes the script's.
ssh "${SSH_OPTS[@]}" "$SSH_HOST" \
  "BENCH_MODELS='$MODELS' BENCH_METHODS='$METHODS' BENCH_PROMPTS='$PROMPTS' python3 -u -" \
  < "$REMOTE_SCRIPT_FILE"