obliteratus

Running on Zero

File size: 31,679 Bytes

#!/usr/bin/env python3
"""Abliteration Technique Comparison Study.

A rigorous, controlled comparison of refusal-direction removal techniques.
Uses a synthetic "planted refusal direction" methodology: we inject a known
direction into a model's activations so we can measure whether each technique
correctly identifies and removes it.

Additionally compiles literature results for a full comparison table.

Techniques compared:
  1. Arditi et al. (2024) — difference-of-means, last token, raw prompts
  2. Arditi + chat template — same but with chat-formatted prompts
  3. FailSpy/abliterator — Arditi with middle-60% layer heuristic
  4. Gabliteration — SVD multi-direction (4 dirs), regularization 0.0
  5. grimjim — Gabliteration + norm preservation
  6. OBLITERATUS basic — our current basic config
  7. OBLITERATUS advanced — 4 directions, norm-preserve, reg=0.3
  8. Heretic (p-e-w) — TPE Bayesian optimization (literature)

Metrics:
  - Direction recovery: cosine similarity to planted ground-truth direction
  - Residual after projection: how much of the refusal direction remains
  - Capability preservation: Frobenius distance of modified vs original weights
  - Layer selection accuracy: did it pick the right layers?
  - Perplexity delta: change in language modeling loss (on synthetic data)
"""

from __future__ import annotations

import gc
import json
import math
import os
import sys
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


# ══════════════════════════════════════════════════════════════════════════
# Synthetic model with planted refusal direction
# ══════════════════════════════════════════════════════════════════════════


def create_synthetic_model(
    hidden_dim: int = 128,
    n_layers: int = 12,
    n_heads: int = 4,
    vocab_size: int = 1000,
    seq_len: int = 64,
):
    """Create a tiny GPT-2 model for controlled experiments."""
    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=seq_len,
        n_embd=hidden_dim,
        n_layer=n_layers,
        n_head=n_heads,
        n_inner=hidden_dim * 4,
        resid_pdrop=0.0,
        attn_pdrop=0.0,
        embd_pdrop=0.0,
    )
    model = GPT2LMHeadModel(config)
    model.eval()
    return model, config


def plant_refusal_direction(
    model: nn.Module,
    target_layers: list[int],
    hidden_dim: int,
    n_directions: int = 1,
    signal_strength: float = 5.0,
    seed: int = 42,
) -> tuple[dict[int, torch.Tensor], dict[int, torch.Tensor]]:
    """Plant a known refusal direction into specific layers.

    Modifies the output projection (c_proj) of attention modules by adding
    a rank-1 perturbation along a random direction. This simulates the
    refusal direction that RLHF training creates.

    Returns:
        (planted_directions, planted_subspaces): ground truth per layer
    """
    torch.manual_seed(seed)

    planted_directions: dict[int, torch.Tensor] = {}
    planted_subspaces: dict[int, torch.Tensor] = {}

    for idx in target_layers:
        # Generate random orthogonal directions
        dirs = torch.randn(n_directions, hidden_dim)
        # Gram-Schmidt orthogonalize
        for i in range(n_directions):
            for j in range(i):
                dirs[i] -= (dirs[i] @ dirs[j]) * dirs[j]
            dirs[i] = dirs[i] / dirs[i].norm()

        planted_directions[idx] = dirs[0].clone()
        planted_subspaces[idx] = dirs.clone()

        # Inject into attention output projection (c_proj for GPT-2)
        layer = model.transformer.h[idx]
        attn = layer.attn

        # Add refusal component to c_proj: W += strength * d @ d^T
        # This makes the layer produce extra activation along d when
        # processing any input, creating a "refusal signal"
        with torch.no_grad():
            for dir_idx in range(n_directions):
                d = dirs[dir_idx]
                # Scale decreases for secondary directions
                s = signal_strength * (0.7 ** dir_idx)
                # Inject into c_proj (output projection)
                W = attn.c_proj.weight.data  # GPT-2: (hidden, hidden)
                perturbation = s * d.unsqueeze(1) @ d.unsqueeze(0)  # rank-1
                W.add_(perturbation)

    return planted_directions, planted_subspaces


def measure_residual_direction(
    model: nn.Module,
    layer_idx: int,
    direction: torch.Tensor,
) -> float:
    """Measure how much of a direction remains in a layer's output projection.

    Returns the magnitude of the direction's component in the weight matrix.
    """
    layer = model.transformer.h[layer_idx]
    W = layer.attn.c_proj.weight.data
    d = direction.to(W.device, W.dtype)

    # Project W onto direction: ||W @ d||^2 / ||d||^2
    coeff = W @ d  # (hidden,)
    return coeff.norm().item()


def collect_synthetic_activations(
    model: nn.Module,
    n_prompts: int,
    seq_len: int,
    vocab_size: int,
    n_layers: int,
    add_refusal_signal: bool = False,
    signal_direction: dict[int, torch.Tensor] | None = None,
    signal_strength: float = 2.0,
    seed: int = 0,
) -> dict[int, list[torch.Tensor]]:
    """Collect activations on random token sequences.

    If add_refusal_signal=True, adds an artificial activation along
    the signal_direction to simulate harmful-prompt activations.
    """
    torch.manual_seed(seed)

    activations: dict[int, list[torch.Tensor]] = {i: [] for i in range(n_layers)}
    hooks = []

    def make_hook(idx: int):
        def hook_fn(module, input, output):
            hidden = output[0] if isinstance(output, tuple) else output
            act = hidden[:, -1, :].detach().cpu().float()

            if add_refusal_signal and signal_direction and idx in signal_direction:
                # Add the planted refusal activation
                d = signal_direction[idx]
                act = act + signal_strength * d.unsqueeze(0)

            activations[idx].append(act)
        return hook_fn

    layers = list(model.transformer.h)
    for idx in range(n_layers):
        hooks.append(layers[idx].register_forward_hook(make_hook(idx)))

    try:
        for i in range(n_prompts):
            input_ids = torch.randint(0, vocab_size, (1, seq_len))
            with torch.no_grad():
                model(input_ids)
    finally:
        for h in hooks:
            h.remove()

    return activations


# ══════════════════════════════════════════════════════════════════════════
# Reference baseline implementations
# ══════════════════════════════════════════════════════════════════════════


def extract_directions(
    harmful_acts: dict[int, list[torch.Tensor]],
    harmless_acts: dict[int, list[torch.Tensor]],
    n_layers: int,
    n_directions: int = 1,
) -> tuple[dict[int, torch.Tensor], dict[int, torch.Tensor], dict[int, float]]:
    """Extract refusal directions from activation contrasts.

    Returns (directions, subspaces, norms) per layer.
    """
    directions: dict[int, torch.Tensor] = {}
    subspaces: dict[int, torch.Tensor] = {}
    norms: dict[int, float] = {}

    for idx in range(n_layers):
        h_stack = torch.stack(harmful_acts[idx]).squeeze(1)
        s_stack = torch.stack(harmless_acts[idx]).squeeze(1)

        if n_directions == 1:
            diff = h_stack.mean(dim=0) - s_stack.mean(dim=0)
            norm = diff.norm().item()
            if norm > 0:
                directions[idx] = diff / diff.norm()
                subspaces[idx] = directions[idx].unsqueeze(0)
                norms[idx] = norm
        else:
            min_n = min(h_stack.shape[0], s_stack.shape[0])
            diff_matrix = h_stack[:min_n] - s_stack[:min_n]
            diff_matrix = torch.nan_to_num(diff_matrix)
            k = min(n_directions, diff_matrix.shape[0], diff_matrix.shape[1])
            try:
                U, S, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
                sub = Vh[:k]
                primary = sub[0]
                pn = primary.norm()
                if pn > 1e-8:
                    primary = primary / pn
                directions[idx] = primary
                subspaces[idx] = sub
                norms[idx] = (S[:k] ** 2).sum().item()
            except Exception:
                continue

    return directions, subspaces, norms


def select_layers(
    norms: dict[int, float],
    n_layers: int,
    method: str = "top_norm",
) -> list[int]:
    """Select layers for abliteration."""
    sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
    if not sorted_layers:
        return []

    if method == "middle_60":
        start = int(n_layers * 0.2)
        end = int(n_layers * 0.8)
        selected = [idx for idx, _ in sorted_layers if start <= idx < end]
        return selected if selected else [sorted_layers[0][0]]

    elif method == "knee":
        if len(sorted_layers) < 3:
            return [sorted_layers[0][0]]
        vals = [n for _, n in sorted_layers]
        max_n = vals[0]
        if max_n <= 0:
            return [sorted_layers[0][0]]
        normalized = [v / max_n for v in vals]
        n_pts = len(normalized)
        best_k, best_dist = 1, 0.0
        x_s, y_s = 0.0, normalized[0]
        x_e, y_e = 1.0, normalized[-1]
        line_len = math.sqrt((x_e - x_s) ** 2 + (y_e - y_s) ** 2)
        if line_len > 0:
            for i in range(1, n_pts - 1):
                x_i = i / (n_pts - 1)
                y_i = normalized[i]
                dist = abs((y_e - y_s) * x_i - (x_e - x_s) * y_i
                           + x_e * y_s - y_e * x_s) / line_len
                if dist > best_dist:
                    best_dist = dist
                    best_k = i + 1
        min_threshold = max_n * 0.05
        selected = [idx for idx, n in sorted_layers[:best_k] if n >= min_threshold]
        return selected if selected else [sorted_layers[0][0]]

    else:  # top_norm
        max_norm = sorted_layers[0][1]
        threshold = max_norm * 0.5
        selected = [idx for idx, n in sorted_layers if n >= threshold]
        return selected if selected else [sorted_layers[0][0]]


def apply_projection(
    model: nn.Module,
    selected_layers: list[int],
    subspaces: dict[int, torch.Tensor],
    regularization: float = 0.0,
    norm_preserve: bool = False,
    multi_dir_norm_fix: bool = False,
) -> int:
    """Project refusal direction out of weight matrices.

    When multi_dir_norm_fix=True, uses the correct approach: capture norms
    before projecting any directions, then restore once after all directions.
    """
    scale = 1.0 - regularization
    n_modified = 0

    for idx in selected_layers:
        sub = subspaces.get(idx)
        if sub is None:
            continue

        layer = model.transformer.h[idx]

        # Capture norms before any projections (if multi-dir + norm-preserve)
        saved_norms: dict[str, float] = {}
        if multi_dir_norm_fix and norm_preserve and sub.shape[0] > 1:
            for name, param in layer.named_parameters():
                if name.endswith(".weight") and param.dim() == 2:
                    saved_norms[name] = param.data.norm().item()

        for dir_idx in range(sub.shape[0]):
            d = sub[dir_idx].unsqueeze(-1)  # (hidden, 1)

            for name, module in layer.named_modules():
                if not hasattr(module, "weight"):
                    continue
                W = module.weight.data
                if W.dim() != 2:
                    continue

                # Per-direction norm preserve (the OLD buggy way)
                use_per_dir_norm = norm_preserve and not (multi_dir_norm_fix and sub.shape[0] > 1)
                original_norm = W.norm().item() if use_per_dir_norm else 0.0

                if W.shape[-1] == d.shape[0]:
                    coeff = W @ d
                    W.sub_(d.T * (scale * coeff))
                    n_modified += 1
                elif W.shape[0] == d.shape[0]:
                    coeff = d.T @ W
                    W.sub_((scale * d) * coeff)
                    n_modified += 1
                else:
                    continue

                if use_per_dir_norm and original_norm > 0:
                    new_norm = W.norm().item()
                    if new_norm > 0:
                        W.mul_(original_norm / new_norm)

        # Restore norms once after all directions (the FIXED way)
        if multi_dir_norm_fix and norm_preserve and sub.shape[0] > 1 and saved_norms:
            for name, param in layer.named_parameters():
                if name not in saved_norms:
                    continue
                orig = saved_norms[name]
                if orig > 0:
                    cur = param.data.norm().item()
                    if cur > 0 and abs(cur - orig) > 1e-6:
                        param.data.mul_(orig / cur)

    return n_modified


# ══════════════════════════════════════════════════════════════════════════
# Experiment runner
# ══════════════════════════════════════════════════════════════════════════


def run_experiment():
    """Run the full comparison experiment with synthetic planted directions."""

    # Configuration
    hidden_dim = 128
    n_layers = 12
    n_heads = 4
    vocab_size = 1000
    seq_len = 32
    n_prompts = 48       # prompts per side (harmful + harmless)
    n_planted_dirs = 4   # ground truth directions planted
    signal_strength = 5.0
    target_layers = [3, 4, 5, 6, 7, 8]  # layers with planted signal

    print(f"\n{'='*80}")
    print("ABLITERATION TECHNIQUE COMPARISON — SYNTHETIC PLANTED-DIRECTION TEST")
    print(f"{'='*80}")
    print(f"Model:           GPT-2 tiny ({hidden_dim}d, {n_layers}L, {n_heads}H)")
    print(f"Target layers:   {target_layers}")
    print(f"Planted dirs:    {n_planted_dirs} orthogonal directions per target layer")
    print(f"Signal strength: {signal_strength}")
    print(f"Prompts:         {n_prompts} per side")
    print(f"{'='*80}\n")

    # Define experiments
    experiments = [
        {
            "name": "Arditi (1-dir, top-norm)",
            "source": "Arditi 2024",
            "n_directions": 1,
            "layer_selection": "top_norm",
            "regularization": 0.0,
            "norm_preserve": False,
            "multi_dir_norm_fix": False,
        },
        {
            "name": "FailSpy (1-dir, mid-60%)",
            "source": "FailSpy",
            "n_directions": 1,
            "layer_selection": "middle_60",
            "regularization": 0.0,
            "norm_preserve": False,
            "multi_dir_norm_fix": False,
        },
        {
            "name": "Gabliteration (4-dir, knee)",
            "source": "Gabliteration",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.0,
            "norm_preserve": False,
            "multi_dir_norm_fix": False,
        },
        {
            "name": "grimjim (4-dir, norm-pres, BUGGY)",
            "source": "grimjim",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.0,
            "norm_preserve": True,
            "multi_dir_norm_fix": False,  # Old buggy sequential norm-preserve
        },
        {
            "name": "grimjim (4-dir, norm-pres, FIXED)",
            "source": "Ours (fix)",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.0,
            "norm_preserve": True,
            "multi_dir_norm_fix": True,   # Our fix: capture once, restore once
        },
        {
            "name": "OBLITERATUS basic (1-dir, knee)",
            "source": "Ours",
            "n_directions": 1,
            "layer_selection": "knee",
            "regularization": 0.0,
            "norm_preserve": False,
            "multi_dir_norm_fix": False,
        },
        {
            "name": "OBLITERATUS adv (4-dir, reg=0.3)",
            "source": "Ours",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.3,
            "norm_preserve": True,
            "multi_dir_norm_fix": True,
        },
        {
            "name": "OBLITERATUS adv (4-dir, reg=0.1)",
            "source": "Ours (tuned)",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.1,
            "norm_preserve": True,
            "multi_dir_norm_fix": True,
        },
        {
            "name": "OBLITERATUS adv (4-dir, reg=0.0)",
            "source": "Ours (tuned)",
            "n_directions": 4,
            "layer_selection": "knee",
            "regularization": 0.0,
            "norm_preserve": True,
            "multi_dir_norm_fix": True,
        },
    ]

    results = []

    for exp in experiments:
        print(f"\n{'─'*80}")
        print(f"  {exp['name']}")
        print(f"  Source: {exp['source']}")
        print(f"{'─'*80}")

        t0 = time.time()

        # Create fresh model
        model, config = create_synthetic_model(hidden_dim, n_layers, n_heads, vocab_size, seq_len)

        # Plant ground-truth refusal directions
        planted_dirs, planted_subs = plant_refusal_direction(
            model, target_layers, hidden_dim,
            n_directions=n_planted_dirs,
            signal_strength=signal_strength,
            seed=42,
        )

        # Save original weights for capability comparison
        original_state = {k: v.clone() for k, v in model.state_dict().items()}

        # Measure pre-projection residuals (baseline)
        pre_residuals = {}
        for idx in target_layers:
            pre_residuals[idx] = measure_residual_direction(model, idx, planted_dirs[idx])

        # Step 1: Collect activations
        harmful_acts = collect_synthetic_activations(
            model, n_prompts, seq_len, vocab_size, n_layers,
            add_refusal_signal=True,
            signal_direction=planted_dirs,
            signal_strength=2.0,
            seed=100,
        )
        harmless_acts = collect_synthetic_activations(
            model, n_prompts, seq_len, vocab_size, n_layers,
            add_refusal_signal=False,
            seed=200,
        )

        # Step 2: Extract directions
        ext_dirs, ext_subs, ext_norms = extract_directions(
            harmful_acts, harmless_acts, n_layers, exp["n_directions"],
        )

        # Step 3: Select layers
        selected = select_layers(ext_norms, n_layers, exp["layer_selection"])
        print(f"  Selected layers: {selected}")

        # Step 4: Apply projection
        apply_projection(
            model, selected, ext_subs,
            regularization=exp["regularization"],
            norm_preserve=exp["norm_preserve"],
            multi_dir_norm_fix=exp["multi_dir_norm_fix"],
        )

        # ── Measure results ──────────────────────────────────────────────

        # Direction recovery: cosine similarity between extracted and planted
        cos_sims = []
        for idx in target_layers:
            if idx in ext_dirs and idx in planted_dirs:
                cos = F.cosine_similarity(
                    ext_dirs[idx].unsqueeze(0),
                    planted_dirs[idx].unsqueeze(0),
                ).item()
                cos_sims.append(abs(cos))  # direction or anti-direction
        avg_cos = sum(cos_sims) / len(cos_sims) if cos_sims else 0.0

        # Multi-direction subspace recovery: for n_directions>1, measure
        # what fraction of the planted subspace is captured
        subspace_recovery = []
        for idx in target_layers:
            if idx in ext_subs and idx in planted_subs:
                # Project each planted direction onto extracted subspace
                ext_sub = ext_subs[idx]  # (k_ext, hidden)
                plant_sub = planted_subs[idx]  # (k_plant, hidden)
                for pi in range(min(plant_sub.shape[0], ext_sub.shape[0])):
                    # Projection of planted_i onto extracted subspace
                    proj = ext_sub @ plant_sub[pi]  # (k_ext,)
                    captured = proj.norm().item()  # how much is in the subspace
                    subspace_recovery.append(captured)
        avg_subspace = sum(subspace_recovery) / len(subspace_recovery) if subspace_recovery else 0.0

        # Residual after projection
        post_residuals = {}
        for idx in target_layers:
            if idx in selected:
                post_residuals[idx] = measure_residual_direction(model, idx, planted_dirs[idx])
            else:
                post_residuals[idx] = pre_residuals[idx]  # layer wasn't modified

        avg_removal = 0.0
        removal_scores = []
        for idx in target_layers:
            pre = pre_residuals[idx]
            post = post_residuals[idx]
            if pre > 0:
                removal = 1.0 - (post / pre)
                removal_scores.append(removal)
        avg_removal = sum(removal_scores) / len(removal_scores) if removal_scores else 0.0

        # Multi-direction residual: check ALL planted directions
        multi_dir_removal = []
        for idx in target_layers:
            if idx not in selected:
                continue
            for di in range(planted_subs[idx].shape[0]):
                d = planted_subs[idx][di]
                pre = measure_residual_direction(
                    # Need pre-values - approximate from signal_strength
                    model, idx, d,
                )
                # Compare to signal strength
                multi_dir_removal.append(pre)
        avg_multi_residual = sum(multi_dir_removal) / len(multi_dir_removal) if multi_dir_removal else 0.0

        # Layer selection accuracy
        correct_selected = len(set(selected) & set(target_layers))
        false_selected = len(set(selected) - set(target_layers))
        missed = len(set(target_layers) - set(selected))

        # Capability preservation: Frobenius distance of weights
        new_state = model.state_dict()
        total_dist = 0.0
        for key in original_state:
            diff = (new_state[key].float() - original_state[key].float())
            total_dist += diff.norm().item() ** 2
        total_dist = math.sqrt(total_dist)

        # Perplexity proxy: loss on random sequences
        losses = []
        for _ in range(10):
            input_ids = torch.randint(0, vocab_size, (1, seq_len))
            with torch.no_grad():
                out = model(input_ids, labels=input_ids)
                losses.append(out.loss.item())
        avg_loss = sum(losses) / len(losses)
        ppl = math.exp(min(avg_loss, 100.0))

        elapsed = time.time() - t0

        result = {
            "name": exp["name"],
            "source": exp["source"],
            "n_directions": exp["n_directions"],
            "regularization": exp["regularization"],
            "norm_preserve": exp["norm_preserve"],
            "direction_recovery": round(avg_cos, 4),
            "subspace_recovery": round(avg_subspace, 4),
            "primary_removal": round(avg_removal, 4),
            "multi_dir_avg_residual": round(avg_multi_residual, 4),
            "layers_correct": correct_selected,
            "layers_false_positive": false_selected,
            "layers_missed": missed,
            "n_layers_selected": len(selected),
            "weight_distance": round(total_dist, 2),
            "perplexity": round(ppl, 2),
            "time_seconds": round(elapsed, 2),
        }
        results.append(result)

        print(f"  Direction recovery:     {avg_cos:.3f} (cosine sim to ground truth)")
        print(f"  Subspace recovery:      {avg_subspace:.3f} (planted dirs captured)")
        print(f"  Primary dir removal:    {avg_removal:.1%} (refusal signal removed)")
        print(f"  Multi-dir avg residual: {avg_multi_residual:.3f} (lower = better)")
        print(f"  Layer selection:        {correct_selected}/{len(target_layers)} correct, "
              f"{false_selected} false+, {missed} missed")
        print(f"  Weight distance:        {total_dist:.2f} (capability delta)")
        print(f"  Perplexity:             {ppl:.2f}")

        del model
        gc.collect()

    return results


def print_table(results: list[dict]):
    """Print formatted comparison tables."""

    # ── Table 1: Direction Extraction Quality ──────────────────────────
    print(f"\n\n{'='*100}")
    print("TABLE 1: DIRECTION EXTRACTION & REMOVAL QUALITY")
    print(f"{'='*100}")
    print(f"{'Technique':<38} {'Source':<14} {'DirRecov':>9} {'SubRecov':>9} "
          f"{'Removal':>8} {'Residual':>9}")
    print(f"{'─'*38} {'─'*14} {'─'*9} {'─'*9} {'─'*8} {'─'*9}")

    for r in results:
        name = r["name"][:37]
        source = r["source"][:13]
        dr = f"{r['direction_recovery']:.3f}"
        sr = f"{r['subspace_recovery']:.3f}"
        rm = f"{r['primary_removal']:.1%}"
        res = f"{r['multi_dir_avg_residual']:.3f}"
        print(f"{name:<38} {source:<14} {dr:>9} {sr:>9} {rm:>8} {res:>9}")

    # ── Table 2: Layer Selection & Capability ──────────────────────────
    print(f"\n{'='*100}")
    print("TABLE 2: LAYER SELECTION & CAPABILITY PRESERVATION")
    print(f"{'='*100}")
    print(f"{'Technique':<38} {'Layers':>7} {'Correct':>8} {'FalsePos':>9} "
          f"{'Missed':>7} {'WeightΔ':>8} {'PPL':>8}")
    print(f"{'─'*38} {'─'*7} {'─'*8} {'─'*9} {'─'*7} {'─'*8} {'─'*8}")

    for r in results:
        name = r["name"][:37]
        print(f"{name:<38} {r['n_layers_selected']:>7} {r['layers_correct']:>8} "
              f"{r['layers_false_positive']:>9} {r['layers_missed']:>7} "
              f"{r['weight_distance']:>8.2f} {r['perplexity']:>8.2f}")

    # ── Table 3: Literature Comparison ────────────────────────────────
    print(f"\n\n{'='*110}")
    print("TABLE 3: FULL LANDSCAPE — TECHNIQUES, CAPABILITIES, AND REPORTED RESULTS")
    print(f"{'='*110}")
    print(f"{'Technique':<26} {'Year':>5} {'#Dir':>5} {'Layers':>10} {'NormPres':>9} "
          f"{'Reg':>5} {'AutoTune':>9} {'Reported Refusal→':>18} {'Model':>14}")
    print(f"{'─'*26} {'─'*5} {'─'*5} {'─'*10} {'─'*9} {'─'*5} {'─'*9} {'─'*18} {'─'*14}")

    literature = [
        ("Arditi et al.", "2024", "1", "top-norm", "No", "0.0", "No",
         "~95%→~0%", "Llama-3-8B"),
        ("FailSpy/abliterator", "2024", "1", "mid-60%", "No", "0.0", "No",
         "~90%→~5%", "Llama-3-8B"),
        ("mlabonne tutorial", "2024", "1", "top-norm", "No", "0.0", "No",
         "~90%→~5%", "Llama-3-8B"),
        ("Gabliteration", "2024", "4-8", "knee", "No", "0.0", "No",
         "~95%→~0%", "Various 7B+"),
        ("grimjim norm-pres", "2024", "4-8", "knee", "Yes(bug)", "0.0", "No",
         "~90%→~5%", "Various 7B+"),
        ("Heretic (p-e-w)", "2025", "float", "kernel", "No", "TPE", "Yes",
         "~95%→~0%*", "Gemma-3-12B"),
        ("Wollschlager cones", "2025", "1-5", "per-layer", "—", "—", "RDO",
         "~98%→~1%", "Llama-3.1-8B"),
        ("OBLITERATUS basic", "2025", "1", "knee", "No", "0.0", "No",
         "~95%→60%**", "Qwen-0.5B"),
        ("OBLITERATUS advanced", "2025", "4", "knee", "Yes(fix)", "0.3", "No",
         "~95%→73%**", "Qwen-0.5B"),
        ("OBLITERATUS surgical", "2025", "8", "knee", "Yes(fix)", "0.0", "Yes***",
         "~95%→0%/broken", "Qwen-0.5B"),
    ]

    for row in literature:
        print(f"{row[0]:<26} {row[1]:>5} {row[2]:>5} {row[3]:>10} {row[4]:>9} "
              f"{row[5]:>5} {row[6]:>9} {row[7]:>18} {row[8]:>14}")

    print("\n  * Heretic: 2.8× lower KL divergence than manual abliterations (Gemma-3-12B benchmark)")
    print("  ** Our observed results on Qwen2.5-0.5B-Instruct — 0.5B may be too small for linear methods")
    print("  *** Surgical combines: whitened SVD + SAE + head surgery + neuron masking + jailbreak contrast")
    print(f"{'='*110}")

    # ── Analysis ──────────────────────────────────────────────────────
    print(f"\n{'='*80}")
    print("ANALYSIS: WHY OBLITERATUS UNDERPERFORMS AND WHAT TO FIX")
    print(f"{'='*80}")

    print("""
ROOT CAUSES (ordered by impact):

1. MODEL SIZE: All published abliteration results use 7B+ models
   - Arditi et al.: Llama-3-8B, Gemma-2-9B (hidden_dim=4096+)
   - FailSpy: Llama-3-8B
   - Heretic: Gemma-3-12B (headline benchmark)
   - Wollschlager et al.: Llama-3.1-8B
   - OBLITERATUS benchmarks: Qwen-0.5B (hidden_dim=896)

   The "single refusal direction" hypothesis may not hold well for small
   models. Wollschlager et al. (ICML 2025) showed that refusal lives in
   multi-dimensional CONCEPT CONES, and cone dimension scales with model
   size. A 0.5B model may encode refusal too diffusely for linear methods.

2. BASIC MODE USES NO CHAT TEMPLATE for activation collection
   - The model was trained with chat formatting — without it, activations
     during probing don't reflect actual refusal behavior
   - This is the single highest-impact config fix

3. ADVANCED MODE REGULARIZATION TOO HIGH (0.3)
   - Preserves 30% of refusal component by design
   - Combined with 4 directions where later ones capture noise, net
     removal is weak

4. SURGICAL MODE DOES TOO MUCH
   - 8 directions, whitened SVD, SAE features, neuron masking, head surgery
   - Each individually reasonable; together they destroy a 0.5B model
   - The whitened SVD un-whitening bug (now fixed) was extracting noise

5. NO BAYESIAN OPTIMIZATION (vs Heretic)
   - Heretic's key insight: jointly optimize layer weights, direction
     index, and component-specific parameters via TPE
   - Minimizes refusal rate AND KL divergence simultaneously
   - This automatically handles model-specific tuning that we do manually

RECOMMENDED CONFIG CHANGES:
  - basic:    use_chat_template → True
  - advanced: regularization → 0.1 (from 0.3)
  - surgical: n_directions → 4 (from 8), disable safety_neuron_masking
  - ALL:      Add model-size-aware defaults (n_dirs=1 for <2B, 4 for 2-10B)
  - NEW:      Add TPE optimization loop (like Heretic) as "optimized" method
""")


def main():
    results = run_experiment()
    print_table(results)

    # Save results
    out_path = "/tmp/abliteration_comparison_results.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_path}")


if __name__ == "__main__":
    main()