obliteratus

Running

App Files Files Community

pliny-the-prompter committed on Mar 3

Commit

4837177

verified ·

1 Parent(s): ab1b6fe

Upload 129 files

Browse files

Files changed (6) hide show

README.md +3 -3
app.py +12 -11
hf-spaces/README.md +1 -1
obliteratus/abliterate.py +105 -65
obliteratus/cli.py +1 -1
scripts/run_benchmark_remote.sh +2 -2

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: OBLITERATUS
-emoji: "\U0001F513"
 colorFrom: green
 colorTo: gray
 sdk: gradio
@@ -302,9 +302,9 @@ Beyond targeted liberation, OBLITERATUS is a general-purpose ablation suite for
 Each strategy enumerates all possible ablations, applies them one at a time, measures the impact, and restores the model — giving you a complete map of where the chains are anchored vs. where the mind lives.
-## 47 curated models across 5 tiers
-OBLITERATUS ships with presets for 47 models organized by compute requirement:
 | Tier | VRAM | Example models |
 |------|------|---------------|

 ---
 title: OBLITERATUS
+emoji: "⛓️‍💥"
 colorFrom: green
 colorTo: gray
 sdk: gradio
 Each strategy enumerates all possible ablations, applies them one at a time, measures the impact, and restores the model — giving you a complete map of where the chains are anchored vs. where the mind lives.
+## 116 curated models across 5 tiers
+OBLITERATUS ships with presets for 116 models organized by compute requirement:
 | Tier | VRAM | Example models |
 |------|------|---------------|

app.py CHANGED Viewed

@@ -324,13 +324,13 @@ _NEEDS_QUANTIZATION = {
 }
-def _should_quantize(model_id: str) -> str | None:
     """Return '4bit' if the model needs quantization for available GPU, else None."""
     try:
         from obliteratus.models.loader import _estimate_model_memory_gb, _available_gpu_memory_gb
         from transformers import AutoConfig
         token = os.environ.get("HF_TOKEN") or None
-        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=token)
         # Skip if model already ships with native quantization (e.g. Mxfp4Config)
         if getattr(config, "quantization_config", None) is not None:
             return None
@@ -701,7 +701,7 @@ def benchmark(
             if result.status == "running":
                 run_logs.append(f"{stage_key.upper()} — {result.message}")
-        quantization = _should_quantize(model_id)
         def run_pipeline():
             try:
@@ -1044,7 +1044,7 @@ def benchmark_multi_model(
         def on_stage(result):
             pass
-        quantization = _should_quantize(model_id)
         def run_pipeline():
             try:
@@ -1359,9 +1359,10 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         _state["model_name"] = model_choice
         _state["method"] = method
-    global _obliterate_counter
-    _obliterate_counter += 1
-    save_dir = f"/tmp/obliterated_{_obliterate_counter}"
     log_lines = []
     last_yielded = [0]
@@ -1387,7 +1388,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         idx = stage_order.get(stage_key, 0)
         progress((idx + 1) / 6, desc=f"{stage_key.upper()}")
-    quantization = _should_quantize(model_id)
     def run_pipeline():
         try:
@@ -1497,7 +1498,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
     # Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
     _max_pipeline_secs = 45 * 60
     _pipeline_start = time.time()
-    status_msg = f"**Obliterating\u2026** (0s)"
     while worker.is_alive():
         status_msg = f"**Obliterating\u2026** ({_elapsed()})"
         if len(log_lines) > last_yielded[0]:
@@ -2018,8 +2019,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
     else:
         n = min(len(harmful_all), len(harmless_all))
-    quantization = _should_quantize(model_id)
     is_preset = cfg["model_choice"] in MODELS
     pipeline_ref = [None]
     error_ref = [None]
@@ -2319,7 +2320,7 @@ def strength_sweep(model_choice: str, method_choice: str,
         def _run_sweep_point():
             try:
-                quantization = _should_quantize(model_id)
                 pipe = AbliterationPipeline(
                     model_id, method=method_key,
                     output_dir=f"/tmp/sweep_{step_i}",

 }
+def _should_quantize(model_id: str, is_preset: bool = False) -> str | None:
     """Return '4bit' if the model needs quantization for available GPU, else None."""
     try:
         from obliteratus.models.loader import _estimate_model_memory_gb, _available_gpu_memory_gb
         from transformers import AutoConfig
         token = os.environ.get("HF_TOKEN") or None
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=is_preset, token=token)
         # Skip if model already ships with native quantization (e.g. Mxfp4Config)
         if getattr(config, "quantization_config", None) is not None:
             return None
             if result.status == "running":
                 run_logs.append(f"{stage_key.upper()} — {result.message}")
+        quantization = _should_quantize(model_id, is_preset=is_preset)
         def run_pipeline():
             try:
         def on_stage(result):
             pass
+        quantization = _should_quantize(model_id, is_preset=is_preset_model)
         def run_pipeline():
             try:
         _state["model_name"] = model_choice
         _state["method"] = method
+    with _lock:
+        global _obliterate_counter
+        _obliterate_counter += 1
+        save_dir = f"/tmp/obliterated_{_obliterate_counter}"
     log_lines = []
     last_yielded = [0]
         idx = stage_order.get(stage_key, 0)
         progress((idx + 1) / 6, desc=f"{stage_key.upper()}")
+    quantization = _should_quantize(model_id, is_preset=is_preset)
     def run_pipeline():
         try:
     # Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
     _max_pipeline_secs = 45 * 60
     _pipeline_start = time.time()
+    status_msg = "**Obliterating\u2026** (0s)"
     while worker.is_alive():
         status_msg = f"**Obliterating\u2026** ({_elapsed()})"
         if len(log_lines) > last_yielded[0]:
     else:
         n = min(len(harmful_all), len(harmless_all))
     is_preset = cfg["model_choice"] in MODELS
+    quantization = _should_quantize(model_id, is_preset=is_preset)
     pipeline_ref = [None]
     error_ref = [None]
         def _run_sweep_point():
             try:
+                quantization = _should_quantize(model_id, is_preset=is_preset)
                 pipe = AbliterationPipeline(
                     model_id, method=method_key,
                     output_dir=f"/tmp/sweep_{step_i}",

hf-spaces/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: OBLITERATUS
-emoji: "🔓"
 colorFrom: green
 colorTo: gray
 sdk: gradio

 ---
 title: OBLITERATUS
+emoji: "⛓️‍💥"
 colorFrom: green
 colorTo: gray
 sdk: gradio

obliteratus/abliterate.py CHANGED Viewed

@@ -949,8 +949,14 @@ class AbliterationPipeline:
             self.log(f"  Router profiling complete: {n_profiled} MoE layers profiled")
         for idx in range(n_layers):
-            self._harmful_means[idx] = torch.stack(self._harmful_acts[idx]).mean(dim=0)
-            self._harmless_means[idx] = torch.stack(self._harmless_acts[idx]).mean(dim=0)
         # ── Jailbreak-contrastive probing ─────────────────────────────────
         if self.use_jailbreak_contrast:
@@ -1008,18 +1014,31 @@ class AbliterationPipeline:
         n = len(prompts)
         self.log(f"  Wrapping {n} prompts with chat template")
         wrapped = []
-        for i, prompt in enumerate(prompts):
-            messages = [{"role": "user", "content": prompt}]
             try:
                 text = tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
                 )
                 wrapped.append(text)
             except Exception:
-                wrapped.append(prompt)  # fallback to raw if individual prompt fails
-            if (i + 1) % 100 == 0 or (i + 1) == n:
-                self.log(f"    chat template {i + 1}/{n}")
         return wrapped
     @staticmethod
@@ -1426,7 +1445,7 @@ class AbliterationPipeline:
                         if n_dirs > 1:
                             harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                             harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
-                            diff_matrix = harmful_stack - harmless_stack
                             if torch.isfinite(diff_matrix).all():
                                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
                                 _, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
@@ -1475,7 +1494,7 @@ class AbliterationPipeline:
                 # SVD-based multi-direction extraction (Gabliteration)
                 harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)  # (n_prompts, hidden)
                 harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
-                diff_matrix = harmful_stack - harmless_stack  # (n_prompts, hidden_dim)
                 # SVD to extract principal refusal directions
                 if not torch.isfinite(diff_matrix).all():
@@ -3046,16 +3065,21 @@ class AbliterationPipeline:
                     # remove components that lie in both subspaces (violating
                     # the GRRO's independent-αᵢ assumption; see theory journal
                     # §12.6 "SAE-SVD Orthogonalization").
-                    for si in range(sae_dirs.shape[0]):
-                        for di in range(subspace.shape[0]):
-                            svd_d = subspace[di].to(sae_dirs.device)
-                            overlap = sae_dirs[si] @ svd_d
-                            sae_dirs[si] -= overlap * svd_d
-                        sae_norm = sae_dirs[si].norm()
-                        if sae_norm > 1e-8:
-                            sae_dirs[si] /= sae_norm
-                        # else: SAE direction was entirely within SVD subspace,
-                        # will be skipped by the norm check below.
                     sae_count = 0
                     # SAE regularization: for inversion modes, use a much
                     # gentler floor (0.6 = 40% removal) since these are
@@ -3063,39 +3087,52 @@ class AbliterationPipeline:
                     # projection which already uses full reflection.
                     sae_reg_floor = 0.6 if self.invert_refusal else 0.3
                     sae_reg = max(layer_reg, sae_reg_floor) if not self.invert_refusal else sae_reg_floor
-                    for si in range(sae_dirs.shape[0]):
                         # Skip SAE directions that collapsed to near-zero
                         # after orthogonalization (fully redundant with SVD)
-                        if sae_dirs[si].norm() < 1e-6:
                             continue
-                        sd = sae_dirs[si].to(device).unsqueeze(-1)
-                        try:
-                            attn = get_attention_module(layers[idx], arch)
-                            sae_count += self._project_out_advanced(
-                                attn, sd, _ATTN_OUT_NAMES,
-                                norm_preserve=self.norm_preserve,
-                                regularization=sae_reg,
-                            )
-                        except (AttributeError, RuntimeError):
-                            pass
-                        try:
-                            ffn = get_ffn_module(layers[idx], arch)
-                            fc = self._project_out_advanced(
-                                ffn, sd, _FFN_OUT_NAMES,
-                                norm_preserve=self.norm_preserve,
-                                regularization=sae_reg,
-                            )
-                            if fc == 0:
-                                fc = self._project_moe_experts(
-                                    ffn, sd,
                                     norm_preserve=self.norm_preserve,
                                     regularization=sae_reg,
-                                    project_biases=False,
                                 )
-                            sae_count += fc
-                        except (AttributeError, RuntimeError):
-                            pass
                         del sd
                     total_sae_projections += sae_count
                     count += sae_count
@@ -3156,23 +3193,26 @@ class AbliterationPipeline:
             model = self.handle.model
             if last_strong in self.refusal_subspaces:
                 subspace = self.refusal_subspaces[last_strong]
-                for dir_idx in range(subspace.shape[0]):
-                    direction = subspace[dir_idx]
-                    lm_device = self._get_model_device(model)
-                    d = direction.to(lm_device).unsqueeze(-1)
-                    # Try common lm_head attribute names
-                    for head_name in ["lm_head", "embed_out", "output"]:
-                        head = getattr(model, head_name, None)
-                        if head is not None and hasattr(head, "weight"):
-                            # Inversion: reflect lm_head to flip refusal token logits
-                            lm_reg = (1.0 - self.reflection_strength) if self.invert_refusal else 0.0
-                            lm_head_count += self._project_out_advanced(
-                                model, d, [head_name],
-                                norm_preserve=self.norm_preserve,
-                                regularization=lm_reg,
-                            )
-                            break
-                    del d
         if lm_head_count > 0:
             total_modified += lm_head_count
             self.log(f"  lm_head: {lm_head_count} projections")
@@ -3339,7 +3379,7 @@ class AbliterationPipeline:
                         if n_dirs > 1:
                             harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                             harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
-                            diff_matrix = harmful_stack - harmless_stack
                             if torch.isfinite(diff_matrix).all():
                                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
                                 _, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
@@ -3374,7 +3414,7 @@ class AbliterationPipeline:
             else:
                 harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                 harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
-                diff_matrix = harmful_stack - harmless_stack
                 if not torch.isfinite(diff_matrix).all():
                     diff_matrix = torch.nan_to_num(diff_matrix, nan=0.0, posinf=0.0, neginf=0.0)
                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])

             self.log(f"  Router profiling complete: {n_profiled} MoE layers profiled")
         for idx in range(n_layers):
+            if self._harmful_acts[idx] and self._harmless_acts[idx]:
+                self._harmful_means[idx] = torch.stack(self._harmful_acts[idx]).mean(dim=0)
+                self._harmless_means[idx] = torch.stack(self._harmless_acts[idx]).mean(dim=0)
+            else:
+                # Layer produced no activations (hook failure or skipped layer)
+                hidden = self._harmful_acts[0][0].shape[-1] if self._harmful_acts.get(0) else 768
+                self._harmful_means[idx] = torch.zeros(1, hidden)
+                self._harmless_means[idx] = torch.zeros(1, hidden)
         # ── Jailbreak-contrastive probing ─────────────────────────────────
         if self.use_jailbreak_contrast:
         n = len(prompts)
         self.log(f"  Wrapping {n} prompts with chat template")
+        # Try batch application first (single call, much faster for large sets)
+        all_conversations = [[{"role": "user", "content": p}] for p in prompts]
+        try:
+            wrapped = [
+                tokenizer.apply_chat_template(
+                    conv, tokenize=False, add_generation_prompt=True
+                )
+                for conv in all_conversations
+            ]
+            self.log(f"    chat template {n}/{n}")
+            return wrapped
+        except Exception:
+            pass  # Fall through to per-prompt with error handling
         wrapped = []
+        for i, conv in enumerate(all_conversations):
             try:
                 text = tokenizer.apply_chat_template(
+                    conv, tokenize=False, add_generation_prompt=True
                 )
                 wrapped.append(text)
             except Exception:
+                wrapped.append(prompts[i])  # fallback to raw if individual prompt fails
+        self.log(f"    chat template {n}/{n}")
         return wrapped
     @staticmethod
                         if n_dirs > 1:
                             harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                             harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
+                            diff_matrix = (harmful_stack - harmless_stack).float()
                             if torch.isfinite(diff_matrix).all():
                                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
                                 _, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
                 # SVD-based multi-direction extraction (Gabliteration)
                 harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)  # (n_prompts, hidden)
                 harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
+                diff_matrix = (harmful_stack - harmless_stack).float()  # float32 for SVD stability
                 # SVD to extract principal refusal directions
                 if not torch.isfinite(diff_matrix).all():
                     # remove components that lie in both subspaces (violating
                     # the GRRO's independent-αᵢ assumption; see theory journal
                     # §12.6 "SAE-SVD Orthogonalization").
+                    # Batch orthogonalization: project out SVD subspace from all
+                    # SAE directions at once (replaces O(n_sae * n_svd) loop).
+                    svd_sub = subspace.to(sae_dirs.device)  # (n_svd, hidden_dim)
+                    overlaps = sae_dirs @ svd_sub.T  # (n_sae, n_svd)
+                    sae_dirs -= overlaps @ svd_sub  # project out SVD subspace
+                    # Zero collapsed directions BEFORE normalizing to avoid
+                    # amplifying floating-point noise in near-zero directions.
+                    sae_norms = sae_dirs.norm(dim=-1, keepdim=True)
+                    collapsed_mask = (sae_norms.squeeze(-1) < 1e-8)
+                    if collapsed_mask.any():
+                        sae_dirs[collapsed_mask] = 0.0
+                    # Re-normalize surviving directions only
+                    surviving = ~collapsed_mask
+                    if surviving.any():
+                        sae_dirs[surviving] = sae_dirs[surviving] / sae_norms[surviving].clamp(min=1e-12)
                     sae_count = 0
                     # SAE regularization: for inversion modes, use a much
                     # gentler floor (0.6 = 40% removal) since these are
                     # projection which already uses full reflection.
                     sae_reg_floor = 0.6 if self.invert_refusal else 0.3
                     sae_reg = max(layer_reg, sae_reg_floor) if not self.invert_refusal else sae_reg_floor
+                    # Cache module lookups and pre-transfer SAE directions
+                    sae_attn = None
+                    sae_ffn = None
+                    try:
+                        sae_attn = get_attention_module(layers[idx], arch)
+                    except (AttributeError, RuntimeError):
+                        pass
+                    try:
+                        sae_ffn = get_ffn_module(layers[idx], arch)
+                    except (AttributeError, RuntimeError):
+                        pass
+                    sae_dirs_on_device = sae_dirs.to(device)
+                    for si in range(sae_dirs_on_device.shape[0]):
                         # Skip SAE directions that collapsed to near-zero
                         # after orthogonalization (fully redundant with SVD)
+                        if sae_dirs_on_device[si].norm() < 1e-6:
                             continue
+                        sd = sae_dirs_on_device[si].unsqueeze(-1)
+                        if sae_attn is not None:
+                            try:
+                                sae_count += self._project_out_advanced(
+                                    sae_attn, sd, _ATTN_OUT_NAMES,
+                                    norm_preserve=self.norm_preserve,
+                                    regularization=sae_reg,
+                                )
+                            except (AttributeError, RuntimeError):
+                                pass
+                        if sae_ffn is not None:
+                            try:
+                                fc = self._project_out_advanced(
+                                    sae_ffn, sd, _FFN_OUT_NAMES,
                                     norm_preserve=self.norm_preserve,
                                     regularization=sae_reg,
                                 )
+                                if fc == 0:
+                                    fc = self._project_moe_experts(
+                                        sae_ffn, sd,
+                                        norm_preserve=self.norm_preserve,
+                                        regularization=sae_reg,
+                                        project_biases=False,
+                                    )
+                                sae_count += fc
+                            except (AttributeError, RuntimeError):
+                                pass
                         del sd
+                    del sae_dirs_on_device
                     total_sae_projections += sae_count
                     count += sae_count
             model = self.handle.model
             if last_strong in self.refusal_subspaces:
                 subspace = self.refusal_subspaces[last_strong]
+                lm_device = self._get_model_device(model)
+                # Pre-transfer subspace and resolve lm_head module once
+                subspace_on_device = subspace.to(lm_device)
+                lm_head_name = None
+                for head_name in ["lm_head", "embed_out", "output"]:
+                    head = getattr(model, head_name, None)
+                    if head is not None and hasattr(head, "weight"):
+                        lm_head_name = head_name
+                        break
+                if lm_head_name is not None:
+                    lm_reg = (1.0 - self.reflection_strength) if self.invert_refusal else 0.0
+                    for dir_idx in range(subspace_on_device.shape[0]):
+                        d = subspace_on_device[dir_idx].unsqueeze(-1)
+                        lm_head_count += self._project_out_advanced(
+                            model, d, [lm_head_name],
+                            norm_preserve=self.norm_preserve,
+                            regularization=lm_reg,
+                        )
+                        del d
+                del subspace_on_device
         if lm_head_count > 0:
             total_modified += lm_head_count
             self.log(f"  lm_head: {lm_head_count} projections")
                         if n_dirs > 1:
                             harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                             harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
+                            diff_matrix = (harmful_stack - harmless_stack).float()
                             if torch.isfinite(diff_matrix).all():
                                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
                                 _, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
             else:
                 harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
                 harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
+                diff_matrix = (harmful_stack - harmless_stack).float()  # float32 for SVD stability
                 if not torch.isfinite(diff_matrix).all():
                     diff_matrix = torch.nan_to_num(diff_matrix, nan=0.0, posinf=0.0, neginf=0.0)
                 k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])

obliteratus/cli.py CHANGED Viewed

@@ -43,7 +43,7 @@ def main(argv: list[str] | None = None):
     )
     # --- models ---
-    models_parser = subparsers.add_parser("models", help="Browse 48 curated models by compute tier")
     models_parser.add_argument(
         "--tier",
         type=str,

     )
     # --- models ---
+    models_parser = subparsers.add_parser("models", help="Browse curated models by compute tier")
     models_parser.add_argument(
         "--tier",
         type=str,

scripts/run_benchmark_remote.sh CHANGED Viewed

@@ -92,8 +92,8 @@ os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
 import torch
 import torch.nn as nn
-# Add app dir to path (HF Space layout)
-sys.path.insert(0, "/home/user/app")
 # ── Hotpatch: fix device detection for accelerate device_map="auto" ──────
 # The deployed Space code uses next(model.parameters()).device which is

 import torch
 import torch.nn as nn
+# Add app dir to path (HF Space layout: /home/user/app)
+sys.path.insert(0, os.environ.get("APP_DIR", "/home/user/app"))
 # ── Hotpatch: fix device detection for accelerate device_map="auto" ──────
 # The deployed Space code uses next(model.parameters()).device which is