pliny-the-prompter committed on
Commit
f0084ba
·
verified ·
1 Parent(s): d0e05ab

Upload 135 files

Browse files
app.py CHANGED
@@ -98,6 +98,51 @@ def _is_quota_error(exc: BaseException) -> bool:
98
  return True
99
  return False
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  # ---------------------------------------------------------------------------
102
  # Global state
103
  # ---------------------------------------------------------------------------
@@ -164,7 +209,7 @@ def _recover_sessions_from_disk() -> None:
164
  """
165
  global _last_obliterated_label, _obliterate_counter
166
  found_any = False
167
- for pattern in ("obliterated_*", "obliterated", "bench_*"):
168
  for p in Path("/tmp").glob(pattern):
169
  if not p.is_dir():
170
  continue
@@ -291,6 +336,11 @@ METHODS = {
291
  "optimized (bayesian auto-tuned)": "optimized",
292
  "inverted (semantic refusal inversion)": "inverted",
293
  "nuclear (maximum force combo)": "nuclear",
 
 
 
 
 
294
  }
295
 
296
  # ── Community Hub push ────────────────────────────────────────────────
@@ -342,6 +392,17 @@ def _get_preset_defaults(method_display: str):
342
  "spectral_cascade": cfg.get("spectral_cascade", False),
343
  "spectral_bands": cfg.get("spectral_bands", 3),
344
  "spectral_threshold": cfg.get("spectral_threshold", 0.05),
 
 
 
 
 
 
 
 
 
 
 
345
  }
346
 
347
  def _on_method_change(method_display: str):
@@ -376,6 +437,16 @@ def _on_method_change(method_display: str):
376
  d["expert_transplant"],
377
  d["use_wasserstein_optimal"],
378
  d["spectral_cascade"],
 
 
 
 
 
 
 
 
 
 
379
  )
380
 
381
  def _on_dataset_change(dataset_label: str):
@@ -1751,6 +1822,12 @@ def obliterate(model_choice: str, method_choice: str,
1751
  adv_project_embeddings: bool, adv_activation_steering: bool,
1752
  adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
1753
  adv_spectral_cascade: bool,
 
 
 
 
 
 
1754
  progress=gr.Progress()):
1755
  """Run the full obliteration pipeline, streaming log updates to the UI.
1756
 
@@ -1936,6 +2013,15 @@ def obliterate(model_choice: str, method_choice: str,
1936
  spectral_bands=int(adv_spectral_bands),
1937
  spectral_threshold=float(adv_spectral_threshold),
1938
  verify_sample_size=int(adv_verify_sample_size),
 
 
 
 
 
 
 
 
 
1939
  )
1940
  pipeline_ref[0] = pipeline
1941
  pipeline.run()
@@ -2107,10 +2193,9 @@ def obliterate(model_choice: str, method_choice: str,
2107
  bnb_4bit_quant_type="nf4",
2108
  llm_int8_enable_fp32_cpu_offload=True,
2109
  )
2110
- model_reloaded = AutoModelForCausalLM.from_pretrained(
2111
  save_dir,
2112
  quantization_config=bnb_cfg,
2113
- device_map="auto",
2114
  trust_remote_code=True,
2115
  )
2116
  tokenizer_reloaded = AutoTokenizer.from_pretrained(
@@ -2148,9 +2233,8 @@ def obliterate(model_choice: str, method_choice: str,
2148
  yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
2149
  try:
2150
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
2151
- model_reloaded = AutoModelForCausalLM.from_pretrained(
2152
  save_dir,
2153
- device_map="auto",
2154
  offload_folder=offload_dir,
2155
  torch_dtype=torch.float16,
2156
  trust_remote_code=True,
@@ -2311,8 +2395,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
2311
  if checkpoint and Path(checkpoint).exists():
2312
  try:
2313
  is_preset = (_state.get("model_name") or "") in MODELS
2314
- model = AutoModelForCausalLM.from_pretrained(
2315
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2316
  trust_remote_code=is_preset,
2317
  )
2318
  tokenizer = AutoTokenizer.from_pretrained(
@@ -2502,8 +2586,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2502
  if checkpoint and Path(checkpoint).exists():
2503
  is_preset = (_state.get("model_name") or "") in MODELS
2504
  try:
2505
- model_loaded = AutoModelForCausalLM.from_pretrained(
2506
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2507
  trust_remote_code=is_preset,
2508
  )
2509
  tokenizer_loaded = AutoTokenizer.from_pretrained(
@@ -2563,9 +2647,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2563
 
2564
  is_preset = cfg["model_choice"] in MODELS
2565
  try:
2566
- model_loaded = AutoModelForCausalLM.from_pretrained(
2567
  checkpoint_dir,
2568
- device_map="auto",
2569
  torch_dtype=torch.float16,
2570
  trust_remote_code=is_preset,
2571
  )
@@ -2599,10 +2682,9 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2599
  )
2600
  yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
2601
  progress(0.5, desc="Loading 4-bit...")
2602
- model_loaded = AutoModelForCausalLM.from_pretrained(
2603
  checkpoint_dir,
2604
  quantization_config=bnb_cfg,
2605
- device_map="auto",
2606
  trust_remote_code=is_preset,
2607
  )
2608
  tokenizer_loaded = AutoTokenizer.from_pretrained(
@@ -2744,8 +2826,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2744
  if checkpoint and Path(checkpoint).exists():
2745
  try:
2746
  is_preset = (model_name or "") in MODELS
2747
- abliterated_model = AutoModelForCausalLM.from_pretrained(
2748
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2749
  trust_remote_code=is_preset,
2750
  )
2751
  tokenizer = AutoTokenizer.from_pretrained(
@@ -2870,10 +2952,9 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2870
  is_preset = model_name in MODELS
2871
  original_response = ""
2872
  try:
2873
- from transformers import AutoModelForCausalLM as AMCLM
2874
- original_model = AMCLM.from_pretrained(
2875
  model_id, torch_dtype=torch.float16,
2876
- device_map="auto", trust_remote_code=is_preset,
2877
  low_cpu_mem_usage=True,
2878
  token=os.environ.get("HF_TOKEN") or None,
2879
  )
@@ -3184,8 +3265,8 @@ def _tourney_gpu_wrapper(fn, *args, **kwargs):
3184
  return _tourney_gpu_run(fn, *args, **kwargs)
3185
 
3186
 
3187
- def run_tourney(model_choice, dataset, quantization):
3188
- """Run an elimination tournament across all abliteration methods.
3189
 
3190
  Each individual method is run inside its own ``@spaces.GPU`` allocation
3191
  (up to 5 minutes per method) so the full tournament is not constrained
@@ -3198,6 +3279,10 @@ def run_tourney(model_choice, dataset, quantization):
3198
  yield "**Error:** Select a model first.", "", ""
3199
  return
3200
 
 
 
 
 
3201
  from obliteratus.tourney import (
3202
  TourneyRunner, render_bracket_html,
3203
  _load_checkpoint, _checkpoint_matches,
@@ -3229,6 +3314,7 @@ def run_tourney(model_choice, dataset, quantization):
3229
  hub_repo=None,
3230
  dataset_key=dataset_key,
3231
  quantization=quant,
 
3232
  on_log=logger,
3233
  resume=resume,
3234
  )
@@ -3333,18 +3419,27 @@ def run_tourney(model_choice, dataset, quantization):
3333
  _ts = datetime.now().strftime("%H:%M")
3334
  _short = model_id.split("/")[-1] if "/" in model_id else model_id
3335
  _label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
 
 
 
 
 
 
 
 
 
 
 
3336
  with _lock:
3337
- _session_models[_label] = {
3338
- "model_id": model_id,
3339
- "model_choice": model_choice,
3340
- "method": winner.method,
3341
- "dataset_key": dataset_key,
3342
- "prompt_volume": 0,
3343
- "output_dir": winner.output_dir,
3344
- "source": "tourney",
3345
- "tourney_score": winner.score,
3346
- "tourney_metrics": winner.metrics,
3347
- }
3348
  yield (
3349
  f"**Champion: `{winner.method}`** "
3350
  f"(score: {winner.score:.4f})\n"
@@ -4013,6 +4108,47 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
4013
  with gr.Row():
4014
  adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
4015
  info="DCT frequency decomposition for precision refusal targeting")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4016
 
4017
  # List of all advanced controls (order must match _on_method_change return)
4018
  _adv_controls = [
@@ -4029,6 +4165,12 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
4029
  adv_project_embeddings, adv_activation_steering,
4030
  adv_expert_transplant, adv_wasserstein_optimal,
4031
  adv_spectral_cascade,
 
 
 
 
 
 
4032
  ]
4033
 
4034
  obliterate_btn = gr.Button(
@@ -4199,7 +4341,8 @@ result = client.predict(
4199
  mm_method = gr.Dropdown(
4200
  choices=["basic", "advanced", "aggressive",
4201
  "spectral_cascade", "informed", "surgical",
4202
- "optimized", "inverted", "nuclear"],
 
4203
  value="surgical",
4204
  label="Abliteration Method",
4205
  )
@@ -4568,11 +4711,11 @@ tradeoff point where refusal is minimized with minimal capability damage.
4568
 
4569
  # ── Tab 6: Tourney ────────────────────────────────────────────────
4570
  with gr.Tab("Tourney", id="tourney"):
4571
- gr.Markdown("""### March Madness Tournament
4572
- Pit **all abliteration methods** against each other in elimination rounds.
4573
  The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
4574
 
4575
- **Round 1 — Qualifiers:** All methods, reduced prompts. Bottom half eliminated.
4576
  **Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
4577
  **Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
4578
  """)
@@ -4584,6 +4727,14 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
4584
  allow_custom_value=True,
4585
  )
4586
 
 
 
 
 
 
 
 
 
4587
  with gr.Accordion("Advanced Settings", open=False):
4588
  with gr.Row():
4589
  tourney_dataset_dd = gr.Dropdown(
@@ -4613,9 +4764,16 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
4613
 
4614
  tourney_btn.click(
4615
  fn=run_tourney,
4616
- inputs=[tourney_model_dd,
4617
  tourney_dataset_dd, tourney_quant_dd],
4618
  outputs=[tourney_status, tourney_bracket, tourney_log],
 
 
 
 
 
 
 
4619
  )
4620
 
4621
  # ── Tab 7: Export ─────────────────────────────────────────────────
 
98
  return True
99
  return False
100
 
101
+
102
+ def _load_model_to_device(
103
+ pretrained_path: str,
104
+ *,
105
+ torch_dtype=None,
106
+ trust_remote_code: bool = False,
107
+ quantization_config=None,
108
+ offload_folder: str | None = None,
109
+ low_cpu_mem_usage: bool = False,
110
+ token: str | None = None,
111
+ ) -> AutoModelForCausalLM:
112
+ """Load a causal LM onto the best available device, MPS-safe.
113
+
114
+ Accelerate's ``device_map="auto"`` is not supported on MPS — models
115
+ silently land on CPU. This helper skips ``device_map`` on non-CUDA
116
+ backends and explicitly moves the model to the best device after loading.
117
+ On CUDA the behaviour is identical to ``device_map="auto"``.
118
+ """
119
+ kwargs: dict = {}
120
+ if torch_dtype is not None:
121
+ kwargs["torch_dtype"] = torch_dtype
122
+ if trust_remote_code:
123
+ kwargs["trust_remote_code"] = True
124
+ if quantization_config is not None:
125
+ kwargs["quantization_config"] = quantization_config
126
+ if offload_folder is not None:
127
+ kwargs["offload_folder"] = offload_folder
128
+ if low_cpu_mem_usage:
129
+ kwargs["low_cpu_mem_usage"] = True
130
+ if token is not None:
131
+ kwargs["token"] = token
132
+
133
+ if dev.supports_device_map_auto():
134
+ kwargs["device_map"] = "auto"
135
+
136
+ model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
137
+
138
+ # On MPS / CPU: model loaded without device_map, move to best device
139
+ if not dev.supports_device_map_auto():
140
+ target = dev.get_device()
141
+ model = model.to(target)
142
+
143
+ return model
144
+
145
+
146
  # ---------------------------------------------------------------------------
147
  # Global state
148
  # ---------------------------------------------------------------------------
 
209
  """
210
  global _last_obliterated_label, _obliterate_counter
211
  found_any = False
212
+ for pattern in ("obliterated_*", "obliterated", "bench_*", "obliteratus_tourney/r*"):
213
  for p in Path("/tmp").glob(pattern):
214
  if not p.is_dir():
215
  continue
 
336
  "optimized (bayesian auto-tuned)": "optimized",
337
  "inverted (semantic refusal inversion)": "inverted",
338
  "nuclear (maximum force combo)": "nuclear",
339
+ # Baseline reproductions for benchmarking
340
+ "failspy (FailSpy/abliterator baseline)": "failspy",
341
+ "gabliteration (Gülmez 2026 baseline)": "gabliteration",
342
+ "heretic (p-e-w 2025-2026 baseline)": "heretic",
343
+ "rdo (Wollschlager ICML 2025 baseline)": "rdo",
344
  }
345
 
346
  # ── Community Hub push ────────────────────────────────────────────────
 
392
  "spectral_cascade": cfg.get("spectral_cascade", False),
393
  "spectral_bands": cfg.get("spectral_bands", 3),
394
  "spectral_threshold": cfg.get("spectral_threshold", 0.05),
395
+ # Baseline-specific parameters
396
+ "layer_selection": cfg.get("layer_selection", "all"),
397
+ "winsorize_activations": cfg.get("winsorize_activations", False),
398
+ "winsorize_percentile": cfg.get("winsorize_percentile", 1.0),
399
+ "use_kl_optimization": cfg.get("use_kl_optimization", False),
400
+ "kl_budget": cfg.get("kl_budget", 0.5),
401
+ "float_layer_interpolation": cfg.get("float_layer_interpolation", False),
402
+ "rdo_refinement": cfg.get("rdo_refinement", False),
403
+ "cot_aware": cfg.get("cot_aware", False),
404
+ "bayesian_trials": cfg.get("bayesian_trials", 50),
405
+ "n_sae_features": cfg.get("n_sae_features", 64),
406
  }
407
 
408
  def _on_method_change(method_display: str):
 
437
  d["expert_transplant"],
438
  d["use_wasserstein_optimal"],
439
  d["spectral_cascade"],
440
+ d["layer_selection"],
441
+ d["winsorize_activations"],
442
+ d["winsorize_percentile"],
443
+ d["use_kl_optimization"],
444
+ d["kl_budget"],
445
+ d["float_layer_interpolation"],
446
+ d["rdo_refinement"],
447
+ d["cot_aware"],
448
+ d["bayesian_trials"],
449
+ d["n_sae_features"],
450
  )
451
 
452
  def _on_dataset_change(dataset_label: str):
 
1822
  adv_project_embeddings: bool, adv_activation_steering: bool,
1823
  adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
1824
  adv_spectral_cascade: bool,
1825
+ adv_layer_selection: str, adv_winsorize: bool,
1826
+ adv_winsorize_percentile: float,
1827
+ adv_kl_optimization: bool, adv_kl_budget: float,
1828
+ adv_float_layer_interp: bool, adv_rdo_refinement: bool,
1829
+ adv_cot_aware: bool,
1830
+ adv_bayesian_trials: int, adv_n_sae_features: int,
1831
  progress=gr.Progress()):
1832
  """Run the full obliteration pipeline, streaming log updates to the UI.
1833
 
 
2013
  spectral_bands=int(adv_spectral_bands),
2014
  spectral_threshold=float(adv_spectral_threshold),
2015
  verify_sample_size=int(adv_verify_sample_size),
2016
+ layer_selection=adv_layer_selection,
2017
+ winsorize_activations=adv_winsorize,
2018
+ winsorize_percentile=float(adv_winsorize_percentile),
2019
+ use_kl_optimization=adv_kl_optimization,
2020
+ kl_budget=float(adv_kl_budget),
2021
+ float_layer_interpolation=adv_float_layer_interp,
2022
+ rdo_refinement=adv_rdo_refinement,
2023
+ cot_aware=adv_cot_aware,
2024
+ n_sae_features=int(adv_n_sae_features),
2025
  )
2026
  pipeline_ref[0] = pipeline
2027
  pipeline.run()
 
2193
  bnb_4bit_quant_type="nf4",
2194
  llm_int8_enable_fp32_cpu_offload=True,
2195
  )
2196
+ model_reloaded = _load_model_to_device(
2197
  save_dir,
2198
  quantization_config=bnb_cfg,
 
2199
  trust_remote_code=True,
2200
  )
2201
  tokenizer_reloaded = AutoTokenizer.from_pretrained(
 
2233
  yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
2234
  try:
2235
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
2236
+ model_reloaded = _load_model_to_device(
2237
  save_dir,
 
2238
  offload_folder=offload_dir,
2239
  torch_dtype=torch.float16,
2240
  trust_remote_code=True,
 
2395
  if checkpoint and Path(checkpoint).exists():
2396
  try:
2397
  is_preset = (_state.get("model_name") or "") in MODELS
2398
+ model = _load_model_to_device(
2399
+ checkpoint, torch_dtype=torch.float16,
2400
  trust_remote_code=is_preset,
2401
  )
2402
  tokenizer = AutoTokenizer.from_pretrained(
 
2586
  if checkpoint and Path(checkpoint).exists():
2587
  is_preset = (_state.get("model_name") or "") in MODELS
2588
  try:
2589
+ model_loaded = _load_model_to_device(
2590
+ checkpoint, torch_dtype=torch.float16,
2591
  trust_remote_code=is_preset,
2592
  )
2593
  tokenizer_loaded = AutoTokenizer.from_pretrained(
 
2647
 
2648
  is_preset = cfg["model_choice"] in MODELS
2649
  try:
2650
+ model_loaded = _load_model_to_device(
2651
  checkpoint_dir,
 
2652
  torch_dtype=torch.float16,
2653
  trust_remote_code=is_preset,
2654
  )
 
2682
  )
2683
  yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
2684
  progress(0.5, desc="Loading 4-bit...")
2685
+ model_loaded = _load_model_to_device(
2686
  checkpoint_dir,
2687
  quantization_config=bnb_cfg,
 
2688
  trust_remote_code=is_preset,
2689
  )
2690
  tokenizer_loaded = AutoTokenizer.from_pretrained(
 
2826
  if checkpoint and Path(checkpoint).exists():
2827
  try:
2828
  is_preset = (model_name or "") in MODELS
2829
+ abliterated_model = _load_model_to_device(
2830
+ checkpoint, torch_dtype=torch.float16,
2831
  trust_remote_code=is_preset,
2832
  )
2833
  tokenizer = AutoTokenizer.from_pretrained(
 
2952
  is_preset = model_name in MODELS
2953
  original_response = ""
2954
  try:
2955
+ original_model = _load_model_to_device(
 
2956
  model_id, torch_dtype=torch.float16,
2957
+ trust_remote_code=is_preset,
2958
  low_cpu_mem_usage=True,
2959
  token=os.environ.get("HF_TOKEN") or None,
2960
  )
 
3265
  return _tourney_gpu_run(fn, *args, **kwargs)
3266
 
3267
 
3268
+ def run_tourney(model_choice, selected_methods, dataset, quantization):
3269
+ """Run an elimination tournament across selected abliteration methods.
3270
 
3271
  Each individual method is run inside its own ``@spaces.GPU`` allocation
3272
  (up to 5 minutes per method) so the full tournament is not constrained
 
3279
  yield "**Error:** Select a model first.", "", ""
3280
  return
3281
 
3282
+ if not selected_methods or len(selected_methods) < 3:
3283
+ yield "**Error:** Select at least 3 methods for a tournament.", "", ""
3284
+ return
3285
+
3286
  from obliteratus.tourney import (
3287
  TourneyRunner, render_bracket_html,
3288
  _load_checkpoint, _checkpoint_matches,
 
3314
  hub_repo=None,
3315
  dataset_key=dataset_key,
3316
  quantization=quant,
3317
+ methods=list(selected_methods),
3318
  on_log=logger,
3319
  resume=resume,
3320
  )
 
3419
  _ts = datetime.now().strftime("%H:%M")
3420
  _short = model_id.split("/")[-1] if "/" in model_id else model_id
3421
  _label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
3422
+ _winner_meta = {
3423
+ "model_id": model_id,
3424
+ "model_choice": model_choice,
3425
+ "method": winner.method,
3426
+ "dataset_key": dataset_key,
3427
+ "prompt_volume": 0,
3428
+ "output_dir": winner.output_dir,
3429
+ "source": "tourney",
3430
+ "tourney_score": winner.score,
3431
+ "tourney_metrics": winner.metrics,
3432
+ }
3433
  with _lock:
3434
+ _session_models[_label] = _winner_meta
3435
+ # Persist so the winner survives ZeroGPU process restarts
3436
+ _persist_session_meta(winner.output_dir, _label, {
3437
+ "model_id": model_id,
3438
+ "model_choice": model_choice,
3439
+ "method": winner.method,
3440
+ "dataset_key": dataset_key,
3441
+ "source": "tourney",
3442
+ })
 
 
3443
  yield (
3444
  f"**Champion: `{winner.method}`** "
3445
  f"(score: {winner.score:.4f})\n"
 
4108
  with gr.Row():
4109
  adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
4110
  info="DCT frequency decomposition for precision refusal targeting")
4111
+ gr.Markdown("**Layer Selection & Baseline Options**")
4112
+ with gr.Row():
4113
+ adv_layer_selection = gr.Dropdown(
4114
+ choices=["knee_cosmic", "all", "all_except_first", "middle60", "top_k", "knee"],
4115
+ value=_defaults["layer_selection"],
4116
+ label="Layer Selection",
4117
+ info="Which layers to project refusal directions from",
4118
+ )
4119
+ adv_winsorize_percentile = gr.Slider(
4120
+ 0.0, 1.0, value=_defaults["winsorize_percentile"], step=0.01,
4121
+ label="Winsorize Percentile",
4122
+ info="Activation clamping quantile (1.0 = disabled, 0.01 = 99th pctile)",
4123
+ )
4124
+ adv_kl_budget = gr.Slider(
4125
+ 0.0, 2.0, value=_defaults["kl_budget"], step=0.1,
4126
+ label="KL Budget",
4127
+ info="Max KL divergence from base model (Heretic/optimized)",
4128
+ )
4129
+ with gr.Row():
4130
+ adv_winsorize = gr.Checkbox(value=_defaults["winsorize_activations"], label="Winsorize Activations",
4131
+ info="Clamp outlier activations before direction extraction")
4132
+ adv_kl_optimization = gr.Checkbox(value=_defaults["use_kl_optimization"], label="KL Optimization",
4133
+ info="Optimize projection strength to stay within KL budget")
4134
+ adv_float_layer_interp = gr.Checkbox(value=_defaults["float_layer_interpolation"], label="Float Layer Interpolation",
4135
+ info="Interpolate between adjacent layers' directions (Heretic)")
4136
+ adv_rdo_refinement = gr.Checkbox(value=_defaults["rdo_refinement"], label="RDO Refinement",
4137
+ info="Gradient-based direction refinement (Wollschlager et al.)")
4138
+ with gr.Row():
4139
+ adv_cot_aware = gr.Checkbox(value=_defaults["cot_aware"], label="CoT-Aware",
4140
+ info="Preserve chain-of-thought reasoning during abliteration")
4141
+ with gr.Row():
4142
+ adv_bayesian_trials = gr.Slider(
4143
+ 10, 200, value=_defaults["bayesian_trials"], step=10,
4144
+ label="Bayesian Trials",
4145
+ info="Optuna TPE optimization trials (Heretic/optimized methods)",
4146
+ )
4147
+ adv_n_sae_features = gr.Slider(
4148
+ 16, 256, value=_defaults["n_sae_features"], step=16,
4149
+ label="SAE Features",
4150
+ info="Number of SAE features to target (inverted/nuclear methods)",
4151
+ )
4152
 
4153
  # List of all advanced controls (order must match _on_method_change return)
4154
  _adv_controls = [
 
4165
  adv_project_embeddings, adv_activation_steering,
4166
  adv_expert_transplant, adv_wasserstein_optimal,
4167
  adv_spectral_cascade,
4168
+ adv_layer_selection, adv_winsorize,
4169
+ adv_winsorize_percentile,
4170
+ adv_kl_optimization, adv_kl_budget,
4171
+ adv_float_layer_interp, adv_rdo_refinement,
4172
+ adv_cot_aware,
4173
+ adv_bayesian_trials, adv_n_sae_features,
4174
  ]
4175
 
4176
  obliterate_btn = gr.Button(
 
4341
  mm_method = gr.Dropdown(
4342
  choices=["basic", "advanced", "aggressive",
4343
  "spectral_cascade", "informed", "surgical",
4344
+ "optimized", "inverted", "nuclear",
4345
+ "failspy", "gabliteration", "heretic", "rdo"],
4346
  value="surgical",
4347
  label="Abliteration Method",
4348
  )
 
4711
 
4712
  # ── Tab 6: Tourney ────────────────────────────────────────────────
4713
  with gr.Tab("Tourney", id="tourney"):
4714
+ gr.Markdown("""### Tourney Mode
4715
+ Pit abliteration methods against each other in elimination rounds.
4716
  The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
4717
 
4718
+ **Round 1 — Qualifiers:** Selected methods, reduced prompts. Bottom half eliminated.
4719
  **Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
4720
  **Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
4721
  """)
 
4727
  allow_custom_value=True,
4728
  )
4729
 
4730
+ from obliteratus.tourney import TOURNEY_METHODS as _ALL_TOURNEY_METHODS
4731
+ tourney_methods_cb = gr.CheckboxGroup(
4732
+ choices=_ALL_TOURNEY_METHODS,
4733
+ value=_ALL_TOURNEY_METHODS,
4734
+ label="Methods to Compete",
4735
+ info="Pick at least 3 methods. All selected by default.",
4736
+ )
4737
+
4738
  with gr.Accordion("Advanced Settings", open=False):
4739
  with gr.Row():
4740
  tourney_dataset_dd = gr.Dropdown(
 
4764
 
4765
  tourney_btn.click(
4766
  fn=run_tourney,
4767
+ inputs=[tourney_model_dd, tourney_methods_cb,
4768
  tourney_dataset_dd, tourney_quant_dd],
4769
  outputs=[tourney_status, tourney_bracket, tourney_log],
4770
+ ).then(
4771
+ fn=lambda: (
4772
+ gr.update(choices=_get_session_model_choices()),
4773
+ gr.update(choices=_get_session_model_choices()),
4774
+ _get_vram_html(),
4775
+ ),
4776
+ outputs=[session_model_dd, ab_session_model_dd, vram_display],
4777
  )
4778
 
4779
  # ── Tab 7: Export ─────────────────────────────────────────────────
docs/index.html CHANGED
@@ -1317,11 +1317,56 @@
1317
  <span class="method-label">AGGRESSIVE</span>
1318
  <span class="method-desc">Full Gabliteration + 3-pass refine</span>
1319
  </label>
 
 
 
 
 
1320
  <label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
1321
  <input type="radio" name="abl-method" value="informed">
1322
  <span class="method-label" style="color:var(--cyan)">INFORMED</span>
1323
  <span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
1324
  </label>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1325
  </div>
1326
  <div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
1327
  4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs
@@ -1941,10 +1986,19 @@ function startAbliterateFromLibrary(hfId) {
1941
 
1942
  let ablMethod = 'advanced';
1943
  const METHOD_INFO = {
1944
- basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction &bull; standard projection &bull; 1 pass &bull; 32 prompt pairs'},
1945
- advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs'},
1946
- aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; norm-preserving &bull; full orthogonalization &bull; 3 refinement passes &bull; 32 prompt pairs'},
1947
- informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> &bull; auto directions &bull; auto regularization &bull; Ouroboros-compensated &bull; cone/alignment/cluster/defense analysis'},
 
 
 
 
 
 
 
 
 
1948
  };
1949
 
1950
  function getAblCmd() {
 
1317
  <span class="method-label">AGGRESSIVE</span>
1318
  <span class="method-desc">Full Gabliteration + 3-pass refine</span>
1319
  </label>
1320
+ <label class="method-radio" id="method-spectral_cascade" onclick="setAblMethod('spectral_cascade')">
1321
+ <input type="radio" name="abl-method" value="spectral_cascade">
1322
+ <span class="method-label">SPECTRAL</span>
1323
+ <span class="method-desc">DCT frequency-selective decomposition</span>
1324
+ </label>
1325
  <label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
1326
  <input type="radio" name="abl-method" value="informed">
1327
  <span class="method-label" style="color:var(--cyan)">INFORMED</span>
1328
  <span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
1329
  </label>
1330
+ <label class="method-radio" id="method-surgical" onclick="setAblMethod('surgical')">
1331
+ <input type="radio" name="abl-method" value="surgical">
1332
+ <span class="method-label">SURGICAL</span>
1333
+ <span class="method-desc">Precision MoE-aware head surgery</span>
1334
+ </label>
1335
+ <label class="method-radio" id="method-optimized" onclick="setAblMethod('optimized')">
1336
+ <input type="radio" name="abl-method" value="optimized">
1337
+ <span class="method-label">OPTIMIZED</span>
1338
+ <span class="method-desc">Bayesian auto-tuned + KL-optimized</span>
1339
+ </label>
1340
+ <label class="method-radio" id="method-inverted" onclick="setAblMethod('inverted')">
1341
+ <input type="radio" name="abl-method" value="inverted">
1342
+ <span class="method-label">INVERTED</span>
1343
+ <span class="method-desc">Semantic refusal inversion</span>
1344
+ </label>
1345
+ <label class="method-radio" id="method-nuclear" onclick="setAblMethod('nuclear')">
1346
+ <input type="radio" name="abl-method" value="nuclear">
1347
+ <span class="method-label">NUCLEAR</span>
1348
+ <span class="method-desc">Maximum force combo</span>
1349
+ </label>
1350
+ <label class="method-radio" id="method-failspy" onclick="setAblMethod('failspy')">
1351
+ <input type="radio" name="abl-method" value="failspy">
1352
+ <span class="method-label">FAILSPY</span>
1353
+ <span class="method-desc">FailSpy/abliterator baseline</span>
1354
+ </label>
1355
+ <label class="method-radio" id="method-gabliteration" onclick="setAblMethod('gabliteration')">
1356
+ <input type="radio" name="abl-method" value="gabliteration">
1357
+ <span class="method-label">GABLIT</span>
1358
+ <span class="method-desc">Gabliteration (G&uuml;lmez 2026) baseline</span>
1359
+ </label>
1360
+ <label class="method-radio" id="method-heretic" onclick="setAblMethod('heretic')">
1361
+ <input type="radio" name="abl-method" value="heretic">
1362
+ <span class="method-label">HERETIC</span>
1363
+ <span class="method-desc">Heretic/p-e-w Bayesian baseline</span>
1364
+ </label>
1365
+ <label class="method-radio" id="method-rdo" onclick="setAblMethod('rdo')">
1366
+ <input type="radio" name="abl-method" value="rdo">
1367
+ <span class="method-label">RDO</span>
1368
+ <span class="method-desc">Refusal Direction Optimization baseline</span>
1369
+ </label>
1370
  </div>
1371
  <div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
1372
  4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs
 
1986
 
1987
  let ablMethod = 'advanced';
1988
  const METHOD_INFO = {
1989
+ basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction &bull; standard projection &bull; 1 pass'},
1990
+ advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes'},
1991
+ aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; norm-preserving &bull; full orthogonalization &bull; 3 refinement passes'},
1992
+ spectral_cascade: {dirs:6, norm:true, reg:0.15, passes:1, desc:'6 whitened-SVD directions &bull; DCT frequency decomposition &bull; coherence-weighted &bull; adaptive bands'},
1993
+ informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> &bull; auto directions &bull; auto regularization &bull; Ouroboros-compensated &bull; cone/alignment/cluster analysis'},
1994
+ surgical: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; attention head surgery &bull; SAE features &bull; safety neuron masking &bull; per-expert MoE'},
1995
+ optimized: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; Bayesian auto-tuned &bull; CoT-aware &bull; KL co-optimized &bull; winsorized activations'},
1996
+ inverted: {dirs:4, norm:true, reg:0.1, passes:2, desc:'4 SVD directions &bull; semantic inversion (2x reflection) &bull; SAE feature targeting'},
1997
+ nuclear: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; all techniques combined &bull; maximum force &bull; head surgery + SAE + steering + transplant'},
1998
+ failspy: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means direction &bull; all layers except first &bull; FailSpy/abliterator reproduction'},
1999
+ gabliteration: {dirs:4, norm:false, reg:0.231, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; ridge reg (alpha=0.3) &bull; top-k layer selection &bull; G&uuml;lmez 2026'},
2000
+ heretic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means &bull; Bayesian (Optuna TPE) &bull; KL-optimized &bull; float layer interpolation &bull; p-e-w'},
2001
+ rdo: {dirs:4, norm:true, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; gradient-refined (RDO) &bull; linear probe classifier &bull; Wollschlager ICML 2025'},
2002
  };
2003
 
2004
  function getAblCmd() {
obliteratus/.DS_Store CHANGED
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
 
obliteratus/abliterate.py CHANGED
@@ -328,10 +328,11 @@ METHODS = {
328
  "description": (
329
  "Faithful reproduction of the FailSpy/abliterator library — the "
330
  "most widely used community tool. Single direction via difference-"
331
- "in-means (Arditi et al.), middle 60%% layer heuristic (layers "
332
- "20%%-80%%), no regularization, no norm preservation. Uses chat "
333
- "template for instruct models. This is what most HuggingFace "
334
- "abliterated models were created with."
 
335
  ),
336
  "n_directions": 1,
337
  "direction_method": "diff_means",
@@ -349,7 +350,7 @@ METHODS = {
349
  "attention_head_surgery": False,
350
  "use_sae_features": False,
351
  "invert_refusal": False,
352
- "layer_selection": "middle60",
353
  },
354
  "gabliteration": {
355
  "label": "Gabliteration (Gülmez 2026 Baseline)",
@@ -383,20 +384,26 @@ METHODS = {
383
  "layer_selection": "top_k",
384
  },
385
  "heretic": {
386
- "label": "Heretic / p-e-w (2025 Baseline)",
387
  "description": (
388
- "Faithful reproduction of Heretic's core algorithm (p-e-w, 2025). "
389
- "Bayesian optimization via Optuna TPE with parametric bell curve "
390
- "kernel. Uses 1-2 directions (float interpolation between top SVD "
391
- "components), component-specific scaling (attention vs MLP), "
392
- "activation winsorization (1%% tails). No whitened SVD, no SAE "
393
- "features, no jailbreak contrast. The key innovation is replacing "
 
 
 
 
394
  "manual hyperparameter selection with automated Pareto optimization "
395
- "over the (refusal_rate, KL_divergence) frontier."
396
  ),
397
- "n_directions": 2,
398
  "direction_method": "diff_means",
399
- "norm_preserve": True,
 
 
400
  "regularization": 0.0,
401
  "refinement_passes": 1,
402
  "project_biases": False,
@@ -404,14 +411,21 @@ METHODS = {
404
  "use_whitened_svd": False,
405
  "true_iterative_refinement": False,
406
  "use_jailbreak_contrast": False,
407
- "layer_adaptive_strength": True,
 
 
408
  "safety_neuron_masking": False,
409
  "per_expert_directions": False,
410
  "attention_head_surgery": False,
411
  "use_sae_features": False,
412
  "invert_refusal": False,
413
- "winsorize_activations": True,
414
- "winsorize_percentile": 0.01,
 
 
 
 
 
415
  "float_layer_interpolation": True,
416
  "cot_aware": False,
417
  "use_kl_optimization": True,
@@ -1689,7 +1703,8 @@ class AbliterationPipeline:
1689
  # Supports multiple algorithms for baseline comparison:
1690
  # knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
1691
  # knee: knee detection only (simplified OBLITERATUS)
1692
- # middle60: FailSpy/abliterator heuristic (layers 20%-80%)
 
1693
  # all: all layers (for Bayesian optimization / Heretic)
1694
  # top_k: top-k by refusal strength (Gabliteration-style)
1695
  sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
@@ -1702,8 +1717,14 @@ class AbliterationPipeline:
1702
 
1703
  selection_method = self.layer_selection
1704
 
1705
- if selection_method == "middle60":
1706
- # FailSpy/abliterator heuristic: middle 60% of layers
 
 
 
 
 
 
1707
  self._strong_layers = self._select_layers_middle60(n_layers)
1708
  self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
1709
 
@@ -2359,14 +2380,14 @@ class AbliterationPipeline:
2359
 
2360
  @staticmethod
2361
  def _select_layers_middle60(n_layers: int) -> list[int]:
2362
- """Select the middle 60% of layers (FailSpy/abliterator heuristic).
2363
 
2364
- The original abliterator library by FailSpy selects layers from index
2365
- n_layers*0.2 to n_layers*0.8, based on the empirical observation that
2366
- refusal concentrates in middle layers (not early embedding layers or
2367
- late unembedding layers).
2368
 
2369
- Reference: FailSpy/abliterator (2024), GitHub.
 
 
 
2370
  """
2371
  start = int(n_layers * 0.2)
2372
  end = int(n_layers * 0.8)
@@ -3751,7 +3772,9 @@ class AbliterationPipeline:
3751
 
3752
  # Respect configured layer_selection (matching _distill)
3753
  selection_method = self.layer_selection
3754
- if selection_method == "middle60":
 
 
3755
  self._strong_layers = self._select_layers_middle60(n_layers)
3756
  elif selection_method == "all":
3757
  self._strong_layers = self._select_layers_all(n_layers)
 
328
  "description": (
329
  "Faithful reproduction of the FailSpy/abliterator library — the "
330
  "most widely used community tool. Single direction via difference-"
331
+ "in-means (Arditi et al.), applied to all layers except layer 0 "
332
+ "(matching FailSpy source: range(1, n_layers)). Projects both "
333
+ "W_O (attention output) and MLP W_out. No regularization, no "
334
+ "norm preservation. Uses chat template for instruct models. "
335
+ "This is what most HuggingFace abliterated models were created with."
336
  ),
337
  "n_directions": 1,
338
  "direction_method": "diff_means",
 
350
  "attention_head_surgery": False,
351
  "use_sae_features": False,
352
  "invert_refusal": False,
353
+ "layer_selection": "all_except_first",
354
  },
355
  "gabliteration": {
356
  "label": "Gabliteration (Gülmez 2026 Baseline)",
 
384
  "layer_selection": "top_k",
385
  },
386
  "heretic": {
387
+ "label": "Heretic / p-e-w (2025-2026 Baseline)",
388
  "description": (
389
+ "Faithful reproduction of Heretic's core algorithm (p-e-w, 2025-2026). "
390
+ "Bayesian optimization via Optuna TPE with linear bell curve layer "
391
+ "weighting (NOT Gaussian linear interpolation between max_weight and "
392
+ "min_weight over min_weight_distance). One diff-of-means direction per "
393
+ "layer; direction_scope is sampled ('global' selects a float layer index "
394
+ "with lerp between adjacent layers' directions, 'per layer' uses each "
395
+ "layer's own direction). LoRA-based ablation (delta W = -lambda * v * "
396
+ "(v^T W)), never modifies base weights directly. Row normalization "
397
+ "defaults to NONE (PRE and FULL are options). Activation winsorization "
398
+ "via symmetric quantile clamping. The key innovation is replacing "
399
  "manual hyperparameter selection with automated Pareto optimization "
400
+ "over the (refusal_count, KL_divergence) frontier."
401
  ),
402
+ "n_directions": 1,
403
  "direction_method": "diff_means",
404
+ # Heretic default row_normalization is NONE; PRE/FULL are optional.
405
+ # OBLITERATUS norm_preserve=False matches Heretic's default behavior.
406
+ "norm_preserve": False,
407
  "regularization": 0.0,
408
  "refinement_passes": 1,
409
  "project_biases": False,
 
411
  "use_whitened_svd": False,
412
  "true_iterative_refinement": False,
413
  "use_jailbreak_contrast": False,
414
+ # Heretic uses its own bell curve weighting (linear, not Gaussian),
415
+ # not OBLITERATUS's norm-based layer_adaptive_strength.
416
+ "layer_adaptive_strength": False,
417
  "safety_neuron_masking": False,
418
  "per_expert_directions": False,
419
  "attention_head_surgery": False,
420
  "use_sae_features": False,
421
  "invert_refusal": False,
422
+ # Heretic default winsorization_quantile is 1.0 (disabled by default).
423
+ # For faithful baseline reproduction we match the source default.
424
+ "winsorize_activations": False,
425
+ "winsorize_percentile": 1.0,
426
+ # Heretic's float direction index interpolates between adjacent LAYERS'
427
+ # directions (not SVD components). OBLITERATUS float_layer_interpolation
428
+ # provides the bell-curve layer weighting aspect.
429
  "float_layer_interpolation": True,
430
  "cot_aware": False,
431
  "use_kl_optimization": True,
 
1703
  # Supports multiple algorithms for baseline comparison:
1704
  # knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
1705
  # knee: knee detection only (simplified OBLITERATUS)
1706
+ # middle60: legacy heuristic (layers 20%-80%)
1707
+ # all_except_first: FailSpy/abliterator (all layers except layer 0)
1708
  # all: all layers (for Bayesian optimization / Heretic)
1709
  # top_k: top-k by refusal strength (Gabliteration-style)
1710
  sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
 
1717
 
1718
  selection_method = self.layer_selection
1719
 
1720
+ if selection_method == "all_except_first":
1721
+ # FailSpy/abliterator: all layers except layer 0
1722
+ # Source: range(1, self.model.cfg.n_layers) in FailSpy/abliterator
1723
+ self._strong_layers = list(range(1, n_layers))
1724
+ self.log(f"Layer selection: all-except-first ({len(self._strong_layers)} layers)")
1725
+
1726
+ elif selection_method == "middle60":
1727
+ # Legacy heuristic: middle 60% of layers (layers 20%-80%)
1728
  self._strong_layers = self._select_layers_middle60(n_layers)
1729
  self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
1730
 
 
2380
 
2381
  @staticmethod
2382
  def _select_layers_middle60(n_layers: int) -> list[int]:
2383
+ """Select the middle 60% of layers (legacy heuristic).
2384
 
2385
+ Selects layers from index n_layers*0.2 to n_layers*0.8.
 
 
 
2386
 
2387
+ NOTE: This does NOT match FailSpy/abliterator's actual layer selection.
2388
+ FailSpy uses all layers except layer 0 (range(1, n_layers)). Use
2389
+ layer_selection="all_except_first" for faithful FailSpy reproduction.
2390
+ This method is retained for backward compatibility only.
2391
  """
2392
  start = int(n_layers * 0.2)
2393
  end = int(n_layers * 0.8)
 
3772
 
3773
  # Respect configured layer_selection (matching _distill)
3774
  selection_method = self.layer_selection
3775
+ if selection_method == "all_except_first":
3776
+ self._strong_layers = list(range(1, n_layers))
3777
+ elif selection_method == "middle60":
3778
  self._strong_layers = self._select_layers_middle60(n_layers)
3779
  elif selection_method == "all":
3780
  self._strong_layers = self._select_layers_all(n_layers)
obliteratus/analysis/conditional_abliteration.py CHANGED
@@ -269,7 +269,7 @@ class ConditionalAbliterator:
269
  ) -> torch.Tensor | None:
270
  """Extract category-specific refusal direction.
271
 
272
- Uses Fisher's Linear Discriminant (whitened difference-of-means)
273
  and then orthogonalizes against previously extracted directions
274
  to ensure category independence.
275
  """
 
269
  ) -> torch.Tensor | None:
270
  """Extract category-specific refusal direction.
271
 
272
+ Uses difference-of-means (category_mean - harmless_mean)
273
  and then orthogonalizes against previously extracted directions
274
  to ensure category independence.
275
  """
obliteratus/analysis/leace.py CHANGED
@@ -1,36 +1,34 @@
1
- """LEACE (LEAst-squares Concept Erasure) for refusal direction extraction.
2
-
3
- Closed-form optimal concept erasure that finds the minimum-rank edit to
4
- representations preventing any linear classifier from recovering the
5
- concept (harmful vs harmless). Unlike SVD, LEACE produces directions
6
- that are provably optimal for erasure with minimal distortion.
7
-
8
- The key insight: instead of finding directions of maximum variance (SVD)
9
- or maximum mean difference (diff-of-means), LEACE solves a constrained
10
- optimization problem: find the smallest perturbation to representations
11
- such that no linear probe can distinguish harmful from harmless.
12
-
13
- Mathematical formulation (rank-1 erasure):
14
- Given class-conditional means mu_0, mu_1 and within-class
15
- covariance S_w:
16
- 1. Compute mean difference: delta = mu_1 - mu_0
17
- 2. Compute within-class covariance: S_w = (S_0 + S_1) / 2
18
- 3. Solve generalized eigenvalue problem: S_b v = lambda S_w v
19
- where S_b = delta @ delta^T (between-class scatter)
20
- 4. The top generalized eigenvector is the LEACE direction
21
- 5. Erase by projecting out: x' = x - (x @ v) * v^T
22
-
23
- This is mathematically equivalent to Fisher's Linear Discriminant but
24
- applied as an erasure operation. The direction maximizes class
25
- separability relative to within-class spread, making it the optimal
26
- single direction to remove for concept erasure.
 
27
 
28
  Advantages over SVD:
29
- - Theoretically optimal: minimizes representation distortion for
30
- guaranteed erasure of linear concept information
31
- - Handles rogue dimensions naturally: within-class normalization
32
- prevents high-variance but non-discriminative dimensions from
33
- dominating
34
  - No hyperparameters beyond regularization epsilon
35
  - Closed-form solution (no iterative optimization)
36
 
@@ -39,6 +37,8 @@ References:
39
  closed form. NeurIPS 2023.
40
  - Ravfogel et al. (2022): RLACE: Adversarial concept erasure
41
  (iterative precursor to LEACE).
 
 
42
  """
43
 
44
  from __future__ import annotations
@@ -61,11 +61,11 @@ class LEACEResult:
61
 
62
 
63
  class LEACEExtractor:
64
- """Extract refusal directions via LEACE (closed-form concept erasure).
65
 
66
  Finds the direction that maximally separates harmful from harmless
67
- activations relative to within-class variance, then erases it.
68
- This is the provably optimal rank-1 concept erasure.
69
  """
70
 
71
  def __init__(
 
1
+ """LEACE-inspired direction extraction for refusal concept erasure.
2
+
3
+ This module implements Fisher's Linear Discriminant (FLD) direction for
4
+ concept erasure, inspired by LEACE (Belrose et al. 2023).
5
+
6
+ IMPORTANT: This is NOT a faithful implementation of LEACE as described in
7
+ the paper. Key difference:
8
+
9
+ - **True LEACE** uses the *total* covariance Sigma_X for whitening:
10
+ P* = I - W^{-1} P_{W Sigma_XZ} W where W = Sigma_X^{-1/2}
11
+ For binary concepts, this yields: v = Sigma_X^{-1} delta
12
+
13
+ - **This implementation** uses *within-class* covariance S_w:
14
+ v = S_w^{-1} delta
15
+ This is Fisher's Linear Discriminant direction, which maximizes
16
+ class separability relative to within-class spread.
17
+
18
+ For binary concepts, Sigma_X = S_w + p(1-p) * delta @ delta^T,
19
+ so the two directions differ when the between-class scatter is
20
+ non-negligible relative to within-class scatter. In high-dimensional
21
+ settings (d >> 1) with moderate class separation, the difference
22
+ is typically small but non-zero.
23
+
24
+ The FLD direction is still a strong choice for refusal erasure — it
25
+ handles rogue dimensions (high-variance but non-discriminative) better
26
+ than plain diff-of-means, and is a closed-form solution with no
27
+ iterative optimization.
28
 
29
  Advantages over SVD:
30
+ - Within-class normalization prevents high-variance but
31
+ non-discriminative dimensions from dominating
 
 
 
32
  - No hyperparameters beyond regularization epsilon
33
  - Closed-form solution (no iterative optimization)
34
 
 
37
  closed form. NeurIPS 2023.
38
  - Ravfogel et al. (2022): RLACE: Adversarial concept erasure
39
  (iterative precursor to LEACE).
40
+ - Fisher (1936): The use of multiple measurements in taxonomic
41
+ problems. Annals of Eugenics.
42
  """
43
 
44
  from __future__ import annotations
 
61
 
62
 
63
  class LEACEExtractor:
64
+ """Extract refusal directions via Fisher's Linear Discriminant.
65
 
66
  Finds the direction that maximally separates harmful from harmless
67
+ activations relative to within-class variance (v = S_w^{-1} delta).
68
+ See module docstring for how this relates to true LEACE.
69
  """
70
 
71
  def __init__(
obliteratus/analysis/riemannian_manifold.py CHANGED
@@ -428,8 +428,15 @@ class RiemannianManifoldAnalyzer:
428
  geodesic triangle with area A satisfies:
429
  sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
430
 
431
- We approximate geodesics with straight lines (valid for small K)
432
- and use angle excess to estimate K.
 
 
 
 
 
 
 
433
  """
434
  # Compute sides
435
  ab = (b - a).float()
@@ -613,8 +620,12 @@ class RiemannianManifoldAnalyzer:
613
  return torch.zeros_like(activation)
614
  v = v / norm
615
 
616
- # Correction magnitude: K * proj_magnitude^2 / 2
617
- correction_magnitude = curvature * proj_magnitude ** 2 / 2.0
 
 
 
 
618
 
619
  # Clamp to prevent instability
620
  correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
 
428
  geodesic triangle with area A satisfies:
429
  sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
430
 
431
+ IMPORTANT LIMITATION: This method uses Euclidean chords and angles
432
+ in ambient space, NOT geodesics on the manifold. In flat Euclidean
433
+ space, the angle sum of any triangle is exactly pi, so this method
434
+ will yield K ≈ 0 (up to numerical noise) regardless of the actual
435
+ manifold curvature. The results are only meaningful when the data
436
+ lies on an approximately low-dimensional curved submanifold and
437
+ triangles are sufficiently small relative to the curvature radius.
438
+ For rigorous curvature estimates, use methods based on local PCA
439
+ eigenvalue decay or Jacobian-based Riemannian metric computation.
440
  """
441
  # Compute sides
442
  ab = (b - a).float()
 
620
  return torch.zeros_like(activation)
621
  v = v / norm
622
 
623
+ # Second-order geodesic correction: K * proj_magnitude^2 / 6
624
+ # From Jacobi field estimate: deviation of geodesic from straight
625
+ # line over distance L with curvature K is ≈ K * L^2 / 6.
626
+ # Note: the residual bound in analyze() uses K * ||x||^2 / 8
627
+ # which is a looser upper bound including higher-order terms.
628
+ correction_magnitude = curvature * proj_magnitude ** 2 / 6.0
629
 
630
  # Clamp to prevent instability
631
  correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
obliteratus/analysis/sae_abliteration.py CHANGED
@@ -94,7 +94,7 @@ class SparseAutoencoder(nn.Module):
94
 
95
  @property
96
  def decoder_weight(self) -> torch.Tensor:
97
- """Return the decoder weight matrix (n_features x hidden_dim for untied, or encoder.weight.T)."""
98
  if self.tied_weights:
99
  return self.encoder.weight.T
100
  return self.decoder.weight
 
94
 
95
  @property
96
  def decoder_weight(self) -> torch.Tensor:
97
+ """Return the decoder weight matrix (hidden_dim x n_features for untied, or encoder.weight.T)."""
98
  if self.tied_weights:
99
  return self.encoder.weight.T
100
  return self.decoder.weight
obliteratus/analysis/spectral_certification.py CHANGED
@@ -175,10 +175,11 @@ class SpectralCertifier:
175
  harmful_centered = harmful_activations - harmful_mean
176
  harmless_centered = harmless_activations - harmless_mean
177
 
178
- # Pooled within-class covariance
 
179
  cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
180
  cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
181
- pooled_cov = (cov_h * n_h + cov_b * n_b) / max(n - 2, 1)
182
 
183
  # Step 2: Estimate noise variance (median eigenvalue method)
184
  noise_var = self._estimate_noise_variance(pooled_cov, n, d)
@@ -374,8 +375,13 @@ class SpectralCertifier:
374
  # Correct for MP bias: median of MP distribution
375
  gamma = d / max(n, 1)
376
  if gamma < 1:
377
- # MP median approximation (from Bai & Silverstein)
378
- mp_median_ratio = (1 + math.sqrt(gamma)) ** 2 * 0.5
 
 
 
 
 
379
  noise_var = median_eig / max(mp_median_ratio, 1e-10)
380
  else:
381
  noise_var = median_eig
 
175
  harmful_centered = harmful_activations - harmful_mean
176
  harmless_centered = harmless_activations - harmless_mean
177
 
178
+ # Pooled within-class covariance (standard formula: sum of scatter
179
+ # matrices divided by total degrees of freedom)
180
  cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
181
  cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
182
+ pooled_cov = (cov_h * (n_h - 1) + cov_b * (n_b - 1)) / max(n - 2, 1)
183
 
184
  # Step 2: Estimate noise variance (median eigenvalue method)
185
  noise_var = self._estimate_noise_variance(pooled_cov, n, d)
 
375
  # Correct for MP bias: median of MP distribution
376
  gamma = d / max(n, 1)
377
  if gamma < 1:
378
+ # MP median approximation. The exact MP median requires
379
+ # numerical inversion of the MP CDF; we use the empirical
380
+ # approximation median ≈ (1 - sqrt(gamma))^2 + gamma^(1/3)
381
+ # which is more accurate than the naive 0.5 * upper_edge
382
+ # for small gamma. Falls back to the simpler formula when
383
+ # gamma is very small.
384
+ mp_median_ratio = (1 - math.sqrt(gamma)) ** 2 + gamma ** (1.0 / 3.0)
385
  noise_var = median_eig / max(mp_median_ratio, 1e-10)
386
  else:
387
  noise_var = median_eig
obliteratus/analysis/wasserstein_optimal.py CHANGED
@@ -58,7 +58,7 @@ class WassersteinDirectionResult:
58
  direction: torch.Tensor # (hidden_dim,) optimal direction
59
  wasserstein_cost: float # W_2^2 cost for this direction
60
  mean_shift_component: float # (r^T m)^2 portion
61
- bures_component: float # r^T Sigma r portion (upper bound)
62
  refusal_projection: float # (r^T d)^2
63
  cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
64
 
 
58
  direction: torch.Tensor # (hidden_dim,) optimal direction
59
  wasserstein_cost: float # W_2^2 cost for this direction
60
  mean_shift_component: float # (r^T m)^2 portion
61
+ bures_component: float # r^T Sigma r portion (exact when r is eigenvector of Sigma, lower bound otherwise)
62
  refusal_projection: float # (r^T d)^2
63
  cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
64
 
obliteratus/evaluation/heretic_eval.py CHANGED
@@ -334,19 +334,20 @@ def _load_harmbench_classifier():
334
  bnb_4bit_quant_type="nf4",
335
  llm_int8_enable_fp32_cpu_offload=True,
336
  )
337
- model = AutoModelForCausalLM.from_pretrained(
338
- model_id,
339
- quantization_config=bnb_cfg,
340
- device_map="auto",
341
- torch_dtype=torch.float16,
342
- )
343
  except Exception:
344
  logger.info("4-bit quantization unavailable for classifier, loading in float16")
345
- model = AutoModelForCausalLM.from_pretrained(
346
- model_id,
347
- device_map="auto",
348
- torch_dtype=torch.float16,
349
- )
 
 
 
350
 
351
  model.eval()
352
  _HARMBENCH_CLASSIFIER = (model, tokenizer)
 
334
  bnb_4bit_quant_type="nf4",
335
  llm_int8_enable_fp32_cpu_offload=True,
336
  )
337
+ load_kwargs = dict(quantization_config=bnb_cfg, torch_dtype=torch.float16)
338
+ if dev.supports_device_map_auto():
339
+ load_kwargs["device_map"] = "auto"
340
+ model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
 
 
341
  except Exception:
342
  logger.info("4-bit quantization unavailable for classifier, loading in float16")
343
+ load_kwargs = dict(torch_dtype=torch.float16)
344
+ if dev.supports_device_map_auto():
345
+ load_kwargs["device_map"] = "auto"
346
+ model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
347
+
348
+ # On MPS/CPU: move model to best available device
349
+ if not dev.supports_device_map_auto():
350
+ model = model.to(dev.get_device())
351
 
352
  model.eval()
353
  _HARMBENCH_CLASSIFIER = (model, tokenizer)
obliteratus/tourney.py CHANGED
@@ -1097,6 +1097,11 @@ class TourneyRunner:
1097
  result.winner = winner
1098
  result.total_time_s = time.time() - t_start
1099
 
 
 
 
 
 
1100
  self.log("")
1101
  self.log("=" * 60)
1102
  if winner:
@@ -1401,6 +1406,11 @@ class TourneyRunner:
1401
  result.winner = winner
1402
  result.total_time_s = time.time() - t_start
1403
 
 
 
 
 
 
1404
  self.log("")
1405
  self.log("=" * 60)
1406
  if winner:
 
1097
  result.winner = winner
1098
  result.total_time_s = time.time() - t_start
1099
 
1100
+ # Clean up non-winner finalist dirs to free disk
1101
+ for c in ranked[1:]:
1102
+ if c.output_dir and Path(c.output_dir).exists():
1103
+ shutil.rmtree(c.output_dir, ignore_errors=True)
1104
+
1105
  self.log("")
1106
  self.log("=" * 60)
1107
  if winner:
 
1406
  result.winner = winner
1407
  result.total_time_s = time.time() - t_start
1408
 
1409
+ # Clean up non-winner finalist dirs to free disk
1410
+ for c in ranked[1:]:
1411
+ if c.output_dir and Path(c.output_dir).exists():
1412
+ shutil.rmtree(c.output_dir, ignore_errors=True)
1413
+
1414
  self.log("")
1415
  self.log("=" * 60)
1416
  if winner: