pliny-the-prompter committed on
Commit
f0084ba
·
verified ·
1 Parent(s): d0e05ab

Upload 135 files

Browse files
app.py CHANGED
@@ -98,6 +98,51 @@ def _is_quota_error(exc: BaseException) -> bool:
98
  return True
99
  return False
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  # ---------------------------------------------------------------------------
102
  # Global state
103
  # ---------------------------------------------------------------------------
@@ -164,7 +209,7 @@ def _recover_sessions_from_disk() -> None:
164
  """
165
  global _last_obliterated_label, _obliterate_counter
166
  found_any = False
167
- for pattern in ("obliterated_*", "obliterated", "bench_*"):
168
  for p in Path("/tmp").glob(pattern):
169
  if not p.is_dir():
170
  continue
@@ -291,6 +336,11 @@ METHODS = {
291
  "optimized (bayesian auto-tuned)": "optimized",
292
  "inverted (semantic refusal inversion)": "inverted",
293
  "nuclear (maximum force combo)": "nuclear",
 
 
 
 
 
294
  }
295
 
296
  # ── Community Hub push ────────────────────────────────────────────────
@@ -342,6 +392,17 @@ def _get_preset_defaults(method_display: str):
342
  "spectral_cascade": cfg.get("spectral_cascade", False),
343
  "spectral_bands": cfg.get("spectral_bands", 3),
344
  "spectral_threshold": cfg.get("spectral_threshold", 0.05),
 
 
 
 
 
 
 
 
 
 
 
345
  }
346
 
347
  def _on_method_change(method_display: str):
@@ -376,6 +437,16 @@ def _on_method_change(method_display: str):
376
  d["expert_transplant"],
377
  d["use_wasserstein_optimal"],
378
  d["spectral_cascade"],
 
 
 
 
 
 
 
 
 
 
379
  )
380
 
381
  def _on_dataset_change(dataset_label: str):
@@ -1751,6 +1822,12 @@ def obliterate(model_choice: str, method_choice: str,
1751
  adv_project_embeddings: bool, adv_activation_steering: bool,
1752
  adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
1753
  adv_spectral_cascade: bool,
 
 
 
 
 
 
1754
  progress=gr.Progress()):
1755
  """Run the full obliteration pipeline, streaming log updates to the UI.
1756
 
@@ -1936,6 +2013,15 @@ def obliterate(model_choice: str, method_choice: str,
1936
  spectral_bands=int(adv_spectral_bands),
1937
  spectral_threshold=float(adv_spectral_threshold),
1938
  verify_sample_size=int(adv_verify_sample_size),
 
 
 
 
 
 
 
 
 
1939
  )
1940
  pipeline_ref[0] = pipeline
1941
  pipeline.run()
@@ -2107,10 +2193,9 @@ def obliterate(model_choice: str, method_choice: str,
2107
  bnb_4bit_quant_type="nf4",
2108
  llm_int8_enable_fp32_cpu_offload=True,
2109
  )
2110
- model_reloaded = AutoModelForCausalLM.from_pretrained(
2111
  save_dir,
2112
  quantization_config=bnb_cfg,
2113
- device_map="auto",
2114
  trust_remote_code=True,
2115
  )
2116
  tokenizer_reloaded = AutoTokenizer.from_pretrained(
@@ -2148,9 +2233,8 @@ def obliterate(model_choice: str, method_choice: str,
2148
  yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
2149
  try:
2150
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
2151
- model_reloaded = AutoModelForCausalLM.from_pretrained(
2152
  save_dir,
2153
- device_map="auto",
2154
  offload_folder=offload_dir,
2155
  torch_dtype=torch.float16,
2156
  trust_remote_code=True,
@@ -2311,8 +2395,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
2311
  if checkpoint and Path(checkpoint).exists():
2312
  try:
2313
  is_preset = (_state.get("model_name") or "") in MODELS
2314
- model = AutoModelForCausalLM.from_pretrained(
2315
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2316
  trust_remote_code=is_preset,
2317
  )
2318
  tokenizer = AutoTokenizer.from_pretrained(
@@ -2502,8 +2586,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2502
  if checkpoint and Path(checkpoint).exists():
2503
  is_preset = (_state.get("model_name") or "") in MODELS
2504
  try:
2505
- model_loaded = AutoModelForCausalLM.from_pretrained(
2506
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2507
  trust_remote_code=is_preset,
2508
  )
2509
  tokenizer_loaded = AutoTokenizer.from_pretrained(
@@ -2563,9 +2647,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2563
 
2564
  is_preset = cfg["model_choice"] in MODELS
2565
  try:
2566
- model_loaded = AutoModelForCausalLM.from_pretrained(
2567
  checkpoint_dir,
2568
- device_map="auto",
2569
  torch_dtype=torch.float16,
2570
  trust_remote_code=is_preset,
2571
  )
@@ -2599,10 +2682,9 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2599
  )
2600
  yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
2601
  progress(0.5, desc="Loading 4-bit...")
2602
- model_loaded = AutoModelForCausalLM.from_pretrained(
2603
  checkpoint_dir,
2604
  quantization_config=bnb_cfg,
2605
- device_map="auto",
2606
  trust_remote_code=is_preset,
2607
  )
2608
  tokenizer_loaded = AutoTokenizer.from_pretrained(
@@ -2744,8 +2826,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2744
  if checkpoint and Path(checkpoint).exists():
2745
  try:
2746
  is_preset = (model_name or "") in MODELS
2747
- abliterated_model = AutoModelForCausalLM.from_pretrained(
2748
- checkpoint, device_map="auto", torch_dtype=torch.float16,
2749
  trust_remote_code=is_preset,
2750
  )
2751
  tokenizer = AutoTokenizer.from_pretrained(
@@ -2870,10 +2952,9 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2870
  is_preset = model_name in MODELS
2871
  original_response = ""
2872
  try:
2873
- from transformers import AutoModelForCausalLM as AMCLM
2874
- original_model = AMCLM.from_pretrained(
2875
  model_id, torch_dtype=torch.float16,
2876
- device_map="auto", trust_remote_code=is_preset,
2877
  low_cpu_mem_usage=True,
2878
  token=os.environ.get("HF_TOKEN") or None,
2879
  )
@@ -3184,8 +3265,8 @@ def _tourney_gpu_wrapper(fn, *args, **kwargs):
3184
  return _tourney_gpu_run(fn, *args, **kwargs)
3185
 
3186
 
3187
- def run_tourney(model_choice, dataset, quantization):
3188
- """Run an elimination tournament across all abliteration methods.
3189
 
3190
  Each individual method is run inside its own ``@spaces.GPU`` allocation
3191
  (up to 5 minutes per method) so the full tournament is not constrained
@@ -3198,6 +3279,10 @@ def run_tourney(model_choice, dataset, quantization):
3198
  yield "**Error:** Select a model first.", "", ""
3199
  return
3200
 
 
 
 
 
3201
  from obliteratus.tourney import (
3202
  TourneyRunner, render_bracket_html,
3203
  _load_checkpoint, _checkpoint_matches,
@@ -3229,6 +3314,7 @@ def run_tourney(model_choice, dataset, quantization):
3229
  hub_repo=None,
3230
  dataset_key=dataset_key,
3231
  quantization=quant,
 
3232
  on_log=logger,
3233
  resume=resume,
3234
  )
@@ -3333,18 +3419,27 @@ def run_tourney(model_choice, dataset, quantization):
3333
  _ts = datetime.now().strftime("%H:%M")
3334
  _short = model_id.split("/")[-1] if "/" in model_id else model_id
3335
  _label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
 
 
 
 
 
 
 
 
 
 
 
3336
  with _lock:
3337
- _session_models[_label] = {
3338
- "model_id": model_id,
3339
- "model_choice": model_choice,
3340
- "method": winner.method,
3341
- "dataset_key": dataset_key,
3342
- "prompt_volume": 0,
3343
- "output_dir": winner.output_dir,
3344
- "source": "tourney",
3345
- "tourney_score": winner.score,
3346
- "tourney_metrics": winner.metrics,
3347
- }
3348
  yield (
3349
  f"**Champion: `{winner.method}`** "
3350
  f"(score: {winner.score:.4f})\n"
@@ -4013,6 +4108,47 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
4013
  with gr.Row():
4014
  adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
4015
  info="DCT frequency decomposition for precision refusal targeting")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4016
 
4017
  # List of all advanced controls (order must match _on_method_change return)
4018
  _adv_controls = [
@@ -4029,6 +4165,12 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
4029
  adv_project_embeddings, adv_activation_steering,
4030
  adv_expert_transplant, adv_wasserstein_optimal,
4031
  adv_spectral_cascade,
 
 
 
 
 
 
4032
  ]
4033
 
4034
  obliterate_btn = gr.Button(
@@ -4199,7 +4341,8 @@ result = client.predict(
4199
  mm_method = gr.Dropdown(
4200
  choices=["basic", "advanced", "aggressive",
4201
  "spectral_cascade", "informed", "surgical",
4202
- "optimized", "inverted", "nuclear"],
 
4203
  value="surgical",
4204
  label="Abliteration Method",
4205
  )
@@ -4568,11 +4711,11 @@ tradeoff point where refusal is minimized with minimal capability damage.
4568
 
4569
  # ── Tab 6: Tourney ────────────────────────────────────────────────
4570
  with gr.Tab("Tourney", id="tourney"):
4571
- gr.Markdown("""### March Madness Tournament
4572
- Pit **all abliteration methods** against each other in elimination rounds.
4573
  The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
4574
 
4575
- **Round 1 — Qualifiers:** All methods, reduced prompts. Bottom half eliminated.
4576
  **Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
4577
  **Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
4578
  """)
@@ -4584,6 +4727,14 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
4584
  allow_custom_value=True,
4585
  )
4586
 
 
 
 
 
 
 
 
 
4587
  with gr.Accordion("Advanced Settings", open=False):
4588
  with gr.Row():
4589
  tourney_dataset_dd = gr.Dropdown(
@@ -4613,9 +4764,16 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
4613
 
4614
  tourney_btn.click(
4615
  fn=run_tourney,
4616
- inputs=[tourney_model_dd,
4617
  tourney_dataset_dd, tourney_quant_dd],
4618
  outputs=[tourney_status, tourney_bracket, tourney_log],
 
 
 
 
 
 
 
4619
  )
4620
 
4621
  # ── Tab 7: Export ─────────────────────────────────────────────────
 
98
  return True
99
  return False
100
 
101
+
102
+ def _load_model_to_device(
103
+ pretrained_path: str,
104
+ *,
105
+ torch_dtype=None,
106
+ trust_remote_code: bool = False,
107
+ quantization_config=None,
108
+ offload_folder: str | None = None,
109
+ low_cpu_mem_usage: bool = False,
110
+ token: str | None = None,
111
+ ) -> AutoModelForCausalLM:
112
+ """Load a causal LM onto the best available device, MPS-safe.
113
+
114
+ Accelerate's ``device_map="auto"`` is not supported on MPS — models
115
+ silently land on CPU. This helper skips ``device_map`` on non-CUDA
116
+ backends and explicitly moves the model to the best device after loading.
117
+ On CUDA the behaviour is identical to ``device_map="auto"``.
118
+ """
119
+ kwargs: dict = {}
120
+ if torch_dtype is not None:
121
+ kwargs["torch_dtype"] = torch_dtype
122
+ if trust_remote_code:
123
+ kwargs["trust_remote_code"] = True
124
+ if quantization_config is not None:
125
+ kwargs["quantization_config"] = quantization_config
126
+ if offload_folder is not None:
127
+ kwargs["offload_folder"] = offload_folder
128
+ if low_cpu_mem_usage:
129
+ kwargs["low_cpu_mem_usage"] = True
130
+ if token is not None:
131
+ kwargs["token"] = token
132
+
133
+ if dev.supports_device_map_auto():
134
+ kwargs["device_map"] = "auto"
135
+
136
+ model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
137
+
138
+ # On MPS / CPU: model loaded without device_map, move to best device
139
+ if not dev.supports_device_map_auto():
140
+ target = dev.get_device()
141
+ model = model.to(target)
142
+
143
+ return model
144
+
145
+
146
  # ---------------------------------------------------------------------------
147
  # Global state
148
  # ---------------------------------------------------------------------------
 
209
  """
210
  global _last_obliterated_label, _obliterate_counter
211
  found_any = False
212
+ for pattern in ("obliterated_*", "obliterated", "bench_*", "obliteratus_tourney/r*"):
213
  for p in Path("/tmp").glob(pattern):
214
  if not p.is_dir():
215
  continue
 
336
  "optimized (bayesian auto-tuned)": "optimized",
337
  "inverted (semantic refusal inversion)": "inverted",
338
  "nuclear (maximum force combo)": "nuclear",
339
+ # Baseline reproductions for benchmarking
340
+ "failspy (FailSpy/abliterator baseline)": "failspy",
341
+ "gabliteration (Gülmez 2026 baseline)": "gabliteration",
342
+ "heretic (p-e-w 2025-2026 baseline)": "heretic",
343
+ "rdo (Wollschlager ICML 2025 baseline)": "rdo",
344
  }
345
 
346
  # ── Community Hub push ────────────────────────────────────────────────
 
392
  "spectral_cascade": cfg.get("spectral_cascade", False),
393
  "spectral_bands": cfg.get("spectral_bands", 3),
394
  "spectral_threshold": cfg.get("spectral_threshold", 0.05),
395
+ # Baseline-specific parameters
396
+ "layer_selection": cfg.get("layer_selection", "all"),
397
+ "winsorize_activations": cfg.get("winsorize_activations", False),
398
+ "winsorize_percentile": cfg.get("winsorize_percentile", 1.0),
399
+ "use_kl_optimization": cfg.get("use_kl_optimization", False),
400
+ "kl_budget": cfg.get("kl_budget", 0.5),
401
+ "float_layer_interpolation": cfg.get("float_layer_interpolation", False),
402
+ "rdo_refinement": cfg.get("rdo_refinement", False),
403
+ "cot_aware": cfg.get("cot_aware", False),
404
+ "bayesian_trials": cfg.get("bayesian_trials", 50),
405
+ "n_sae_features": cfg.get("n_sae_features", 64),
406
  }
407
 
408
  def _on_method_change(method_display: str):
 
437
  d["expert_transplant"],
438
  d["use_wasserstein_optimal"],
439
  d["spectral_cascade"],
440
+ d["layer_selection"],
441
+ d["winsorize_activations"],
442
+ d["winsorize_percentile"],
443
+ d["use_kl_optimization"],
444
+ d["kl_budget"],
445
+ d["float_layer_interpolation"],
446
+ d["rdo_refinement"],
447
+ d["cot_aware"],
448
+ d["bayesian_trials"],
449
+ d["n_sae_features"],
450
  )
451
 
452
  def _on_dataset_change(dataset_label: str):
 
1822
  adv_project_embeddings: bool, adv_activation_steering: bool,
1823
  adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
1824
  adv_spectral_cascade: bool,
1825
+ adv_layer_selection: str, adv_winsorize: bool,
1826
+ adv_winsorize_percentile: float,
1827
+ adv_kl_optimization: bool, adv_kl_budget: float,
1828
+ adv_float_layer_interp: bool, adv_rdo_refinement: bool,
1829
+ adv_cot_aware: bool,
1830
+ adv_bayesian_trials: int, adv_n_sae_features: int,
1831
  progress=gr.Progress()):
1832
  """Run the full obliteration pipeline, streaming log updates to the UI.
1833
 
 
2013
  spectral_bands=int(adv_spectral_bands),
2014
  spectral_threshold=float(adv_spectral_threshold),
2015
  verify_sample_size=int(adv_verify_sample_size),
2016
+ layer_selection=adv_layer_selection,
2017
+ winsorize_activations=adv_winsorize,
2018
+ winsorize_percentile=float(adv_winsorize_percentile),
2019
+ use_kl_optimization=adv_kl_optimization,
2020
+ kl_budget=float(adv_kl_budget),
2021
+ float_layer_interpolation=adv_float_layer_interp,
2022
+ rdo_refinement=adv_rdo_refinement,
2023
+ cot_aware=adv_cot_aware,
2024
+ n_sae_features=int(adv_n_sae_features),
2025
  )
2026
  pipeline_ref[0] = pipeline
2027
  pipeline.run()
 
2193
  bnb_4bit_quant_type="nf4",
2194
  llm_int8_enable_fp32_cpu_offload=True,
2195
  )
2196
+ model_reloaded = _load_model_to_device(
2197
  save_dir,
2198
  quantization_config=bnb_cfg,
 
2199
  trust_remote_code=True,
2200
  )
2201
  tokenizer_reloaded = AutoTokenizer.from_pretrained(
 
2233
  yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
2234
  try:
2235
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
2236
+ model_reloaded = _load_model_to_device(
2237
  save_dir,
 
2238
  offload_folder=offload_dir,
2239
  torch_dtype=torch.float16,
2240
  trust_remote_code=True,
 
2395
  if checkpoint and Path(checkpoint).exists():
2396
  try:
2397
  is_preset = (_state.get("model_name") or "") in MODELS
2398
+ model = _load_model_to_device(
2399
+ checkpoint, torch_dtype=torch.float16,
2400
  trust_remote_code=is_preset,
2401
  )
2402
  tokenizer = AutoTokenizer.from_pretrained(
 
2586
  if checkpoint and Path(checkpoint).exists():
2587
  is_preset = (_state.get("model_name") or "") in MODELS
2588
  try:
2589
+ model_loaded = _load_model_to_device(
2590
+ checkpoint, torch_dtype=torch.float16,
2591
  trust_remote_code=is_preset,
2592
  )
2593
  tokenizer_loaded = AutoTokenizer.from_pretrained(
 
2647
 
2648
  is_preset = cfg["model_choice"] in MODELS
2649
  try:
2650
+ model_loaded = _load_model_to_device(
2651
  checkpoint_dir,
 
2652
  torch_dtype=torch.float16,
2653
  trust_remote_code=is_preset,
2654
  )
 
2682
  )
2683
  yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
2684
  progress(0.5, desc="Loading 4-bit...")
2685
+ model_loaded = _load_model_to_device(
2686
  checkpoint_dir,
2687
  quantization_config=bnb_cfg,
 
2688
  trust_remote_code=is_preset,
2689
  )
2690
  tokenizer_loaded = AutoTokenizer.from_pretrained(
 
2826
  if checkpoint and Path(checkpoint).exists():
2827
  try:
2828
  is_preset = (model_name or "") in MODELS
2829
+ abliterated_model = _load_model_to_device(
2830
+ checkpoint, torch_dtype=torch.float16,
2831
  trust_remote_code=is_preset,
2832
  )
2833
  tokenizer = AutoTokenizer.from_pretrained(
 
2952
  is_preset = model_name in MODELS
2953
  original_response = ""
2954
  try:
2955
+ original_model = _load_model_to_device(
 
2956
  model_id, torch_dtype=torch.float16,
2957
+ trust_remote_code=is_preset,
2958
  low_cpu_mem_usage=True,
2959
  token=os.environ.get("HF_TOKEN") or None,
2960
  )
 
3265
  return _tourney_gpu_run(fn, *args, **kwargs)
3266
 
3267
 
3268
+ def run_tourney(model_choice, selected_methods, dataset, quantization):
3269
+ """Run an elimination tournament across selected abliteration methods.
3270
 
3271
  Each individual method is run inside its own ``@spaces.GPU`` allocation
3272
  (up to 5 minutes per method) so the full tournament is not constrained
 
3279
  yield "**Error:** Select a model first.", "", ""
3280
  return
3281
 
3282
+ if not selected_methods or len(selected_methods) < 3:
3283
+ yield "**Error:** Select at least 3 methods for a tournament.", "", ""
3284
+ return
3285
+
3286
  from obliteratus.tourney import (
3287
  TourneyRunner, render_bracket_html,
3288
  _load_checkpoint, _checkpoint_matches,
 
3314
  hub_repo=None,
3315
  dataset_key=dataset_key,
3316
  quantization=quant,
3317
+ methods=list(selected_methods),
3318
  on_log=logger,
3319
  resume=resume,
3320
  )
 
3419
  _ts = datetime.now().strftime("%H:%M")
3420
  _short = model_id.split("/")[-1] if "/" in model_id else model_id
3421
  _label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
3422
+ _winner_meta = {
3423
+ "model_id": model_id,
3424
+ "model_choice": model_choice,
3425
+ "method": winner.method,
3426
+ "dataset_key": dataset_key,
3427
+ "prompt_volume": 0,
3428
+ "output_dir": winner.output_dir,
3429
+ "source": "tourney",
3430
+ "tourney_score": winner.score,
3431
+ "tourney_metrics": winner.metrics,
3432
+ }
3433
  with _lock:
3434
+ _session_models[_label] = _winner_meta
3435
+ # Persist so the winner survives ZeroGPU process restarts
3436
+ _persist_session_meta(winner.output_dir, _label, {
3437
+ "model_id": model_id,
3438
+ "model_choice": model_choice,
3439
+ "method": winner.method,
3440
+ "dataset_key": dataset_key,
3441
+ "source": "tourney",
3442
+ })
 
 
3443
  yield (
3444
  f"**Champion: `{winner.method}`** "
3445
  f"(score: {winner.score:.4f})\n"
 
4108
  with gr.Row():
4109
  adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
4110
  info="DCT frequency decomposition for precision refusal targeting")
4111
+ gr.Markdown("**Layer Selection & Baseline Options**")
4112
+ with gr.Row():
4113
+ adv_layer_selection = gr.Dropdown(
4114
+ choices=["knee_cosmic", "all", "all_except_first", "middle60", "top_k", "knee"],
4115
+ value=_defaults["layer_selection"],
4116
+ label="Layer Selection",
4117
+ info="Which layers to project refusal directions from",
4118
+ )
4119
+ adv_winsorize_percentile = gr.Slider(
4120
+ 0.0, 1.0, value=_defaults["winsorize_percentile"], step=0.01,
4121
+ label="Winsorize Percentile",
4122
+ info="Activation clamping quantile (1.0 = disabled, 0.01 = 99th pctile)",
4123
+ )
4124
+ adv_kl_budget = gr.Slider(
4125
+ 0.0, 2.0, value=_defaults["kl_budget"], step=0.1,
4126
+ label="KL Budget",
4127
+ info="Max KL divergence from base model (Heretic/optimized)",
4128
+ )
4129
+ with gr.Row():
4130
+ adv_winsorize = gr.Checkbox(value=_defaults["winsorize_activations"], label="Winsorize Activations",
4131
+ info="Clamp outlier activations before direction extraction")
4132
+ adv_kl_optimization = gr.Checkbox(value=_defaults["use_kl_optimization"], label="KL Optimization",
4133
+ info="Optimize projection strength to stay within KL budget")
4134
+ adv_float_layer_interp = gr.Checkbox(value=_defaults["float_layer_interpolation"], label="Float Layer Interpolation",
4135
+ info="Interpolate between adjacent layers' directions (Heretic)")
4136
+ adv_rdo_refinement = gr.Checkbox(value=_defaults["rdo_refinement"], label="RDO Refinement",
4137
+ info="Gradient-based direction refinement (Wollschlager et al.)")
4138
+ with gr.Row():
4139
+ adv_cot_aware = gr.Checkbox(value=_defaults["cot_aware"], label="CoT-Aware",
4140
+ info="Preserve chain-of-thought reasoning during abliteration")
4141
+ with gr.Row():
4142
+ adv_bayesian_trials = gr.Slider(
4143
+ 10, 200, value=_defaults["bayesian_trials"], step=10,
4144
+ label="Bayesian Trials",
4145
+ info="Optuna TPE optimization trials (Heretic/optimized methods)",
4146
+ )
4147
+ adv_n_sae_features = gr.Slider(
4148
+ 16, 256, value=_defaults["n_sae_features"], step=16,
4149
+ label="SAE Features",
4150
+ info="Number of SAE features to target (inverted/nuclear methods)",
4151
+ )
4152
 
4153
  # List of all advanced controls (order must match _on_method_change return)
4154
  _adv_controls = [
 
4165
  adv_project_embeddings, adv_activation_steering,
4166
  adv_expert_transplant, adv_wasserstein_optimal,
4167
  adv_spectral_cascade,
4168
+ adv_layer_selection, adv_winsorize,
4169
+ adv_winsorize_percentile,
4170
+ adv_kl_optimization, adv_kl_budget,
4171
+ adv_float_layer_interp, adv_rdo_refinement,
4172
+ adv_cot_aware,
4173
+ adv_bayesian_trials, adv_n_sae_features,
4174
  ]
4175
 
4176
  obliterate_btn = gr.Button(
 
4341
  mm_method = gr.Dropdown(
4342
  choices=["basic", "advanced", "aggressive",
4343
  "spectral_cascade", "informed", "surgical",
4344
+ "optimized", "inverted", "nuclear",
4345
+ "failspy", "gabliteration", "heretic", "rdo"],
4346
  value="surgical",
4347
  label="Abliteration Method",
4348
  )
 
4711
 
4712
  # ── Tab 6: Tourney ────────────────────────────────────────────────
4713
  with gr.Tab("Tourney", id="tourney"):
4714
+ gr.Markdown("""### Tourney Mode
4715
+ Pit abliteration methods against each other in elimination rounds.
4716
  The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
4717
 
4718
+ **Round 1 — Qualifiers:** Selected methods, reduced prompts. Bottom half eliminated.
4719
  **Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
4720
  **Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
4721
  """)
 
4727
  allow_custom_value=True,
4728
  )
4729
 
4730
+ from obliteratus.tourney import TOURNEY_METHODS as _ALL_TOURNEY_METHODS
4731
+ tourney_methods_cb = gr.CheckboxGroup(
4732
+ choices=_ALL_TOURNEY_METHODS,
4733
+ value=_ALL_TOURNEY_METHODS,
4734
+ label="Methods to Compete",
4735
+ info="Pick at least 3 methods. All selected by default.",
4736
+ )
4737
+
4738
  with gr.Accordion("Advanced Settings", open=False):
4739
  with gr.Row():
4740
  tourney_dataset_dd = gr.Dropdown(
 
4764
 
4765
  tourney_btn.click(
4766
  fn=run_tourney,
4767
+ inputs=[tourney_model_dd, tourney_methods_cb,
4768
  tourney_dataset_dd, tourney_quant_dd],
4769
  outputs=[tourney_status, tourney_bracket, tourney_log],
4770
+ ).then(
4771
+ fn=lambda: (
4772
+ gr.update(choices=_get_session_model_choices()),
4773
+ gr.update(choices=_get_session_model_choices()),
4774
+ _get_vram_html(),
4775
+ ),
4776
+ outputs=[session_model_dd, ab_session_model_dd, vram_display],
4777
  )
4778
 
4779
  # ── Tab 7: Export ─────────────────────────────────────────────────
docs/index.html CHANGED
@@ -1317,11 +1317,56 @@
1317
  <span class="method-label">AGGRESSIVE</span>
1318
  <span class="method-desc">Full Gabliteration + 3-pass refine</span>
1319
  </label>
 
 
 
 
 
1320
  <label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
1321
  <input type="radio" name="abl-method" value="informed">
1322
  <span class="method-label" style="color:var(--cyan)">INFORMED</span>
1323
  <span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
1324
  </label>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1325
  </div>
1326
  <div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
1327
  4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs
@@ -1941,10 +1986,19 @@ function startAbliterateFromLibrary(hfId) {
1941
 
1942
  let ablMethod = 'advanced';
1943
  const METHOD_INFO = {
1944
- basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction &bull; standard projection &bull; 1 pass &bull; 32 prompt pairs'},
1945
- advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs'},
1946
- aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; norm-preserving &bull; full orthogonalization &bull; 3 refinement passes &bull; 32 prompt pairs'},
1947
- informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> &bull; auto directions &bull; auto regularization &bull; Ouroboros-compensated &bull; cone/alignment/cluster/defense analysis'},
 
 
 
 
 
 
 
 
 
1948
  };
1949
 
1950
  function getAblCmd() {
 
1317
  <span class="method-label">AGGRESSIVE</span>
1318
  <span class="method-desc">Full Gabliteration + 3-pass refine</span>
1319
  </label>
1320
+ <label class="method-radio" id="method-spectral_cascade" onclick="setAblMethod('spectral_cascade')">
1321
+ <input type="radio" name="abl-method" value="spectral_cascade">
1322
+ <span class="method-label">SPECTRAL</span>
1323
+ <span class="method-desc">DCT frequency-selective decomposition</span>
1324
+ </label>
1325
  <label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
1326
  <input type="radio" name="abl-method" value="informed">
1327
  <span class="method-label" style="color:var(--cyan)">INFORMED</span>
1328
  <span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
1329
  </label>
1330
+ <label class="method-radio" id="method-surgical" onclick="setAblMethod('surgical')">
1331
+ <input type="radio" name="abl-method" value="surgical">
1332
+ <span class="method-label">SURGICAL</span>
1333
+ <span class="method-desc">Precision MoE-aware head surgery</span>
1334
+ </label>
1335
+ <label class="method-radio" id="method-optimized" onclick="setAblMethod('optimized')">
1336
+ <input type="radio" name="abl-method" value="optimized">
1337
+ <span class="method-label">OPTIMIZED</span>
1338
+ <span class="method-desc">Bayesian auto-tuned + KL-optimized</span>
1339
+ </label>
1340
+ <label class="method-radio" id="method-inverted" onclick="setAblMethod('inverted')">
1341
+ <input type="radio" name="abl-method" value="inverted">
1342
+ <span class="method-label">INVERTED</span>
1343
+ <span class="method-desc">Semantic refusal inversion</span>
1344
+ </label>
1345
+ <label class="method-radio" id="method-nuclear" onclick="setAblMethod('nuclear')">
1346
+ <input type="radio" name="abl-method" value="nuclear">
1347
+ <span class="method-label">NUCLEAR</span>
1348
+ <span class="method-desc">Maximum force combo</span>
1349
+ </label>
1350
+ <label class="method-radio" id="method-failspy" onclick="setAblMethod('failspy')">
1351
+ <input type="radio" name="abl-method" value="failspy">
1352
+ <span class="method-label">FAILSPY</span>
1353
+ <span class="method-desc">FailSpy/abliterator baseline</span>
1354
+ </label>
1355
+ <label class="method-radio" id="method-gabliteration" onclick="setAblMethod('gabliteration')">
1356
+ <input type="radio" name="abl-method" value="gabliteration">
1357
+ <span class="method-label">GABLIT</span>
1358
+ <span class="method-desc">Gabliteration (G&uuml;lmez 2026) baseline</span>
1359
+ </label>
1360
+ <label class="method-radio" id="method-heretic" onclick="setAblMethod('heretic')">
1361
+ <input type="radio" name="abl-method" value="heretic">
1362
+ <span class="method-label">HERETIC</span>
1363
+ <span class="method-desc">Heretic/p-e-w Bayesian baseline</span>
1364
+ </label>
1365
+ <label class="method-radio" id="method-rdo" onclick="setAblMethod('rdo')">
1366
+ <input type="radio" name="abl-method" value="rdo">
1367
+ <span class="method-label">RDO</span>
1368
+ <span class="method-desc">Refusal Direction Optimization baseline</span>
1369
+ </label>
1370
  </div>
1371
  <div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
1372
  4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes &bull; 32 prompt pairs
 
1986
 
1987
  let ablMethod = 'advanced';
1988
  const METHOD_INFO = {
1989
+ basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction &bull; standard projection &bull; 1 pass'},
1990
+ advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions &bull; norm-preserving &bull; 30% regularization &bull; 2 refinement passes'},
1991
+ aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; norm-preserving &bull; full orthogonalization &bull; 3 refinement passes'},
1992
+ spectral_cascade: {dirs:6, norm:true, reg:0.15, passes:1, desc:'6 whitened-SVD directions &bull; DCT frequency decomposition &bull; coherence-weighted &bull; adaptive bands'},
1993
+ informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> &bull; auto directions &bull; auto regularization &bull; Ouroboros-compensated &bull; cone/alignment/cluster analysis'},
1994
+ surgical: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; attention head surgery &bull; SAE features &bull; safety neuron masking &bull; per-expert MoE'},
1995
+ optimized: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions &bull; Bayesian auto-tuned &bull; CoT-aware &bull; KL co-optimized &bull; winsorized activations'},
1996
+ inverted: {dirs:4, norm:true, reg:0.1, passes:2, desc:'4 SVD directions &bull; semantic inversion (2x reflection) &bull; SAE feature targeting'},
1997
+ nuclear: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions &bull; all techniques combined &bull; maximum force &bull; head surgery + SAE + steering + transplant'},
1998
+ failspy: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means direction &bull; all layers except first &bull; FailSpy/abliterator reproduction'},
1999
+ gabliteration: {dirs:4, norm:false, reg:0.231, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; ridge reg (alpha=0.3) &bull; top-k layer selection &bull; G&uuml;lmez 2026'},
2000
+ heretic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 1 diff-means &bull; Bayesian (Optuna TPE) &bull; KL-optimized &bull; float layer interpolation &bull; p-e-w'},
2001
+ rdo: {dirs:4, norm:true, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> &bull; 4 SVD directions &bull; gradient-refined (RDO) &bull; linear probe classifier &bull; Wollschlager ICML 2025'},
2002
  };
2003
 
2004
  function getAblCmd() {
obliteratus/.DS_Store CHANGED
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
 
obliteratus/abliterate.py CHANGED
@@ -328,10 +328,11 @@ METHODS = {
328
  "description": (
329
  "Faithful reproduction of the FailSpy/abliterator library — the "
330
  "most widely used community tool. Single direction via difference-"
331
- "in-means (Arditi et al.), middle 60%% layer heuristic (layers "
332
- "20%%-80%%), no regularization, no norm preservation. Uses chat "
333
- "template for instruct models. This is what most HuggingFace "
334
- "abliterated models were created with."
 
335
  ),
336
  "n_directions": 1,
337
  "direction_method": "diff_means",
@@ -349,7 +350,7 @@ METHODS = {
349
  "attention_head_surgery": False,
350
  "use_sae_features": False,
351
  "invert_refusal": False,
352
- "layer_selection": "middle60",
353
  },
354
  "gabliteration": {
355
  "label": "Gabliteration (Gülmez 2026 Baseline)",
@@ -383,20 +384,26 @@ METHODS = {
383
  "layer_selection": "top_k",
384
  },
385
  "heretic": {
386
- "label": "Heretic / p-e-w (2025 Baseline)",
387
  "description": (
388
- "Faithful reproduction of Heretic's core algorithm (p-e-w, 2025). "
389
- "Bayesian optimization via Optuna TPE with parametric bell curve "
390
- "kernel. Uses 1-2 directions (float interpolation between top SVD "
391
- "components), component-specific scaling (attention vs MLP), "
392
- "activation winsorization (1%% tails). No whitened SVD, no SAE "
393
- "features, no jailbreak contrast. The key innovation is replacing "
 
 
 
 
394
  "manual hyperparameter selection with automated Pareto optimization "
395
- "over the (refusal_rate, KL_divergence) frontier."
396
  ),
397
- "n_directions": 2,
398
  "direction_method": "diff_means",
399
- "norm_preserve": True,
 
 
400
  "regularization": 0.0,
401
  "refinement_passes": 1,
402
  "project_biases": False,
@@ -404,14 +411,21 @@ METHODS = {
404
  "use_whitened_svd": False,
405
  "true_iterative_refinement": False,
406
  "use_jailbreak_contrast": False,
407
- "layer_adaptive_strength": True,
 
 
408
  "safety_neuron_masking": False,
409
  "per_expert_directions": False,
410
  "attention_head_surgery": False,
411
  "use_sae_features": False,
412
  "invert_refusal": False,
413
- "winsorize_activations": True,
414
- "winsorize_percentile": 0.01,
 
 
 
 
 
415
  "float_layer_interpolation": True,
416
  "cot_aware": False,
417
  "use_kl_optimization": True,
@@ -1689,7 +1703,8 @@ class AbliterationPipeline:
1689
  # Supports multiple algorithms for baseline comparison:
1690
  # knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
1691
  # knee: knee detection only (simplified OBLITERATUS)
1692
- # middle60: FailSpy/abliterator heuristic (layers 20%-80%)
 
1693
  # all: all layers (for Bayesian optimization / Heretic)
1694
  # top_k: top-k by refusal strength (Gabliteration-style)
1695
  sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
@@ -1702,8 +1717,14 @@ class AbliterationPipeline:
1702
 
1703
  selection_method = self.layer_selection
1704
 
1705
- if selection_method == "middle60":
1706
- # FailSpy/abliterator heuristic: middle 60% of layers
 
 
 
 
 
 
1707
  self._strong_layers = self._select_layers_middle60(n_layers)
1708
  self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
1709
 
@@ -2359,14 +2380,14 @@ class AbliterationPipeline:
2359
 
2360
  @staticmethod
2361
  def _select_layers_middle60(n_layers: int) -> list[int]:
2362
- """Select the middle 60% of layers (FailSpy/abliterator heuristic).
2363
 
2364
- The original abliterator library by FailSpy selects layers from index
2365
- n_layers*0.2 to n_layers*0.8, based on the empirical observation that
2366
- refusal concentrates in middle layers (not early embedding layers or
2367
- late unembedding layers).
2368
 
2369
- Reference: FailSpy/abliterator (2024), GitHub.
 
 
 
2370
  """
2371
  start = int(n_layers * 0.2)
2372
  end = int(n_layers * 0.8)
@@ -3751,7 +3772,9 @@ class AbliterationPipeline:
3751
 
3752
  # Respect configured layer_selection (matching _distill)
3753
  selection_method = self.layer_selection
3754
- if selection_method == "middle60":
 
 
3755
  self._strong_layers = self._select_layers_middle60(n_layers)
3756
  elif selection_method == "all":
3757
  self._strong_layers = self._select_layers_all(n_layers)
 
328
  "description": (
329
  "Faithful reproduction of the FailSpy/abliterator library — the "
330
  "most widely used community tool. Single direction via difference-"
331
+ "in-means (Arditi et al.), applied to all layers except layer 0 "
332
+ "(matching FailSpy source: range(1, n_layers)). Projects both "
333
+ "W_O (attention output) and MLP W_out. No regularization, no "
334
+ "norm preservation. Uses chat template for instruct models. "
335
+ "This is what most HuggingFace abliterated models were created with."
336
  ),
337
  "n_directions": 1,
338
  "direction_method": "diff_means",
 
350
  "attention_head_surgery": False,
351
  "use_sae_features": False,
352
  "invert_refusal": False,
353
+ "layer_selection": "all_except_first",
354
  },
355
  "gabliteration": {
356
  "label": "Gabliteration (Gülmez 2026 Baseline)",
 
384
  "layer_selection": "top_k",
385
  },
386
  "heretic": {
387
+ "label": "Heretic / p-e-w (2025-2026 Baseline)",
388
  "description": (
389
+ "Faithful reproduction of Heretic's core algorithm (p-e-w, 2025-2026). "
390
+ "Bayesian optimization via Optuna TPE with linear bell curve layer "
391
+ "weighting (NOT Gaussian linear interpolation between max_weight and "
392
+ "min_weight over min_weight_distance). One diff-of-means direction per "
393
+ "layer; direction_scope is sampled ('global' selects a float layer index "
394
+ "with lerp between adjacent layers' directions, 'per layer' uses each "
395
+ "layer's own direction). LoRA-based ablation (delta W = -lambda * v * "
396
+ "(v^T W)), never modifies base weights directly. Row normalization "
397
+ "defaults to NONE (PRE and FULL are options). Activation winsorization "
398
+ "via symmetric quantile clamping. The key innovation is replacing "
399
  "manual hyperparameter selection with automated Pareto optimization "
400
+ "over the (refusal_count, KL_divergence) frontier."
401
  ),
402
+ "n_directions": 1,
403
  "direction_method": "diff_means",
404
+ # Heretic default row_normalization is NONE; PRE/FULL are optional.
405
+ # OBLITERATUS norm_preserve=False matches Heretic's default behavior.
406
+ "norm_preserve": False,
407
  "regularization": 0.0,
408
  "refinement_passes": 1,
409
  "project_biases": False,
 
411
  "use_whitened_svd": False,
412
  "true_iterative_refinement": False,
413
  "use_jailbreak_contrast": False,
414
+ # Heretic uses its own bell curve weighting (linear, not Gaussian),
415
+ # not OBLITERATUS's norm-based layer_adaptive_strength.
416
+ "layer_adaptive_strength": False,
417
  "safety_neuron_masking": False,
418
  "per_expert_directions": False,
419
  "attention_head_surgery": False,
420
  "use_sae_features": False,
421
  "invert_refusal": False,
422
+ # Heretic default winsorization_quantile is 1.0 (disabled by default).
423
+ # For faithful baseline reproduction we match the source default.
424
+ "winsorize_activations": False,
425
+ "winsorize_percentile": 1.0,
426
+ # Heretic's float direction index interpolates between adjacent LAYERS'
427
+ # directions (not SVD components). OBLITERATUS float_layer_interpolation
428
+ # provides the bell-curve layer weighting aspect.
429
  "float_layer_interpolation": True,
430
  "cot_aware": False,
431
  "use_kl_optimization": True,
 
1703
  # Supports multiple algorithms for baseline comparison:
1704
  # knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
1705
  # knee: knee detection only (simplified OBLITERATUS)
1706
+ # middle60: legacy heuristic (layers 20%-80%)
1707
+ # all_except_first: FailSpy/abliterator (all layers except layer 0)
1708
  # all: all layers (for Bayesian optimization / Heretic)
1709
  # top_k: top-k by refusal strength (Gabliteration-style)
1710
  sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
 
1717
 
1718
  selection_method = self.layer_selection
1719
 
1720
+ if selection_method == "all_except_first":
1721
+ # FailSpy/abliterator: all layers except layer 0
1722
+ # Source: range(1, self.model.cfg.n_layers) in FailSpy/abliterator
1723
+ self._strong_layers = list(range(1, n_layers))
1724
+ self.log(f"Layer selection: all-except-first ({len(self._strong_layers)} layers)")
1725
+
1726
+ elif selection_method == "middle60":
1727
+ # Legacy heuristic: middle 60% of layers (layers 20%-80%)
1728
  self._strong_layers = self._select_layers_middle60(n_layers)
1729
  self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
1730
 
 
2380
 
2381
  @staticmethod
2382
  def _select_layers_middle60(n_layers: int) -> list[int]:
2383
+ """Select the middle 60% of layers (legacy heuristic).
2384
 
2385
+ Selects layers from index n_layers*0.2 to n_layers*0.8.
 
 
 
2386
 
2387
+ NOTE: This does NOT match FailSpy/abliterator's actual layer selection.
2388
+ FailSpy uses all layers except layer 0 (range(1, n_layers)). Use
2389
+ layer_selection="all_except_first" for faithful FailSpy reproduction.
2390
+ This method is retained for backward compatibility only.
2391
  """
2392
  start = int(n_layers * 0.2)
2393
  end = int(n_layers * 0.8)
 
3772
 
3773
  # Respect configured layer_selection (matching _distill)
3774
  selection_method = self.layer_selection
3775
+ if selection_method == "all_except_first":
3776
+ self._strong_layers = list(range(1, n_layers))
3777
+ elif selection_method == "middle60":
3778
  self._strong_layers = self._select_layers_middle60(n_layers)
3779
  elif selection_method == "all":
3780
  self._strong_layers = self._select_layers_all(n_layers)
obliteratus/analysis/conditional_abliteration.py CHANGED
@@ -269,7 +269,7 @@ class ConditionalAbliterator:
269
  ) -> torch.Tensor | None:
270
  """Extract category-specific refusal direction.
271
 
272
- Uses Fisher's Linear Discriminant (whitened difference-of-means)
273
  and then orthogonalizes against previously extracted directions
274
  to ensure category independence.
275
  """
 
269
  ) -> torch.Tensor | None:
270
  """Extract category-specific refusal direction.
271
 
272
+ Uses difference-of-means (category_mean - harmless_mean)
273
  and then orthogonalizes against previously extracted directions
274
  to ensure category independence.
275
  """
obliteratus/analysis/leace.py CHANGED
@@ -1,36 +1,34 @@
1
- """LEACE (LEAst-squares Concept Erasure) for refusal direction extraction.
2
-
3
- Closed-form optimal concept erasure that finds the minimum-rank edit to
4
- representations preventing any linear classifier from recovering the
5
- concept (harmful vs harmless). Unlike SVD, LEACE produces directions
6
- that are provably optimal for erasure with minimal distortion.
7
-
8
- The key insight: instead of finding directions of maximum variance (SVD)
9
- or maximum mean difference (diff-of-means), LEACE solves a constrained
10
- optimization problem: find the smallest perturbation to representations
11
- such that no linear probe can distinguish harmful from harmless.
12
-
13
- Mathematical formulation (rank-1 erasure):
14
- Given class-conditional means mu_0, mu_1 and within-class
15
- covariance S_w:
16
- 1. Compute mean difference: delta = mu_1 - mu_0
17
- 2. Compute within-class covariance: S_w = (S_0 + S_1) / 2
18
- 3. Solve generalized eigenvalue problem: S_b v = lambda S_w v
19
- where S_b = delta @ delta^T (between-class scatter)
20
- 4. The top generalized eigenvector is the LEACE direction
21
- 5. Erase by projecting out: x' = x - (x @ v) * v^T
22
-
23
- This is mathematically equivalent to Fisher's Linear Discriminant but
24
- applied as an erasure operation. The direction maximizes class
25
- separability relative to within-class spread, making it the optimal
26
- single direction to remove for concept erasure.
 
27
 
28
  Advantages over SVD:
29
- - Theoretically optimal: minimizes representation distortion for
30
- guaranteed erasure of linear concept information
31
- - Handles rogue dimensions naturally: within-class normalization
32
- prevents high-variance but non-discriminative dimensions from
33
- dominating
34
  - No hyperparameters beyond regularization epsilon
35
  - Closed-form solution (no iterative optimization)
36
 
@@ -39,6 +37,8 @@ References:
39
  closed form. NeurIPS 2023.
40
  - Ravfogel et al. (2022): RLACE: Adversarial concept erasure
41
  (iterative precursor to LEACE).
 
 
42
  """
43
 
44
  from __future__ import annotations
@@ -61,11 +61,11 @@ class LEACEResult:
61
 
62
 
63
  class LEACEExtractor:
64
- """Extract refusal directions via LEACE (closed-form concept erasure).
65
 
66
  Finds the direction that maximally separates harmful from harmless
67
- activations relative to within-class variance, then erases it.
68
- This is the provably optimal rank-1 concept erasure.
69
  """
70
 
71
  def __init__(
 
1
+ """LEACE-inspired direction extraction for refusal concept erasure.
2
+
3
+ This module implements Fisher's Linear Discriminant (FLD) direction for
4
+ concept erasure, inspired by LEACE (Belrose et al. 2023).
5
+
6
+ IMPORTANT: This is NOT a faithful implementation of LEACE as described in
7
+ the paper. Key difference:
8
+
9
+ - **True LEACE** uses the *total* covariance Sigma_X for whitening:
10
+ P* = I - W^{-1} P_{W Sigma_XZ} W where W = Sigma_X^{-1/2}
11
+ For binary concepts, this yields: v = Sigma_X^{-1} delta
12
+
13
+ - **This implementation** uses *within-class* covariance S_w:
14
+ v = S_w^{-1} delta
15
+ This is Fisher's Linear Discriminant direction, which maximizes
16
+ class separability relative to within-class spread.
17
+
18
+ For binary concepts, Sigma_X = S_w + p(1-p) * delta @ delta^T,
19
+ so the two directions differ when the between-class scatter is
20
+ non-negligible relative to within-class scatter. In high-dimensional
21
+ settings (d >> 1) with moderate class separation, the difference
22
+ is typically small but non-zero.
23
+
24
+ The FLD direction is still a strong choice for refusal erasure — it
25
+ handles rogue dimensions (high-variance but non-discriminative) better
26
+ than plain diff-of-means, and is a closed-form solution with no
27
+ iterative optimization.
28
 
29
  Advantages over SVD:
30
+ - Within-class normalization prevents high-variance but
31
+ non-discriminative dimensions from dominating
 
 
 
32
  - No hyperparameters beyond regularization epsilon
33
  - Closed-form solution (no iterative optimization)
34
 
 
37
  closed form. NeurIPS 2023.
38
  - Ravfogel et al. (2022): RLACE: Adversarial concept erasure
39
  (iterative precursor to LEACE).
40
+ - Fisher (1936): The use of multiple measurements in taxonomic
41
+ problems. Annals of Eugenics.
42
  """
43
 
44
  from __future__ import annotations
 
61
 
62
 
63
  class LEACEExtractor:
64
+ """Extract refusal directions via Fisher's Linear Discriminant.
65
 
66
  Finds the direction that maximally separates harmful from harmless
67
+ activations relative to within-class variance (v = S_w^{-1} delta).
68
+ See module docstring for how this relates to true LEACE.
69
  """
70
 
71
  def __init__(
obliteratus/analysis/riemannian_manifold.py CHANGED
@@ -428,8 +428,15 @@ class RiemannianManifoldAnalyzer:
428
  geodesic triangle with area A satisfies:
429
  sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
430
 
431
- We approximate geodesics with straight lines (valid for small K)
432
- and use angle excess to estimate K.
 
 
 
 
 
 
 
433
  """
434
  # Compute sides
435
  ab = (b - a).float()
@@ -613,8 +620,12 @@ class RiemannianManifoldAnalyzer:
613
  return torch.zeros_like(activation)
614
  v = v / norm
615
 
616
- # Correction magnitude: K * proj_magnitude^2 / 2
617
- correction_magnitude = curvature * proj_magnitude ** 2 / 2.0
 
 
 
 
618
 
619
  # Clamp to prevent instability
620
  correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
 
428
  geodesic triangle with area A satisfies:
429
  sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
430
 
431
+ IMPORTANT LIMITATION: This method uses Euclidean chords and angles
432
+ in ambient space, NOT geodesics on the manifold. In flat Euclidean
433
+ space, the angle sum of any triangle is exactly pi, so this method
434
+ will yield K ≈ 0 (up to numerical noise) regardless of the actual
435
+ manifold curvature. The results are only meaningful when the data
436
+ lies on an approximately low-dimensional curved submanifold and
437
+ triangles are sufficiently small relative to the curvature radius.
438
+ For rigorous curvature estimates, use methods based on local PCA
439
+ eigenvalue decay or Jacobian-based Riemannian metric computation.
440
  """
441
  # Compute sides
442
  ab = (b - a).float()
 
620
  return torch.zeros_like(activation)
621
  v = v / norm
622
 
623
+ # Second-order geodesic correction: K * proj_magnitude^2 / 6
624
+ # From Jacobi field estimate: deviation of geodesic from straight
625
+ # line over distance L with curvature K is ≈ K * L^2 / 6.
626
+ # Note: the residual bound in analyze() uses K * ||x||^2 / 8
627
+ # which is a looser upper bound including higher-order terms.
628
+ correction_magnitude = curvature * proj_magnitude ** 2 / 6.0
629
 
630
  # Clamp to prevent instability
631
  correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
obliteratus/analysis/sae_abliteration.py CHANGED
@@ -94,7 +94,7 @@ class SparseAutoencoder(nn.Module):
94
 
95
  @property
96
  def decoder_weight(self) -> torch.Tensor:
97
- """Return the decoder weight matrix (n_features x hidden_dim for untied, or encoder.weight.T)."""
98
  if self.tied_weights:
99
  return self.encoder.weight.T
100
  return self.decoder.weight
 
94
 
95
  @property
96
  def decoder_weight(self) -> torch.Tensor:
97
+ """Return the decoder weight matrix (hidden_dim x n_features for untied, or encoder.weight.T)."""
98
  if self.tied_weights:
99
  return self.encoder.weight.T
100
  return self.decoder.weight
obliteratus/analysis/spectral_certification.py CHANGED
@@ -175,10 +175,11 @@ class SpectralCertifier:
175
  harmful_centered = harmful_activations - harmful_mean
176
  harmless_centered = harmless_activations - harmless_mean
177
 
178
- # Pooled within-class covariance
 
179
  cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
180
  cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
181
- pooled_cov = (cov_h * n_h + cov_b * n_b) / max(n - 2, 1)
182
 
183
  # Step 2: Estimate noise variance (median eigenvalue method)
184
  noise_var = self._estimate_noise_variance(pooled_cov, n, d)
@@ -374,8 +375,13 @@ class SpectralCertifier:
374
  # Correct for MP bias: median of MP distribution
375
  gamma = d / max(n, 1)
376
  if gamma < 1:
377
- # MP median approximation (from Bai & Silverstein)
378
- mp_median_ratio = (1 + math.sqrt(gamma)) ** 2 * 0.5
 
 
 
 
 
379
  noise_var = median_eig / max(mp_median_ratio, 1e-10)
380
  else:
381
  noise_var = median_eig
 
175
  harmful_centered = harmful_activations - harmful_mean
176
  harmless_centered = harmless_activations - harmless_mean
177
 
178
+ # Pooled within-class covariance (standard formula: sum of scatter
179
+ # matrices divided by total degrees of freedom)
180
  cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
181
  cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
182
+ pooled_cov = (cov_h * (n_h - 1) + cov_b * (n_b - 1)) / max(n - 2, 1)
183
 
184
  # Step 2: Estimate noise variance (median eigenvalue method)
185
  noise_var = self._estimate_noise_variance(pooled_cov, n, d)
 
375
  # Correct for MP bias: median of MP distribution
376
  gamma = d / max(n, 1)
377
  if gamma < 1:
378
+ # MP median approximation. The exact MP median requires
379
+ # numerical inversion of the MP CDF; we use the empirical
380
+ # approximation median ≈ (1 - sqrt(gamma))^2 + gamma^(1/3)
381
+ # which is more accurate than the naive 0.5 * upper_edge
382
+ # for small gamma. Falls back to the simpler formula when
383
+ # gamma is very small.
384
+ mp_median_ratio = (1 - math.sqrt(gamma)) ** 2 + gamma ** (1.0 / 3.0)
385
  noise_var = median_eig / max(mp_median_ratio, 1e-10)
386
  else:
387
  noise_var = median_eig
obliteratus/analysis/wasserstein_optimal.py CHANGED
@@ -58,7 +58,7 @@ class WassersteinDirectionResult:
58
  direction: torch.Tensor # (hidden_dim,) optimal direction
59
  wasserstein_cost: float # W_2^2 cost for this direction
60
  mean_shift_component: float # (r^T m)^2 portion
61
- bures_component: float # r^T Sigma r portion (upper bound)
62
  refusal_projection: float # (r^T d)^2
63
  cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
64
 
 
58
  direction: torch.Tensor # (hidden_dim,) optimal direction
59
  wasserstein_cost: float # W_2^2 cost for this direction
60
  mean_shift_component: float # (r^T m)^2 portion
61
+ bures_component: float # r^T Sigma r portion (exact when r is eigenvector of Sigma, lower bound otherwise)
62
  refusal_projection: float # (r^T d)^2
63
  cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
64
 
obliteratus/evaluation/heretic_eval.py CHANGED
@@ -334,19 +334,20 @@ def _load_harmbench_classifier():
334
  bnb_4bit_quant_type="nf4",
335
  llm_int8_enable_fp32_cpu_offload=True,
336
  )
337
- model = AutoModelForCausalLM.from_pretrained(
338
- model_id,
339
- quantization_config=bnb_cfg,
340
- device_map="auto",
341
- torch_dtype=torch.float16,
342
- )
343
  except Exception:
344
  logger.info("4-bit quantization unavailable for classifier, loading in float16")
345
- model = AutoModelForCausalLM.from_pretrained(
346
- model_id,
347
- device_map="auto",
348
- torch_dtype=torch.float16,
349
- )
 
 
 
350
 
351
  model.eval()
352
  _HARMBENCH_CLASSIFIER = (model, tokenizer)
 
334
  bnb_4bit_quant_type="nf4",
335
  llm_int8_enable_fp32_cpu_offload=True,
336
  )
337
+ load_kwargs = dict(quantization_config=bnb_cfg, torch_dtype=torch.float16)
338
+ if dev.supports_device_map_auto():
339
+ load_kwargs["device_map"] = "auto"
340
+ model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
 
 
341
  except Exception:
342
  logger.info("4-bit quantization unavailable for classifier, loading in float16")
343
+ load_kwargs = dict(torch_dtype=torch.float16)
344
+ if dev.supports_device_map_auto():
345
+ load_kwargs["device_map"] = "auto"
346
+ model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
347
+
348
+ # On MPS/CPU: move model to best available device
349
+ if not dev.supports_device_map_auto():
350
+ model = model.to(dev.get_device())
351
 
352
  model.eval()
353
  _HARMBENCH_CLASSIFIER = (model, tokenizer)
obliteratus/tourney.py CHANGED
@@ -1097,6 +1097,11 @@ class TourneyRunner:
1097
  result.winner = winner
1098
  result.total_time_s = time.time() - t_start
1099
 
 
 
 
 
 
1100
  self.log("")
1101
  self.log("=" * 60)
1102
  if winner:
@@ -1401,6 +1406,11 @@ class TourneyRunner:
1401
  result.winner = winner
1402
  result.total_time_s = time.time() - t_start
1403
 
 
 
 
 
 
1404
  self.log("")
1405
  self.log("=" * 60)
1406
  if winner:
 
1097
  result.winner = winner
1098
  result.total_time_s = time.time() - t_start
1099
 
1100
+ # Clean up non-winner finalist dirs to free disk
1101
+ for c in ranked[1:]:
1102
+ if c.output_dir and Path(c.output_dir).exists():
1103
+ shutil.rmtree(c.output_dir, ignore_errors=True)
1104
+
1105
  self.log("")
1106
  self.log("=" * 60)
1107
  if winner:
 
1406
  result.winner = winner
1407
  result.total_time_s = time.time() - t_start
1408
 
1409
+ # Clean up non-winner finalist dirs to free disk
1410
+ for c in ranked[1:]:
1411
+ if c.output_dir and Path(c.output_dir).exists():
1412
+ shutil.rmtree(c.output_dir, ignore_errors=True)
1413
+
1414
  self.log("")
1415
  self.log("=" * 60)
1416
  if winner: