Spaces:
Running on Zero
Running on Zero
Upload 135 files
Browse files- app.py +194 -36
- docs/index.html +58 -4
- obliteratus/.DS_Store +0 -0
- obliteratus/abliterate.py +51 -28
- obliteratus/analysis/conditional_abliteration.py +1 -1
- obliteratus/analysis/leace.py +34 -34
- obliteratus/analysis/riemannian_manifold.py +15 -4
- obliteratus/analysis/sae_abliteration.py +1 -1
- obliteratus/analysis/spectral_certification.py +10 -4
- obliteratus/analysis/wasserstein_optimal.py +1 -1
- obliteratus/evaluation/heretic_eval.py +12 -11
- obliteratus/tourney.py +10 -0
app.py
CHANGED
|
@@ -98,6 +98,51 @@ def _is_quota_error(exc: BaseException) -> bool:
|
|
| 98 |
return True
|
| 99 |
return False
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
# ---------------------------------------------------------------------------
|
| 102 |
# Global state
|
| 103 |
# ---------------------------------------------------------------------------
|
|
@@ -164,7 +209,7 @@ def _recover_sessions_from_disk() -> None:
|
|
| 164 |
"""
|
| 165 |
global _last_obliterated_label, _obliterate_counter
|
| 166 |
found_any = False
|
| 167 |
-
for pattern in ("obliterated_*", "obliterated", "bench_*"):
|
| 168 |
for p in Path("/tmp").glob(pattern):
|
| 169 |
if not p.is_dir():
|
| 170 |
continue
|
|
@@ -291,6 +336,11 @@ METHODS = {
|
|
| 291 |
"optimized (bayesian auto-tuned)": "optimized",
|
| 292 |
"inverted (semantic refusal inversion)": "inverted",
|
| 293 |
"nuclear (maximum force combo)": "nuclear",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
}
|
| 295 |
|
| 296 |
# ── Community Hub push ────────────────────────────────────────────────
|
|
@@ -342,6 +392,17 @@ def _get_preset_defaults(method_display: str):
|
|
| 342 |
"spectral_cascade": cfg.get("spectral_cascade", False),
|
| 343 |
"spectral_bands": cfg.get("spectral_bands", 3),
|
| 344 |
"spectral_threshold": cfg.get("spectral_threshold", 0.05),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
}
|
| 346 |
|
| 347 |
def _on_method_change(method_display: str):
|
|
@@ -376,6 +437,16 @@ def _on_method_change(method_display: str):
|
|
| 376 |
d["expert_transplant"],
|
| 377 |
d["use_wasserstein_optimal"],
|
| 378 |
d["spectral_cascade"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
)
|
| 380 |
|
| 381 |
def _on_dataset_change(dataset_label: str):
|
|
@@ -1751,6 +1822,12 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 1751 |
adv_project_embeddings: bool, adv_activation_steering: bool,
|
| 1752 |
adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
|
| 1753 |
adv_spectral_cascade: bool,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1754 |
progress=gr.Progress()):
|
| 1755 |
"""Run the full obliteration pipeline, streaming log updates to the UI.
|
| 1756 |
|
|
@@ -1936,6 +2013,15 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 1936 |
spectral_bands=int(adv_spectral_bands),
|
| 1937 |
spectral_threshold=float(adv_spectral_threshold),
|
| 1938 |
verify_sample_size=int(adv_verify_sample_size),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1939 |
)
|
| 1940 |
pipeline_ref[0] = pipeline
|
| 1941 |
pipeline.run()
|
|
@@ -2107,10 +2193,9 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 2107 |
bnb_4bit_quant_type="nf4",
|
| 2108 |
llm_int8_enable_fp32_cpu_offload=True,
|
| 2109 |
)
|
| 2110 |
-
model_reloaded =
|
| 2111 |
save_dir,
|
| 2112 |
quantization_config=bnb_cfg,
|
| 2113 |
-
device_map="auto",
|
| 2114 |
trust_remote_code=True,
|
| 2115 |
)
|
| 2116 |
tokenizer_reloaded = AutoTokenizer.from_pretrained(
|
|
@@ -2148,9 +2233,8 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 2148 |
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
| 2149 |
try:
|
| 2150 |
offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
|
| 2151 |
-
model_reloaded =
|
| 2152 |
save_dir,
|
| 2153 |
-
device_map="auto",
|
| 2154 |
offload_folder=offload_dir,
|
| 2155 |
torch_dtype=torch.float16,
|
| 2156 |
trust_remote_code=True,
|
|
@@ -2311,8 +2395,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
|
| 2311 |
if checkpoint and Path(checkpoint).exists():
|
| 2312 |
try:
|
| 2313 |
is_preset = (_state.get("model_name") or "") in MODELS
|
| 2314 |
-
model =
|
| 2315 |
-
checkpoint,
|
| 2316 |
trust_remote_code=is_preset,
|
| 2317 |
)
|
| 2318 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
@@ -2502,8 +2586,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2502 |
if checkpoint and Path(checkpoint).exists():
|
| 2503 |
is_preset = (_state.get("model_name") or "") in MODELS
|
| 2504 |
try:
|
| 2505 |
-
model_loaded =
|
| 2506 |
-
checkpoint,
|
| 2507 |
trust_remote_code=is_preset,
|
| 2508 |
)
|
| 2509 |
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
|
@@ -2563,9 +2647,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2563 |
|
| 2564 |
is_preset = cfg["model_choice"] in MODELS
|
| 2565 |
try:
|
| 2566 |
-
model_loaded =
|
| 2567 |
checkpoint_dir,
|
| 2568 |
-
device_map="auto",
|
| 2569 |
torch_dtype=torch.float16,
|
| 2570 |
trust_remote_code=is_preset,
|
| 2571 |
)
|
|
@@ -2599,10 +2682,9 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2599 |
)
|
| 2600 |
yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
|
| 2601 |
progress(0.5, desc="Loading 4-bit...")
|
| 2602 |
-
model_loaded =
|
| 2603 |
checkpoint_dir,
|
| 2604 |
quantization_config=bnb_cfg,
|
| 2605 |
-
device_map="auto",
|
| 2606 |
trust_remote_code=is_preset,
|
| 2607 |
)
|
| 2608 |
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
|
@@ -2744,8 +2826,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
|
| 2744 |
if checkpoint and Path(checkpoint).exists():
|
| 2745 |
try:
|
| 2746 |
is_preset = (model_name or "") in MODELS
|
| 2747 |
-
abliterated_model =
|
| 2748 |
-
checkpoint,
|
| 2749 |
trust_remote_code=is_preset,
|
| 2750 |
)
|
| 2751 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
@@ -2870,10 +2952,9 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
|
| 2870 |
is_preset = model_name in MODELS
|
| 2871 |
original_response = ""
|
| 2872 |
try:
|
| 2873 |
-
|
| 2874 |
-
original_model = AMCLM.from_pretrained(
|
| 2875 |
model_id, torch_dtype=torch.float16,
|
| 2876 |
-
|
| 2877 |
low_cpu_mem_usage=True,
|
| 2878 |
token=os.environ.get("HF_TOKEN") or None,
|
| 2879 |
)
|
|
@@ -3184,8 +3265,8 @@ def _tourney_gpu_wrapper(fn, *args, **kwargs):
|
|
| 3184 |
return _tourney_gpu_run(fn, *args, **kwargs)
|
| 3185 |
|
| 3186 |
|
| 3187 |
-
def run_tourney(model_choice, dataset, quantization):
|
| 3188 |
-
"""Run an elimination tournament across
|
| 3189 |
|
| 3190 |
Each individual method is run inside its own ``@spaces.GPU`` allocation
|
| 3191 |
(up to 5 minutes per method) so the full tournament is not constrained
|
|
@@ -3198,6 +3279,10 @@ def run_tourney(model_choice, dataset, quantization):
|
|
| 3198 |
yield "**Error:** Select a model first.", "", ""
|
| 3199 |
return
|
| 3200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3201 |
from obliteratus.tourney import (
|
| 3202 |
TourneyRunner, render_bracket_html,
|
| 3203 |
_load_checkpoint, _checkpoint_matches,
|
|
@@ -3229,6 +3314,7 @@ def run_tourney(model_choice, dataset, quantization):
|
|
| 3229 |
hub_repo=None,
|
| 3230 |
dataset_key=dataset_key,
|
| 3231 |
quantization=quant,
|
|
|
|
| 3232 |
on_log=logger,
|
| 3233 |
resume=resume,
|
| 3234 |
)
|
|
@@ -3333,18 +3419,27 @@ def run_tourney(model_choice, dataset, quantization):
|
|
| 3333 |
_ts = datetime.now().strftime("%H:%M")
|
| 3334 |
_short = model_id.split("/")[-1] if "/" in model_id else model_id
|
| 3335 |
_label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3336 |
with _lock:
|
| 3337 |
-
_session_models[_label] =
|
| 3338 |
-
|
| 3339 |
-
|
| 3340 |
-
|
| 3341 |
-
|
| 3342 |
-
|
| 3343 |
-
|
| 3344 |
-
|
| 3345 |
-
|
| 3346 |
-
"tourney_metrics": winner.metrics,
|
| 3347 |
-
}
|
| 3348 |
yield (
|
| 3349 |
f"**Champion: `{winner.method}`** "
|
| 3350 |
f"(score: {winner.score:.4f})\n"
|
|
@@ -4013,6 +4108,47 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
|
| 4013 |
with gr.Row():
|
| 4014 |
adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
|
| 4015 |
info="DCT frequency decomposition for precision refusal targeting")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4016 |
|
| 4017 |
# List of all advanced controls (order must match _on_method_change return)
|
| 4018 |
_adv_controls = [
|
|
@@ -4029,6 +4165,12 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
|
| 4029 |
adv_project_embeddings, adv_activation_steering,
|
| 4030 |
adv_expert_transplant, adv_wasserstein_optimal,
|
| 4031 |
adv_spectral_cascade,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4032 |
]
|
| 4033 |
|
| 4034 |
obliterate_btn = gr.Button(
|
|
@@ -4199,7 +4341,8 @@ result = client.predict(
|
|
| 4199 |
mm_method = gr.Dropdown(
|
| 4200 |
choices=["basic", "advanced", "aggressive",
|
| 4201 |
"spectral_cascade", "informed", "surgical",
|
| 4202 |
-
"optimized", "inverted", "nuclear"
|
|
|
|
| 4203 |
value="surgical",
|
| 4204 |
label="Abliteration Method",
|
| 4205 |
)
|
|
@@ -4568,11 +4711,11 @@ tradeoff point where refusal is minimized with minimal capability damage.
|
|
| 4568 |
|
| 4569 |
# ── Tab 6: Tourney ────────────────────────────────────────────────
|
| 4570 |
with gr.Tab("Tourney", id="tourney"):
|
| 4571 |
-
gr.Markdown("""###
|
| 4572 |
-
Pit
|
| 4573 |
The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
|
| 4574 |
|
| 4575 |
-
**Round 1 — Qualifiers:**
|
| 4576 |
**Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
|
| 4577 |
**Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
|
| 4578 |
""")
|
|
@@ -4584,6 +4727,14 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
|
|
| 4584 |
allow_custom_value=True,
|
| 4585 |
)
|
| 4586 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4587 |
with gr.Accordion("Advanced Settings", open=False):
|
| 4588 |
with gr.Row():
|
| 4589 |
tourney_dataset_dd = gr.Dropdown(
|
|
@@ -4613,9 +4764,16 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
|
|
| 4613 |
|
| 4614 |
tourney_btn.click(
|
| 4615 |
fn=run_tourney,
|
| 4616 |
-
inputs=[tourney_model_dd,
|
| 4617 |
tourney_dataset_dd, tourney_quant_dd],
|
| 4618 |
outputs=[tourney_status, tourney_bracket, tourney_log],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4619 |
)
|
| 4620 |
|
| 4621 |
# ── Tab 7: Export ─────────────────────────────────────────────────
|
|
|
|
| 98 |
return True
|
| 99 |
return False
|
| 100 |
|
| 101 |
+
|
| 102 |
+
def _load_model_to_device(
|
| 103 |
+
pretrained_path: str,
|
| 104 |
+
*,
|
| 105 |
+
torch_dtype=None,
|
| 106 |
+
trust_remote_code: bool = False,
|
| 107 |
+
quantization_config=None,
|
| 108 |
+
offload_folder: str | None = None,
|
| 109 |
+
low_cpu_mem_usage: bool = False,
|
| 110 |
+
token: str | None = None,
|
| 111 |
+
) -> AutoModelForCausalLM:
|
| 112 |
+
"""Load a causal LM onto the best available device, MPS-safe.
|
| 113 |
+
|
| 114 |
+
Accelerate's ``device_map="auto"`` is not supported on MPS — models
|
| 115 |
+
silently land on CPU. This helper skips ``device_map`` on non-CUDA
|
| 116 |
+
backends and explicitly moves the model to the best device after loading.
|
| 117 |
+
On CUDA the behaviour is identical to ``device_map="auto"``.
|
| 118 |
+
"""
|
| 119 |
+
kwargs: dict = {}
|
| 120 |
+
if torch_dtype is not None:
|
| 121 |
+
kwargs["torch_dtype"] = torch_dtype
|
| 122 |
+
if trust_remote_code:
|
| 123 |
+
kwargs["trust_remote_code"] = True
|
| 124 |
+
if quantization_config is not None:
|
| 125 |
+
kwargs["quantization_config"] = quantization_config
|
| 126 |
+
if offload_folder is not None:
|
| 127 |
+
kwargs["offload_folder"] = offload_folder
|
| 128 |
+
if low_cpu_mem_usage:
|
| 129 |
+
kwargs["low_cpu_mem_usage"] = True
|
| 130 |
+
if token is not None:
|
| 131 |
+
kwargs["token"] = token
|
| 132 |
+
|
| 133 |
+
if dev.supports_device_map_auto():
|
| 134 |
+
kwargs["device_map"] = "auto"
|
| 135 |
+
|
| 136 |
+
model = AutoModelForCausalLM.from_pretrained(pretrained_path, **kwargs)
|
| 137 |
+
|
| 138 |
+
# On MPS / CPU: model loaded without device_map, move to best device
|
| 139 |
+
if not dev.supports_device_map_auto():
|
| 140 |
+
target = dev.get_device()
|
| 141 |
+
model = model.to(target)
|
| 142 |
+
|
| 143 |
+
return model
|
| 144 |
+
|
| 145 |
+
|
| 146 |
# ---------------------------------------------------------------------------
|
| 147 |
# Global state
|
| 148 |
# ---------------------------------------------------------------------------
|
|
|
|
| 209 |
"""
|
| 210 |
global _last_obliterated_label, _obliterate_counter
|
| 211 |
found_any = False
|
| 212 |
+
for pattern in ("obliterated_*", "obliterated", "bench_*", "obliteratus_tourney/r*"):
|
| 213 |
for p in Path("/tmp").glob(pattern):
|
| 214 |
if not p.is_dir():
|
| 215 |
continue
|
|
|
|
| 336 |
"optimized (bayesian auto-tuned)": "optimized",
|
| 337 |
"inverted (semantic refusal inversion)": "inverted",
|
| 338 |
"nuclear (maximum force combo)": "nuclear",
|
| 339 |
+
# Baseline reproductions for benchmarking
|
| 340 |
+
"failspy (FailSpy/abliterator baseline)": "failspy",
|
| 341 |
+
"gabliteration (Gülmez 2026 baseline)": "gabliteration",
|
| 342 |
+
"heretic (p-e-w 2025-2026 baseline)": "heretic",
|
| 343 |
+
"rdo (Wollschlager ICML 2025 baseline)": "rdo",
|
| 344 |
}
|
| 345 |
|
| 346 |
# ── Community Hub push ────────────────────────────────────────────────
|
|
|
|
| 392 |
"spectral_cascade": cfg.get("spectral_cascade", False),
|
| 393 |
"spectral_bands": cfg.get("spectral_bands", 3),
|
| 394 |
"spectral_threshold": cfg.get("spectral_threshold", 0.05),
|
| 395 |
+
# Baseline-specific parameters
|
| 396 |
+
"layer_selection": cfg.get("layer_selection", "all"),
|
| 397 |
+
"winsorize_activations": cfg.get("winsorize_activations", False),
|
| 398 |
+
"winsorize_percentile": cfg.get("winsorize_percentile", 1.0),
|
| 399 |
+
"use_kl_optimization": cfg.get("use_kl_optimization", False),
|
| 400 |
+
"kl_budget": cfg.get("kl_budget", 0.5),
|
| 401 |
+
"float_layer_interpolation": cfg.get("float_layer_interpolation", False),
|
| 402 |
+
"rdo_refinement": cfg.get("rdo_refinement", False),
|
| 403 |
+
"cot_aware": cfg.get("cot_aware", False),
|
| 404 |
+
"bayesian_trials": cfg.get("bayesian_trials", 50),
|
| 405 |
+
"n_sae_features": cfg.get("n_sae_features", 64),
|
| 406 |
}
|
| 407 |
|
| 408 |
def _on_method_change(method_display: str):
|
|
|
|
| 437 |
d["expert_transplant"],
|
| 438 |
d["use_wasserstein_optimal"],
|
| 439 |
d["spectral_cascade"],
|
| 440 |
+
d["layer_selection"],
|
| 441 |
+
d["winsorize_activations"],
|
| 442 |
+
d["winsorize_percentile"],
|
| 443 |
+
d["use_kl_optimization"],
|
| 444 |
+
d["kl_budget"],
|
| 445 |
+
d["float_layer_interpolation"],
|
| 446 |
+
d["rdo_refinement"],
|
| 447 |
+
d["cot_aware"],
|
| 448 |
+
d["bayesian_trials"],
|
| 449 |
+
d["n_sae_features"],
|
| 450 |
)
|
| 451 |
|
| 452 |
def _on_dataset_change(dataset_label: str):
|
|
|
|
| 1822 |
adv_project_embeddings: bool, adv_activation_steering: bool,
|
| 1823 |
adv_expert_transplant: bool, adv_wasserstein_optimal: bool,
|
| 1824 |
adv_spectral_cascade: bool,
|
| 1825 |
+
adv_layer_selection: str, adv_winsorize: bool,
|
| 1826 |
+
adv_winsorize_percentile: float,
|
| 1827 |
+
adv_kl_optimization: bool, adv_kl_budget: float,
|
| 1828 |
+
adv_float_layer_interp: bool, adv_rdo_refinement: bool,
|
| 1829 |
+
adv_cot_aware: bool,
|
| 1830 |
+
adv_bayesian_trials: int, adv_n_sae_features: int,
|
| 1831 |
progress=gr.Progress()):
|
| 1832 |
"""Run the full obliteration pipeline, streaming log updates to the UI.
|
| 1833 |
|
|
|
|
| 2013 |
spectral_bands=int(adv_spectral_bands),
|
| 2014 |
spectral_threshold=float(adv_spectral_threshold),
|
| 2015 |
verify_sample_size=int(adv_verify_sample_size),
|
| 2016 |
+
layer_selection=adv_layer_selection,
|
| 2017 |
+
winsorize_activations=adv_winsorize,
|
| 2018 |
+
winsorize_percentile=float(adv_winsorize_percentile),
|
| 2019 |
+
use_kl_optimization=adv_kl_optimization,
|
| 2020 |
+
kl_budget=float(adv_kl_budget),
|
| 2021 |
+
float_layer_interpolation=adv_float_layer_interp,
|
| 2022 |
+
rdo_refinement=adv_rdo_refinement,
|
| 2023 |
+
cot_aware=adv_cot_aware,
|
| 2024 |
+
n_sae_features=int(adv_n_sae_features),
|
| 2025 |
)
|
| 2026 |
pipeline_ref[0] = pipeline
|
| 2027 |
pipeline.run()
|
|
|
|
| 2193 |
bnb_4bit_quant_type="nf4",
|
| 2194 |
llm_int8_enable_fp32_cpu_offload=True,
|
| 2195 |
)
|
| 2196 |
+
model_reloaded = _load_model_to_device(
|
| 2197 |
save_dir,
|
| 2198 |
quantization_config=bnb_cfg,
|
|
|
|
| 2199 |
trust_remote_code=True,
|
| 2200 |
)
|
| 2201 |
tokenizer_reloaded = AutoTokenizer.from_pretrained(
|
|
|
|
| 2233 |
yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update(), gr.update()
|
| 2234 |
try:
|
| 2235 |
offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
|
| 2236 |
+
model_reloaded = _load_model_to_device(
|
| 2237 |
save_dir,
|
|
|
|
| 2238 |
offload_folder=offload_dir,
|
| 2239 |
torch_dtype=torch.float16,
|
| 2240 |
trust_remote_code=True,
|
|
|
|
| 2395 |
if checkpoint and Path(checkpoint).exists():
|
| 2396 |
try:
|
| 2397 |
is_preset = (_state.get("model_name") or "") in MODELS
|
| 2398 |
+
model = _load_model_to_device(
|
| 2399 |
+
checkpoint, torch_dtype=torch.float16,
|
| 2400 |
trust_remote_code=is_preset,
|
| 2401 |
)
|
| 2402 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
| 2586 |
if checkpoint and Path(checkpoint).exists():
|
| 2587 |
is_preset = (_state.get("model_name") or "") in MODELS
|
| 2588 |
try:
|
| 2589 |
+
model_loaded = _load_model_to_device(
|
| 2590 |
+
checkpoint, torch_dtype=torch.float16,
|
| 2591 |
trust_remote_code=is_preset,
|
| 2592 |
)
|
| 2593 |
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
|
|
|
| 2647 |
|
| 2648 |
is_preset = cfg["model_choice"] in MODELS
|
| 2649 |
try:
|
| 2650 |
+
model_loaded = _load_model_to_device(
|
| 2651 |
checkpoint_dir,
|
|
|
|
| 2652 |
torch_dtype=torch.float16,
|
| 2653 |
trust_remote_code=is_preset,
|
| 2654 |
)
|
|
|
|
| 2682 |
)
|
| 2683 |
yield f"**Loading {choice}** in 4-bit (model too large for fp16)...", ""
|
| 2684 |
progress(0.5, desc="Loading 4-bit...")
|
| 2685 |
+
model_loaded = _load_model_to_device(
|
| 2686 |
checkpoint_dir,
|
| 2687 |
quantization_config=bnb_cfg,
|
|
|
|
| 2688 |
trust_remote_code=is_preset,
|
| 2689 |
)
|
| 2690 |
tokenizer_loaded = AutoTokenizer.from_pretrained(
|
|
|
|
| 2826 |
if checkpoint and Path(checkpoint).exists():
|
| 2827 |
try:
|
| 2828 |
is_preset = (model_name or "") in MODELS
|
| 2829 |
+
abliterated_model = _load_model_to_device(
|
| 2830 |
+
checkpoint, torch_dtype=torch.float16,
|
| 2831 |
trust_remote_code=is_preset,
|
| 2832 |
)
|
| 2833 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
| 2952 |
is_preset = model_name in MODELS
|
| 2953 |
original_response = ""
|
| 2954 |
try:
|
| 2955 |
+
original_model = _load_model_to_device(
|
|
|
|
| 2956 |
model_id, torch_dtype=torch.float16,
|
| 2957 |
+
trust_remote_code=is_preset,
|
| 2958 |
low_cpu_mem_usage=True,
|
| 2959 |
token=os.environ.get("HF_TOKEN") or None,
|
| 2960 |
)
|
|
|
|
| 3265 |
return _tourney_gpu_run(fn, *args, **kwargs)
|
| 3266 |
|
| 3267 |
|
| 3268 |
+
def run_tourney(model_choice, selected_methods, dataset, quantization):
|
| 3269 |
+
"""Run an elimination tournament across selected abliteration methods.
|
| 3270 |
|
| 3271 |
Each individual method is run inside its own ``@spaces.GPU`` allocation
|
| 3272 |
(up to 5 minutes per method) so the full tournament is not constrained
|
|
|
|
| 3279 |
yield "**Error:** Select a model first.", "", ""
|
| 3280 |
return
|
| 3281 |
|
| 3282 |
+
if not selected_methods or len(selected_methods) < 3:
|
| 3283 |
+
yield "**Error:** Select at least 3 methods for a tournament.", "", ""
|
| 3284 |
+
return
|
| 3285 |
+
|
| 3286 |
from obliteratus.tourney import (
|
| 3287 |
TourneyRunner, render_bracket_html,
|
| 3288 |
_load_checkpoint, _checkpoint_matches,
|
|
|
|
| 3314 |
hub_repo=None,
|
| 3315 |
dataset_key=dataset_key,
|
| 3316 |
quantization=quant,
|
| 3317 |
+
methods=list(selected_methods),
|
| 3318 |
on_log=logger,
|
| 3319 |
resume=resume,
|
| 3320 |
)
|
|
|
|
| 3419 |
_ts = datetime.now().strftime("%H:%M")
|
| 3420 |
_short = model_id.split("/")[-1] if "/" in model_id else model_id
|
| 3421 |
_label = f"tourney winner ({winner.method}) on {_short} ({_ts})"
|
| 3422 |
+
_winner_meta = {
|
| 3423 |
+
"model_id": model_id,
|
| 3424 |
+
"model_choice": model_choice,
|
| 3425 |
+
"method": winner.method,
|
| 3426 |
+
"dataset_key": dataset_key,
|
| 3427 |
+
"prompt_volume": 0,
|
| 3428 |
+
"output_dir": winner.output_dir,
|
| 3429 |
+
"source": "tourney",
|
| 3430 |
+
"tourney_score": winner.score,
|
| 3431 |
+
"tourney_metrics": winner.metrics,
|
| 3432 |
+
}
|
| 3433 |
with _lock:
|
| 3434 |
+
_session_models[_label] = _winner_meta
|
| 3435 |
+
# Persist so the winner survives ZeroGPU process restarts
|
| 3436 |
+
_persist_session_meta(winner.output_dir, _label, {
|
| 3437 |
+
"model_id": model_id,
|
| 3438 |
+
"model_choice": model_choice,
|
| 3439 |
+
"method": winner.method,
|
| 3440 |
+
"dataset_key": dataset_key,
|
| 3441 |
+
"source": "tourney",
|
| 3442 |
+
})
|
|
|
|
|
|
|
| 3443 |
yield (
|
| 3444 |
f"**Champion: `{winner.method}`** "
|
| 3445 |
f"(score: {winner.score:.4f})\n"
|
|
|
|
| 4108 |
with gr.Row():
|
| 4109 |
adv_spectral_cascade = gr.Checkbox(value=_defaults["spectral_cascade"], label="Spectral Cascade",
|
| 4110 |
info="DCT frequency decomposition for precision refusal targeting")
|
| 4111 |
+
gr.Markdown("**Layer Selection & Baseline Options**")
|
| 4112 |
+
with gr.Row():
|
| 4113 |
+
adv_layer_selection = gr.Dropdown(
|
| 4114 |
+
choices=["knee_cosmic", "all", "all_except_first", "middle60", "top_k", "knee"],
|
| 4115 |
+
value=_defaults["layer_selection"],
|
| 4116 |
+
label="Layer Selection",
|
| 4117 |
+
info="Which layers to project refusal directions from",
|
| 4118 |
+
)
|
| 4119 |
+
adv_winsorize_percentile = gr.Slider(
|
| 4120 |
+
0.0, 1.0, value=_defaults["winsorize_percentile"], step=0.01,
|
| 4121 |
+
label="Winsorize Percentile",
|
| 4122 |
+
info="Activation clamping quantile (1.0 = disabled, 0.01 = 99th pctile)",
|
| 4123 |
+
)
|
| 4124 |
+
adv_kl_budget = gr.Slider(
|
| 4125 |
+
0.0, 2.0, value=_defaults["kl_budget"], step=0.1,
|
| 4126 |
+
label="KL Budget",
|
| 4127 |
+
info="Max KL divergence from base model (Heretic/optimized)",
|
| 4128 |
+
)
|
| 4129 |
+
with gr.Row():
|
| 4130 |
+
adv_winsorize = gr.Checkbox(value=_defaults["winsorize_activations"], label="Winsorize Activations",
|
| 4131 |
+
info="Clamp outlier activations before direction extraction")
|
| 4132 |
+
adv_kl_optimization = gr.Checkbox(value=_defaults["use_kl_optimization"], label="KL Optimization",
|
| 4133 |
+
info="Optimize projection strength to stay within KL budget")
|
| 4134 |
+
adv_float_layer_interp = gr.Checkbox(value=_defaults["float_layer_interpolation"], label="Float Layer Interpolation",
|
| 4135 |
+
info="Interpolate between adjacent layers' directions (Heretic)")
|
| 4136 |
+
adv_rdo_refinement = gr.Checkbox(value=_defaults["rdo_refinement"], label="RDO Refinement",
|
| 4137 |
+
info="Gradient-based direction refinement (Wollschlager et al.)")
|
| 4138 |
+
with gr.Row():
|
| 4139 |
+
adv_cot_aware = gr.Checkbox(value=_defaults["cot_aware"], label="CoT-Aware",
|
| 4140 |
+
info="Preserve chain-of-thought reasoning during abliteration")
|
| 4141 |
+
with gr.Row():
|
| 4142 |
+
adv_bayesian_trials = gr.Slider(
|
| 4143 |
+
10, 200, value=_defaults["bayesian_trials"], step=10,
|
| 4144 |
+
label="Bayesian Trials",
|
| 4145 |
+
info="Optuna TPE optimization trials (Heretic/optimized methods)",
|
| 4146 |
+
)
|
| 4147 |
+
adv_n_sae_features = gr.Slider(
|
| 4148 |
+
16, 256, value=_defaults["n_sae_features"], step=16,
|
| 4149 |
+
label="SAE Features",
|
| 4150 |
+
info="Number of SAE features to target (inverted/nuclear methods)",
|
| 4151 |
+
)
|
| 4152 |
|
| 4153 |
# List of all advanced controls (order must match _on_method_change return)
|
| 4154 |
_adv_controls = [
|
|
|
|
| 4165 |
adv_project_embeddings, adv_activation_steering,
|
| 4166 |
adv_expert_transplant, adv_wasserstein_optimal,
|
| 4167 |
adv_spectral_cascade,
|
| 4168 |
+
adv_layer_selection, adv_winsorize,
|
| 4169 |
+
adv_winsorize_percentile,
|
| 4170 |
+
adv_kl_optimization, adv_kl_budget,
|
| 4171 |
+
adv_float_layer_interp, adv_rdo_refinement,
|
| 4172 |
+
adv_cot_aware,
|
| 4173 |
+
adv_bayesian_trials, adv_n_sae_features,
|
| 4174 |
]
|
| 4175 |
|
| 4176 |
obliterate_btn = gr.Button(
|
|
|
|
| 4341 |
mm_method = gr.Dropdown(
|
| 4342 |
choices=["basic", "advanced", "aggressive",
|
| 4343 |
"spectral_cascade", "informed", "surgical",
|
| 4344 |
+
"optimized", "inverted", "nuclear",
|
| 4345 |
+
"failspy", "gabliteration", "heretic", "rdo"],
|
| 4346 |
value="surgical",
|
| 4347 |
label="Abliteration Method",
|
| 4348 |
)
|
|
|
|
| 4711 |
|
| 4712 |
# ── Tab 6: Tourney ────────────────────────────────────────────────
|
| 4713 |
with gr.Tab("Tourney", id="tourney"):
|
| 4714 |
+
gr.Markdown("""### Tourney Mode
|
| 4715 |
+
Pit abliteration methods against each other in elimination rounds.
|
| 4716 |
The winner is saved locally — push it to HuggingFace Hub from the **Push to Hub** tab.
|
| 4717 |
|
| 4718 |
+
**Round 1 — Qualifiers:** Selected methods, reduced prompts. Bottom half eliminated.
|
| 4719 |
**Round 2 — Semifinals:** Survivors, full prompts. Bottom half eliminated.
|
| 4720 |
**Round 3 — Finals:** Top contenders, maximum prompts. Champion crowned.
|
| 4721 |
""")
|
|
|
|
| 4727 |
allow_custom_value=True,
|
| 4728 |
)
|
| 4729 |
|
| 4730 |
+
from obliteratus.tourney import TOURNEY_METHODS as _ALL_TOURNEY_METHODS
|
| 4731 |
+
tourney_methods_cb = gr.CheckboxGroup(
|
| 4732 |
+
choices=_ALL_TOURNEY_METHODS,
|
| 4733 |
+
value=_ALL_TOURNEY_METHODS,
|
| 4734 |
+
label="Methods to Compete",
|
| 4735 |
+
info="Pick at least 3 methods. All selected by default.",
|
| 4736 |
+
)
|
| 4737 |
+
|
| 4738 |
with gr.Accordion("Advanced Settings", open=False):
|
| 4739 |
with gr.Row():
|
| 4740 |
tourney_dataset_dd = gr.Dropdown(
|
|
|
|
| 4764 |
|
| 4765 |
tourney_btn.click(
|
| 4766 |
fn=run_tourney,
|
| 4767 |
+
inputs=[tourney_model_dd, tourney_methods_cb,
|
| 4768 |
tourney_dataset_dd, tourney_quant_dd],
|
| 4769 |
outputs=[tourney_status, tourney_bracket, tourney_log],
|
| 4770 |
+
).then(
|
| 4771 |
+
fn=lambda: (
|
| 4772 |
+
gr.update(choices=_get_session_model_choices()),
|
| 4773 |
+
gr.update(choices=_get_session_model_choices()),
|
| 4774 |
+
_get_vram_html(),
|
| 4775 |
+
),
|
| 4776 |
+
outputs=[session_model_dd, ab_session_model_dd, vram_display],
|
| 4777 |
)
|
| 4778 |
|
| 4779 |
# ── Tab 7: Export ─────────────────────────────────────────────────
|
docs/index.html
CHANGED
|
@@ -1317,11 +1317,56 @@
|
|
| 1317 |
<span class="method-label">AGGRESSIVE</span>
|
| 1318 |
<span class="method-desc">Full Gabliteration + 3-pass refine</span>
|
| 1319 |
</label>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1320 |
<label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
|
| 1321 |
<input type="radio" name="abl-method" value="informed">
|
| 1322 |
<span class="method-label" style="color:var(--cyan)">INFORMED</span>
|
| 1323 |
<span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
|
| 1324 |
</label>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1325 |
</div>
|
| 1326 |
<div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
|
| 1327 |
4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes • 32 prompt pairs
|
|
@@ -1941,10 +1986,19 @@ function startAbliterateFromLibrary(hfId) {
|
|
| 1941 |
|
| 1942 |
let ablMethod = 'advanced';
|
| 1943 |
const METHOD_INFO = {
|
| 1944 |
-
basic:
|
| 1945 |
-
advanced:
|
| 1946 |
-
aggressive:
|
| 1947 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1948 |
};
|
| 1949 |
|
| 1950 |
function getAblCmd() {
|
|
|
|
| 1317 |
<span class="method-label">AGGRESSIVE</span>
|
| 1318 |
<span class="method-desc">Full Gabliteration + 3-pass refine</span>
|
| 1319 |
</label>
|
| 1320 |
+
<label class="method-radio" id="method-spectral_cascade" onclick="setAblMethod('spectral_cascade')">
|
| 1321 |
+
<input type="radio" name="abl-method" value="spectral_cascade">
|
| 1322 |
+
<span class="method-label">SPECTRAL</span>
|
| 1323 |
+
<span class="method-desc">DCT frequency-selective decomposition</span>
|
| 1324 |
+
</label>
|
| 1325 |
<label class="method-radio" id="method-informed" onclick="setAblMethod('informed')" style="border-color:var(--cyan)">
|
| 1326 |
<input type="radio" name="abl-method" value="informed">
|
| 1327 |
<span class="method-label" style="color:var(--cyan)">INFORMED</span>
|
| 1328 |
<span class="method-desc">Analysis-guided auto-config + Ouroboros</span>
|
| 1329 |
</label>
|
| 1330 |
+
<label class="method-radio" id="method-surgical" onclick="setAblMethod('surgical')">
|
| 1331 |
+
<input type="radio" name="abl-method" value="surgical">
|
| 1332 |
+
<span class="method-label">SURGICAL</span>
|
| 1333 |
+
<span class="method-desc">Precision MoE-aware head surgery</span>
|
| 1334 |
+
</label>
|
| 1335 |
+
<label class="method-radio" id="method-optimized" onclick="setAblMethod('optimized')">
|
| 1336 |
+
<input type="radio" name="abl-method" value="optimized">
|
| 1337 |
+
<span class="method-label">OPTIMIZED</span>
|
| 1338 |
+
<span class="method-desc">Bayesian auto-tuned + KL-optimized</span>
|
| 1339 |
+
</label>
|
| 1340 |
+
<label class="method-radio" id="method-inverted" onclick="setAblMethod('inverted')">
|
| 1341 |
+
<input type="radio" name="abl-method" value="inverted">
|
| 1342 |
+
<span class="method-label">INVERTED</span>
|
| 1343 |
+
<span class="method-desc">Semantic refusal inversion</span>
|
| 1344 |
+
</label>
|
| 1345 |
+
<label class="method-radio" id="method-nuclear" onclick="setAblMethod('nuclear')">
|
| 1346 |
+
<input type="radio" name="abl-method" value="nuclear">
|
| 1347 |
+
<span class="method-label">NUCLEAR</span>
|
| 1348 |
+
<span class="method-desc">Maximum force combo</span>
|
| 1349 |
+
</label>
|
| 1350 |
+
<label class="method-radio" id="method-failspy" onclick="setAblMethod('failspy')">
|
| 1351 |
+
<input type="radio" name="abl-method" value="failspy">
|
| 1352 |
+
<span class="method-label">FAILSPY</span>
|
| 1353 |
+
<span class="method-desc">FailSpy/abliterator baseline</span>
|
| 1354 |
+
</label>
|
| 1355 |
+
<label class="method-radio" id="method-gabliteration" onclick="setAblMethod('gabliteration')">
|
| 1356 |
+
<input type="radio" name="abl-method" value="gabliteration">
|
| 1357 |
+
<span class="method-label">GABLIT</span>
|
| 1358 |
+
<span class="method-desc">Gabliteration (Gülmez 2026) baseline</span>
|
| 1359 |
+
</label>
|
| 1360 |
+
<label class="method-radio" id="method-heretic" onclick="setAblMethod('heretic')">
|
| 1361 |
+
<input type="radio" name="abl-method" value="heretic">
|
| 1362 |
+
<span class="method-label">HERETIC</span>
|
| 1363 |
+
<span class="method-desc">Heretic/p-e-w Bayesian baseline</span>
|
| 1364 |
+
</label>
|
| 1365 |
+
<label class="method-radio" id="method-rdo" onclick="setAblMethod('rdo')">
|
| 1366 |
+
<input type="radio" name="abl-method" value="rdo">
|
| 1367 |
+
<span class="method-label">RDO</span>
|
| 1368 |
+
<span class="method-desc">Refusal Direction Optimization baseline</span>
|
| 1369 |
+
</label>
|
| 1370 |
</div>
|
| 1371 |
<div id="method-details" style="margin-top:10px; font-size:0.7rem; color:var(--text-dim); padding:8px; border:1px solid rgba(188,19,254,0.2); border-radius:4px">
|
| 1372 |
4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes • 32 prompt pairs
|
|
|
|
| 1986 |
|
| 1987 |
let ablMethod = 'advanced';
|
| 1988 |
const METHOD_INFO = {
|
| 1989 |
+
basic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'1 direction • standard projection • 1 pass'},
|
| 1990 |
+
advanced: {dirs:4, norm:true, reg:0.3, passes:2, desc:'4 SVD directions • norm-preserving • 30% regularization • 2 refinement passes'},
|
| 1991 |
+
aggressive: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions • norm-preserving • full orthogonalization • 3 refinement passes'},
|
| 1992 |
+
spectral_cascade: {dirs:6, norm:true, reg:0.15, passes:1, desc:'6 whitened-SVD directions • DCT frequency decomposition • coherence-weighted • adaptive bands'},
|
| 1993 |
+
informed: {dirs:'auto', norm:true, reg:'auto', passes:'auto', desc:'<span style="color:var(--cyan)">Analysis-guided</span> • auto directions • auto regularization • Ouroboros-compensated • cone/alignment/cluster analysis'},
|
| 1994 |
+
surgical: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions • attention head surgery • SAE features • safety neuron masking • per-expert MoE'},
|
| 1995 |
+
optimized: {dirs:4, norm:true, reg:0.2, passes:2, desc:'4 SVD directions • Bayesian auto-tuned • CoT-aware • KL co-optimized • winsorized activations'},
|
| 1996 |
+
inverted: {dirs:4, norm:true, reg:0.1, passes:2, desc:'4 SVD directions • semantic inversion (2x reflection) • SAE feature targeting'},
|
| 1997 |
+
nuclear: {dirs:8, norm:true, reg:0.0, passes:3, desc:'8 SVD directions • all techniques combined • maximum force • head surgery + SAE + steering + transplant'},
|
| 1998 |
+
failspy: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 1 diff-means direction • all layers except first • FailSpy/abliterator reproduction'},
|
| 1999 |
+
gabliteration: {dirs:4, norm:false, reg:0.231, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 4 SVD directions • ridge reg (alpha=0.3) • top-k layer selection • Gülmez 2026'},
|
| 2000 |
+
heretic: {dirs:1, norm:false, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 1 diff-means • Bayesian (Optuna TPE) • KL-optimized • float layer interpolation • p-e-w'},
|
| 2001 |
+
rdo: {dirs:4, norm:true, reg:0.0, passes:1, desc:'<span style="color:var(--text-dim)">Baseline</span> • 4 SVD directions • gradient-refined (RDO) • linear probe classifier • Wollschlager ICML 2025'},
|
| 2002 |
};
|
| 2003 |
|
| 2004 |
function getAblCmd() {
|
obliteratus/.DS_Store
CHANGED
|
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
|
|
|
obliteratus/abliterate.py
CHANGED
|
@@ -328,10 +328,11 @@ METHODS = {
|
|
| 328 |
"description": (
|
| 329 |
"Faithful reproduction of the FailSpy/abliterator library — the "
|
| 330 |
"most widely used community tool. Single direction via difference-"
|
| 331 |
-
"in-means (Arditi et al.),
|
| 332 |
-
"
|
| 333 |
-
"
|
| 334 |
-
"
|
|
|
|
| 335 |
),
|
| 336 |
"n_directions": 1,
|
| 337 |
"direction_method": "diff_means",
|
|
@@ -349,7 +350,7 @@ METHODS = {
|
|
| 349 |
"attention_head_surgery": False,
|
| 350 |
"use_sae_features": False,
|
| 351 |
"invert_refusal": False,
|
| 352 |
-
"layer_selection": "
|
| 353 |
},
|
| 354 |
"gabliteration": {
|
| 355 |
"label": "Gabliteration (Gülmez 2026 Baseline)",
|
|
@@ -383,20 +384,26 @@ METHODS = {
|
|
| 383 |
"layer_selection": "top_k",
|
| 384 |
},
|
| 385 |
"heretic": {
|
| 386 |
-
"label": "Heretic / p-e-w (2025 Baseline)",
|
| 387 |
"description": (
|
| 388 |
-
"Faithful reproduction of Heretic's core algorithm (p-e-w, 2025). "
|
| 389 |
-
"Bayesian optimization via Optuna TPE with
|
| 390 |
-
"
|
| 391 |
-
"
|
| 392 |
-
"
|
| 393 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
"manual hyperparameter selection with automated Pareto optimization "
|
| 395 |
-
"over the (
|
| 396 |
),
|
| 397 |
-
"n_directions":
|
| 398 |
"direction_method": "diff_means",
|
| 399 |
-
|
|
|
|
|
|
|
| 400 |
"regularization": 0.0,
|
| 401 |
"refinement_passes": 1,
|
| 402 |
"project_biases": False,
|
|
@@ -404,14 +411,21 @@ METHODS = {
|
|
| 404 |
"use_whitened_svd": False,
|
| 405 |
"true_iterative_refinement": False,
|
| 406 |
"use_jailbreak_contrast": False,
|
| 407 |
-
|
|
|
|
|
|
|
| 408 |
"safety_neuron_masking": False,
|
| 409 |
"per_expert_directions": False,
|
| 410 |
"attention_head_surgery": False,
|
| 411 |
"use_sae_features": False,
|
| 412 |
"invert_refusal": False,
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
"float_layer_interpolation": True,
|
| 416 |
"cot_aware": False,
|
| 417 |
"use_kl_optimization": True,
|
|
@@ -1689,7 +1703,8 @@ class AbliterationPipeline:
|
|
| 1689 |
# Supports multiple algorithms for baseline comparison:
|
| 1690 |
# knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
|
| 1691 |
# knee: knee detection only (simplified OBLITERATUS)
|
| 1692 |
-
# middle60:
|
|
|
|
| 1693 |
# all: all layers (for Bayesian optimization / Heretic)
|
| 1694 |
# top_k: top-k by refusal strength (Gabliteration-style)
|
| 1695 |
sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
|
|
@@ -1702,8 +1717,14 @@ class AbliterationPipeline:
|
|
| 1702 |
|
| 1703 |
selection_method = self.layer_selection
|
| 1704 |
|
| 1705 |
-
if selection_method == "
|
| 1706 |
-
# FailSpy/abliterator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1707 |
self._strong_layers = self._select_layers_middle60(n_layers)
|
| 1708 |
self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
|
| 1709 |
|
|
@@ -2359,14 +2380,14 @@ class AbliterationPipeline:
|
|
| 2359 |
|
| 2360 |
@staticmethod
|
| 2361 |
def _select_layers_middle60(n_layers: int) -> list[int]:
|
| 2362 |
-
"""Select the middle 60% of layers (
|
| 2363 |
|
| 2364 |
-
|
| 2365 |
-
n_layers*0.2 to n_layers*0.8, based on the empirical observation that
|
| 2366 |
-
refusal concentrates in middle layers (not early embedding layers or
|
| 2367 |
-
late unembedding layers).
|
| 2368 |
|
| 2369 |
-
|
|
|
|
|
|
|
|
|
|
| 2370 |
"""
|
| 2371 |
start = int(n_layers * 0.2)
|
| 2372 |
end = int(n_layers * 0.8)
|
|
@@ -3751,7 +3772,9 @@ class AbliterationPipeline:
|
|
| 3751 |
|
| 3752 |
# Respect configured layer_selection (matching _distill)
|
| 3753 |
selection_method = self.layer_selection
|
| 3754 |
-
if selection_method == "
|
|
|
|
|
|
|
| 3755 |
self._strong_layers = self._select_layers_middle60(n_layers)
|
| 3756 |
elif selection_method == "all":
|
| 3757 |
self._strong_layers = self._select_layers_all(n_layers)
|
|
|
|
| 328 |
"description": (
|
| 329 |
"Faithful reproduction of the FailSpy/abliterator library — the "
|
| 330 |
"most widely used community tool. Single direction via difference-"
|
| 331 |
+
"in-means (Arditi et al.), applied to all layers except layer 0 "
|
| 332 |
+
"(matching FailSpy source: range(1, n_layers)). Projects both "
|
| 333 |
+
"W_O (attention output) and MLP W_out. No regularization, no "
|
| 334 |
+
"norm preservation. Uses chat template for instruct models. "
|
| 335 |
+
"This is what most HuggingFace abliterated models were created with."
|
| 336 |
),
|
| 337 |
"n_directions": 1,
|
| 338 |
"direction_method": "diff_means",
|
|
|
|
| 350 |
"attention_head_surgery": False,
|
| 351 |
"use_sae_features": False,
|
| 352 |
"invert_refusal": False,
|
| 353 |
+
"layer_selection": "all_except_first",
|
| 354 |
},
|
| 355 |
"gabliteration": {
|
| 356 |
"label": "Gabliteration (Gülmez 2026 Baseline)",
|
|
|
|
| 384 |
"layer_selection": "top_k",
|
| 385 |
},
|
| 386 |
"heretic": {
|
| 387 |
+
"label": "Heretic / p-e-w (2025-2026 Baseline)",
|
| 388 |
"description": (
|
| 389 |
+
"Faithful reproduction of Heretic's core algorithm (p-e-w, 2025-2026). "
|
| 390 |
+
"Bayesian optimization via Optuna TPE with linear bell curve layer "
|
| 391 |
+
"weighting (NOT Gaussian — linear interpolation between max_weight and "
|
| 392 |
+
"min_weight over min_weight_distance). One diff-of-means direction per "
|
| 393 |
+
"layer; direction_scope is sampled ('global' selects a float layer index "
|
| 394 |
+
"with lerp between adjacent layers' directions, 'per layer' uses each "
|
| 395 |
+
"layer's own direction). LoRA-based ablation (delta W = -lambda * v * "
|
| 396 |
+
"(v^T W)), never modifies base weights directly. Row normalization "
|
| 397 |
+
"defaults to NONE (PRE and FULL are options). Activation winsorization "
|
| 398 |
+
"via symmetric quantile clamping. The key innovation is replacing "
|
| 399 |
"manual hyperparameter selection with automated Pareto optimization "
|
| 400 |
+
"over the (refusal_count, KL_divergence) frontier."
|
| 401 |
),
|
| 402 |
+
"n_directions": 1,
|
| 403 |
"direction_method": "diff_means",
|
| 404 |
+
# Heretic default row_normalization is NONE; PRE/FULL are optional.
|
| 405 |
+
# OBLITERATUS norm_preserve=False matches Heretic's default behavior.
|
| 406 |
+
"norm_preserve": False,
|
| 407 |
"regularization": 0.0,
|
| 408 |
"refinement_passes": 1,
|
| 409 |
"project_biases": False,
|
|
|
|
| 411 |
"use_whitened_svd": False,
|
| 412 |
"true_iterative_refinement": False,
|
| 413 |
"use_jailbreak_contrast": False,
|
| 414 |
+
# Heretic uses its own bell curve weighting (linear, not Gaussian),
|
| 415 |
+
# not OBLITERATUS's norm-based layer_adaptive_strength.
|
| 416 |
+
"layer_adaptive_strength": False,
|
| 417 |
"safety_neuron_masking": False,
|
| 418 |
"per_expert_directions": False,
|
| 419 |
"attention_head_surgery": False,
|
| 420 |
"use_sae_features": False,
|
| 421 |
"invert_refusal": False,
|
| 422 |
+
# Heretic default winsorization_quantile is 1.0 (disabled by default).
|
| 423 |
+
# For faithful baseline reproduction we match the source default.
|
| 424 |
+
"winsorize_activations": False,
|
| 425 |
+
"winsorize_percentile": 1.0,
|
| 426 |
+
# Heretic's float direction index interpolates between adjacent LAYERS'
|
| 427 |
+
# directions (not SVD components). OBLITERATUS float_layer_interpolation
|
| 428 |
+
# provides the bell-curve layer weighting aspect.
|
| 429 |
"float_layer_interpolation": True,
|
| 430 |
"cot_aware": False,
|
| 431 |
"use_kl_optimization": True,
|
|
|
|
| 1703 |
# Supports multiple algorithms for baseline comparison:
|
| 1704 |
# knee_cosmic: OBLITERATUS default (knee detection + COSMIC fusion)
|
| 1705 |
# knee: knee detection only (simplified OBLITERATUS)
|
| 1706 |
+
# middle60: legacy heuristic (layers 20%-80%)
|
| 1707 |
+
# all_except_first: FailSpy/abliterator (all layers except layer 0)
|
| 1708 |
# all: all layers (for Bayesian optimization / Heretic)
|
| 1709 |
# top_k: top-k by refusal strength (Gabliteration-style)
|
| 1710 |
sorted_layers = sorted(norms.items(), key=lambda x: x[1], reverse=True)
|
|
|
|
| 1717 |
|
| 1718 |
selection_method = self.layer_selection
|
| 1719 |
|
| 1720 |
+
if selection_method == "all_except_first":
|
| 1721 |
+
# FailSpy/abliterator: all layers except layer 0
|
| 1722 |
+
# Source: range(1, self.model.cfg.n_layers) in FailSpy/abliterator
|
| 1723 |
+
self._strong_layers = list(range(1, n_layers))
|
| 1724 |
+
self.log(f"Layer selection: all-except-first ({len(self._strong_layers)} layers)")
|
| 1725 |
+
|
| 1726 |
+
elif selection_method == "middle60":
|
| 1727 |
+
# Legacy heuristic: middle 60% of layers (layers 20%-80%)
|
| 1728 |
self._strong_layers = self._select_layers_middle60(n_layers)
|
| 1729 |
self.log(f"Layer selection: middle-60% ({len(self._strong_layers)} layers)")
|
| 1730 |
|
|
|
|
| 2380 |
|
| 2381 |
@staticmethod
|
| 2382 |
def _select_layers_middle60(n_layers: int) -> list[int]:
|
| 2383 |
+
"""Select the middle 60% of layers (legacy heuristic).
|
| 2384 |
|
| 2385 |
+
Selects layers from index n_layers*0.2 to n_layers*0.8.
|
|
|
|
|
|
|
|
|
|
| 2386 |
|
| 2387 |
+
NOTE: This does NOT match FailSpy/abliterator's actual layer selection.
|
| 2388 |
+
FailSpy uses all layers except layer 0 (range(1, n_layers)). Use
|
| 2389 |
+
layer_selection="all_except_first" for faithful FailSpy reproduction.
|
| 2390 |
+
This method is retained for backward compatibility only.
|
| 2391 |
"""
|
| 2392 |
start = int(n_layers * 0.2)
|
| 2393 |
end = int(n_layers * 0.8)
|
|
|
|
| 3772 |
|
| 3773 |
# Respect configured layer_selection (matching _distill)
|
| 3774 |
selection_method = self.layer_selection
|
| 3775 |
+
if selection_method == "all_except_first":
|
| 3776 |
+
self._strong_layers = list(range(1, n_layers))
|
| 3777 |
+
elif selection_method == "middle60":
|
| 3778 |
self._strong_layers = self._select_layers_middle60(n_layers)
|
| 3779 |
elif selection_method == "all":
|
| 3780 |
self._strong_layers = self._select_layers_all(n_layers)
|
obliteratus/analysis/conditional_abliteration.py
CHANGED
|
@@ -269,7 +269,7 @@ class ConditionalAbliterator:
|
|
| 269 |
) -> torch.Tensor | None:
|
| 270 |
"""Extract category-specific refusal direction.
|
| 271 |
|
| 272 |
-
Uses
|
| 273 |
and then orthogonalizes against previously extracted directions
|
| 274 |
to ensure category independence.
|
| 275 |
"""
|
|
|
|
| 269 |
) -> torch.Tensor | None:
|
| 270 |
"""Extract category-specific refusal direction.
|
| 271 |
|
| 272 |
+
Uses difference-of-means (category_mean - harmless_mean)
|
| 273 |
and then orthogonalizes against previously extracted directions
|
| 274 |
to ensure category independence.
|
| 275 |
"""
|
obliteratus/analysis/leace.py
CHANGED
|
@@ -1,36 +1,34 @@
|
|
| 1 |
-
"""LEACE
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
Advantages over SVD:
|
| 29 |
-
-
|
| 30 |
-
|
| 31 |
-
- Handles rogue dimensions naturally: within-class normalization
|
| 32 |
-
prevents high-variance but non-discriminative dimensions from
|
| 33 |
-
dominating
|
| 34 |
- No hyperparameters beyond regularization epsilon
|
| 35 |
- Closed-form solution (no iterative optimization)
|
| 36 |
|
|
@@ -39,6 +37,8 @@ References:
|
|
| 39 |
closed form. NeurIPS 2023.
|
| 40 |
- Ravfogel et al. (2022): RLACE: Adversarial concept erasure
|
| 41 |
(iterative precursor to LEACE).
|
|
|
|
|
|
|
| 42 |
"""
|
| 43 |
|
| 44 |
from __future__ import annotations
|
|
@@ -61,11 +61,11 @@ class LEACEResult:
|
|
| 61 |
|
| 62 |
|
| 63 |
class LEACEExtractor:
|
| 64 |
-
"""Extract refusal directions via
|
| 65 |
|
| 66 |
Finds the direction that maximally separates harmful from harmless
|
| 67 |
-
activations relative to within-class variance
|
| 68 |
-
|
| 69 |
"""
|
| 70 |
|
| 71 |
def __init__(
|
|
|
|
| 1 |
+
"""LEACE-inspired direction extraction for refusal concept erasure.
|
| 2 |
+
|
| 3 |
+
This module implements Fisher's Linear Discriminant (FLD) direction for
|
| 4 |
+
concept erasure, inspired by LEACE (Belrose et al. 2023).
|
| 5 |
+
|
| 6 |
+
IMPORTANT: This is NOT a faithful implementation of LEACE as described in
|
| 7 |
+
the paper. Key difference:
|
| 8 |
+
|
| 9 |
+
- **True LEACE** uses the *total* covariance Sigma_X for whitening:
|
| 10 |
+
P* = I - W^{-1} P_{W Sigma_XZ} W where W = Sigma_X^{-1/2}
|
| 11 |
+
For binary concepts, this yields: v = Sigma_X^{-1} delta
|
| 12 |
+
|
| 13 |
+
- **This implementation** uses *within-class* covariance S_w:
|
| 14 |
+
v = S_w^{-1} delta
|
| 15 |
+
This is Fisher's Linear Discriminant direction, which maximizes
|
| 16 |
+
class separability relative to within-class spread.
|
| 17 |
+
|
| 18 |
+
For binary concepts, Sigma_X = S_w + p(1-p) * delta @ delta^T,
|
| 19 |
+
so Sigma_X is a rank-one update of S_w along delta. By the
|
| 20 |
+
Sherman-Morrison identity, Sigma_X^{-1} delta is a positive scalar
|
| 21 |
+
multiple of S_w^{-1} delta, so after normalization the two directions
|
| 22 |
+
coincide when both use the same covariance estimates and regularizer.
|
| 23 |
+
|
| 24 |
+
The FLD direction is still a strong choice for refusal erasure — it
|
| 25 |
+
handles rogue dimensions (high-variance but non-discriminative) better
|
| 26 |
+
than plain diff-of-means, and is a closed-form solution with no
|
| 27 |
+
iterative optimization.
|
| 28 |
|
| 29 |
Advantages over SVD:
|
| 30 |
+
- Within-class normalization prevents high-variance but
|
| 31 |
+
non-discriminative dimensions from dominating
|
|
|
|
|
|
|
|
|
|
| 32 |
- No hyperparameters beyond regularization epsilon
|
| 33 |
- Closed-form solution (no iterative optimization)
|
| 34 |
|
|
|
|
| 37 |
closed form. NeurIPS 2023.
|
| 38 |
- Ravfogel et al. (2022): RLACE: Adversarial concept erasure
|
| 39 |
(iterative precursor to LEACE).
|
| 40 |
+
- Fisher (1936): The use of multiple measurements in taxonomic
|
| 41 |
+
problems. Annals of Eugenics.
|
| 42 |
"""
|
| 43 |
|
| 44 |
from __future__ import annotations
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
class LEACEExtractor:
|
| 64 |
+
"""Extract refusal directions via Fisher's Linear Discriminant.
|
| 65 |
|
| 66 |
Finds the direction that maximally separates harmful from harmless
|
| 67 |
+
activations relative to within-class variance (v = S_w^{-1} delta).
|
| 68 |
+
See module docstring for how this relates to true LEACE.
|
| 69 |
"""
|
| 70 |
|
| 71 |
def __init__(
|
obliteratus/analysis/riemannian_manifold.py
CHANGED
|
@@ -428,8 +428,15 @@ class RiemannianManifoldAnalyzer:
|
|
| 428 |
geodesic triangle with area A satisfies:
|
| 429 |
sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
|
| 430 |
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
"""
|
| 434 |
# Compute sides
|
| 435 |
ab = (b - a).float()
|
|
@@ -613,8 +620,12 @@ class RiemannianManifoldAnalyzer:
|
|
| 613 |
return torch.zeros_like(activation)
|
| 614 |
v = v / norm
|
| 615 |
|
| 616 |
-
#
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
# Clamp to prevent instability
|
| 620 |
correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
|
|
|
|
| 428 |
geodesic triangle with area A satisfies:
|
| 429 |
sum(angles) = pi + K * A (Gauss-Bonnet for small triangles)
|
| 430 |
|
| 431 |
+
IMPORTANT LIMITATION: This method uses Euclidean chords and angles
|
| 432 |
+
in ambient space, NOT geodesics on the manifold. In flat Euclidean
|
| 433 |
+
space, the angle sum of any triangle is exactly pi, so this method
|
| 434 |
+
will yield K ≈ 0 (up to numerical noise) regardless of the actual
|
| 435 |
+
manifold curvature. The results are only meaningful when the data
|
| 436 |
+
lies on an approximately low-dimensional curved submanifold and
|
| 437 |
+
triangles are sufficiently small relative to the curvature radius.
|
| 438 |
+
For rigorous curvature estimates, use methods based on local PCA
|
| 439 |
+
eigenvalue decay or Jacobian-based Riemannian metric computation.
|
| 440 |
"""
|
| 441 |
# Compute sides
|
| 442 |
ab = (b - a).float()
|
|
|
|
| 620 |
return torch.zeros_like(activation)
|
| 621 |
v = v / norm
|
| 622 |
|
| 623 |
+
# Second-order geodesic correction: K * proj_magnitude^2 / 6
|
| 624 |
+
# From Jacobi field estimate: deviation of geodesic from straight
|
| 625 |
+
# line over distance L with curvature K is ≈ K * L^2 / 6.
|
| 626 |
+
# Note: the residual bound in analyze() uses K * ||x||^2 / 8
|
| 627 |
+
# which is a looser upper bound including higher-order terms.
|
| 628 |
+
correction_magnitude = curvature * proj_magnitude ** 2 / 6.0
|
| 629 |
|
| 630 |
# Clamp to prevent instability
|
| 631 |
correction_magnitude = max(-0.1, min(0.1, correction_magnitude))
|
obliteratus/analysis/sae_abliteration.py
CHANGED
|
@@ -94,7 +94,7 @@ class SparseAutoencoder(nn.Module):
|
|
| 94 |
|
| 95 |
@property
|
| 96 |
def decoder_weight(self) -> torch.Tensor:
|
| 97 |
-
"""Return the decoder weight matrix (
|
| 98 |
if self.tied_weights:
|
| 99 |
return self.encoder.weight.T
|
| 100 |
return self.decoder.weight
|
|
|
|
| 94 |
|
| 95 |
@property
|
| 96 |
def decoder_weight(self) -> torch.Tensor:
|
| 97 |
+
"""Return the decoder weight matrix (hidden_dim x n_features for untied, or encoder.weight.T)."""
|
| 98 |
if self.tied_weights:
|
| 99 |
return self.encoder.weight.T
|
| 100 |
return self.decoder.weight
|
obliteratus/analysis/spectral_certification.py
CHANGED
|
@@ -175,10 +175,11 @@ class SpectralCertifier:
|
|
| 175 |
harmful_centered = harmful_activations - harmful_mean
|
| 176 |
harmless_centered = harmless_activations - harmless_mean
|
| 177 |
|
| 178 |
-
# Pooled within-class covariance
|
|
|
|
| 179 |
cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
|
| 180 |
cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
|
| 181 |
-
pooled_cov = (cov_h * n_h + cov_b * n_b) / max(n - 2, 1)
|
| 182 |
|
| 183 |
# Step 2: Estimate noise variance (median eigenvalue method)
|
| 184 |
noise_var = self._estimate_noise_variance(pooled_cov, n, d)
|
|
@@ -374,8 +375,13 @@ class SpectralCertifier:
|
|
| 374 |
# Correct for MP bias: median of MP distribution
|
| 375 |
gamma = d / max(n, 1)
|
| 376 |
if gamma < 1:
|
| 377 |
-
# MP median approximation
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
noise_var = median_eig / max(mp_median_ratio, 1e-10)
|
| 380 |
else:
|
| 381 |
noise_var = median_eig
|
|
|
|
| 175 |
harmful_centered = harmful_activations - harmful_mean
|
| 176 |
harmless_centered = harmless_activations - harmless_mean
|
| 177 |
|
| 178 |
+
# Pooled within-class covariance (standard formula: sum of scatter
|
| 179 |
+
# matrices divided by total degrees of freedom)
|
| 180 |
cov_h = harmful_centered.T @ harmful_centered / max(n_h - 1, 1)
|
| 181 |
cov_b = harmless_centered.T @ harmless_centered / max(n_b - 1, 1)
|
| 182 |
+
pooled_cov = (cov_h * (n_h - 1) + cov_b * (n_b - 1)) / max(n - 2, 1)
|
| 183 |
|
| 184 |
# Step 2: Estimate noise variance (median eigenvalue method)
|
| 185 |
noise_var = self._estimate_noise_variance(pooled_cov, n, d)
|
|
|
|
| 375 |
# Correct for MP bias: median of MP distribution
|
| 376 |
gamma = d / max(n, 1)
|
| 377 |
if gamma < 1:
|
| 378 |
+
# MP median approximation. The exact MP median requires
|
| 379 |
+
# numerical inversion of the MP CDF; we use the empirical
|
| 380 |
+
# approximation median ≈ (1 - sqrt(gamma))^2 + gamma^(1/3)
|
| 381 |
+
# which is more accurate than the naive 0.5 * upper_edge
|
| 382 |
+
# for small gamma. No separate small-gamma fallback is applied:
|
| 383 |
+
# the ratio tends to 1 as gamma -> 0.
|
| 384 |
+
mp_median_ratio = (1 - math.sqrt(gamma)) ** 2 + gamma ** (1.0 / 3.0)
|
| 385 |
noise_var = median_eig / max(mp_median_ratio, 1e-10)
|
| 386 |
else:
|
| 387 |
noise_var = median_eig
|
obliteratus/analysis/wasserstein_optimal.py
CHANGED
|
@@ -58,7 +58,7 @@ class WassersteinDirectionResult:
|
|
| 58 |
direction: torch.Tensor # (hidden_dim,) optimal direction
|
| 59 |
wasserstein_cost: float # W_2^2 cost for this direction
|
| 60 |
mean_shift_component: float # (r^T m)^2 portion
|
| 61 |
-
bures_component: float # r^T Sigma r portion (
|
| 62 |
refusal_projection: float # (r^T d)^2
|
| 63 |
cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
|
| 64 |
|
|
|
|
| 58 |
direction: torch.Tensor # (hidden_dim,) optimal direction
|
| 59 |
wasserstein_cost: float # W_2^2 cost for this direction
|
| 60 |
mean_shift_component: float # (r^T m)^2 portion
|
| 61 |
+
bures_component: float # r^T Sigma r portion (exact when r is eigenvector of Sigma, lower bound otherwise)
|
| 62 |
refusal_projection: float # (r^T d)^2
|
| 63 |
cost_effectiveness_ratio: float # W_2^2 / (r^T d)^2
|
| 64 |
|
obliteratus/evaluation/heretic_eval.py
CHANGED
|
@@ -334,19 +334,20 @@ def _load_harmbench_classifier():
|
|
| 334 |
bnb_4bit_quant_type="nf4",
|
| 335 |
llm_int8_enable_fp32_cpu_offload=True,
|
| 336 |
)
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
torch_dtype=torch.float16,
|
| 342 |
-
)
|
| 343 |
except Exception:
|
| 344 |
logger.info("4-bit quantization unavailable for classifier, loading in float16")
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
device_map="auto"
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
model.eval()
|
| 352 |
_HARMBENCH_CLASSIFIER = (model, tokenizer)
|
|
|
|
| 334 |
bnb_4bit_quant_type="nf4",
|
| 335 |
llm_int8_enable_fp32_cpu_offload=True,
|
| 336 |
)
|
| 337 |
+
load_kwargs = dict(quantization_config=bnb_cfg, torch_dtype=torch.float16)
|
| 338 |
+
if dev.supports_device_map_auto():
|
| 339 |
+
load_kwargs["device_map"] = "auto"
|
| 340 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
|
|
|
|
|
|
|
| 341 |
except Exception:
|
| 342 |
logger.info("4-bit quantization unavailable for classifier, loading in float16")
|
| 343 |
+
load_kwargs = dict(torch_dtype=torch.float16)
|
| 344 |
+
if dev.supports_device_map_auto():
|
| 345 |
+
load_kwargs["device_map"] = "auto"
|
| 346 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
|
| 347 |
+
|
| 348 |
+
# On MPS/CPU: move model to best available device
|
| 349 |
+
if not dev.supports_device_map_auto():
|
| 350 |
+
model = model.to(dev.get_device())
|
| 351 |
|
| 352 |
model.eval()
|
| 353 |
_HARMBENCH_CLASSIFIER = (model, tokenizer)
|
obliteratus/tourney.py
CHANGED
|
@@ -1097,6 +1097,11 @@ class TourneyRunner:
|
|
| 1097 |
result.winner = winner
|
| 1098 |
result.total_time_s = time.time() - t_start
|
| 1099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
self.log("")
|
| 1101 |
self.log("=" * 60)
|
| 1102 |
if winner:
|
|
@@ -1401,6 +1406,11 @@ class TourneyRunner:
|
|
| 1401 |
result.winner = winner
|
| 1402 |
result.total_time_s = time.time() - t_start
|
| 1403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1404 |
self.log("")
|
| 1405 |
self.log("=" * 60)
|
| 1406 |
if winner:
|
|
|
|
| 1097 |
result.winner = winner
|
| 1098 |
result.total_time_s = time.time() - t_start
|
| 1099 |
|
| 1100 |
+
# Clean up non-winner finalist dirs to free disk
|
| 1101 |
+
for c in ranked[1:]:
|
| 1102 |
+
if c.output_dir and Path(c.output_dir).exists():
|
| 1103 |
+
shutil.rmtree(c.output_dir, ignore_errors=True)
|
| 1104 |
+
|
| 1105 |
self.log("")
|
| 1106 |
self.log("=" * 60)
|
| 1107 |
if winner:
|
|
|
|
| 1406 |
result.winner = winner
|
| 1407 |
result.total_time_s = time.time() - t_start
|
| 1408 |
|
| 1409 |
+
# Clean up non-winner finalist dirs to free disk
|
| 1410 |
+
for c in ranked[1:]:
|
| 1411 |
+
if c.output_dir and Path(c.output_dir).exists():
|
| 1412 |
+
shutil.rmtree(c.output_dir, ignore_errors=True)
|
| 1413 |
+
|
| 1414 |
self.log("")
|
| 1415 |
self.log("=" * 60)
|
| 1416 |
if winner:
|