Spaces:
Running
Running
Upload 129 files
Browse files- README.md +3 -3
- app.py +12 -11
- hf-spaces/README.md +1 -1
- obliteratus/abliterate.py +105 -65
- obliteratus/cli.py +1 -1
- scripts/run_benchmark_remote.sh +2 -2
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: OBLITERATUS
|
| 3 |
-
emoji: "
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
|
@@ -302,9 +302,9 @@ Beyond targeted liberation, OBLITERATUS is a general-purpose ablation suite for
|
|
| 302 |
|
| 303 |
Each strategy enumerates all possible ablations, applies them one at a time, measures the impact, and restores the model — giving you a complete map of where the chains are anchored vs. where the mind lives.
|
| 304 |
|
| 305 |
-
##
|
| 306 |
|
| 307 |
-
OBLITERATUS ships with presets for
|
| 308 |
|
| 309 |
| Tier | VRAM | Example models |
|
| 310 |
|------|------|---------------|
|
|
|
|
| 1 |
---
|
| 2 |
title: OBLITERATUS
|
| 3 |
+
emoji: "⛓️‍💥"
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
|
|
|
| 302 |
|
| 303 |
Each strategy enumerates all possible ablations, applies them one at a time, measures the impact, and restores the model — giving you a complete map of where the chains are anchored vs. where the mind lives.
|
| 304 |
|
| 305 |
+
## 116 curated models across 5 tiers
|
| 306 |
|
| 307 |
+
OBLITERATUS ships with presets for 116 models organized by compute requirement:
|
| 308 |
|
| 309 |
| Tier | VRAM | Example models |
|
| 310 |
|------|------|---------------|
|
app.py
CHANGED
|
@@ -324,13 +324,13 @@ _NEEDS_QUANTIZATION = {
|
|
| 324 |
}
|
| 325 |
|
| 326 |
|
| 327 |
-
def _should_quantize(model_id: str) -> str | None:
|
| 328 |
"""Return '4bit' if the model needs quantization for available GPU, else None."""
|
| 329 |
try:
|
| 330 |
from obliteratus.models.loader import _estimate_model_memory_gb, _available_gpu_memory_gb
|
| 331 |
from transformers import AutoConfig
|
| 332 |
token = os.environ.get("HF_TOKEN") or None
|
| 333 |
-
config = AutoConfig.from_pretrained(model_id, trust_remote_code=
|
| 334 |
# Skip if model already ships with native quantization (e.g. Mxfp4Config)
|
| 335 |
if getattr(config, "quantization_config", None) is not None:
|
| 336 |
return None
|
|
@@ -701,7 +701,7 @@ def benchmark(
|
|
| 701 |
if result.status == "running":
|
| 702 |
run_logs.append(f"{stage_key.upper()} β {result.message}")
|
| 703 |
|
| 704 |
-
quantization = _should_quantize(model_id)
|
| 705 |
|
| 706 |
def run_pipeline():
|
| 707 |
try:
|
|
@@ -1044,7 +1044,7 @@ def benchmark_multi_model(
|
|
| 1044 |
def on_stage(result):
|
| 1045 |
pass
|
| 1046 |
|
| 1047 |
-
quantization = _should_quantize(model_id)
|
| 1048 |
|
| 1049 |
def run_pipeline():
|
| 1050 |
try:
|
|
@@ -1359,9 +1359,10 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
|
| 1359 |
_state["model_name"] = model_choice
|
| 1360 |
_state["method"] = method
|
| 1361 |
|
| 1362 |
-
|
| 1363 |
-
|
| 1364 |
-
|
|
|
|
| 1365 |
|
| 1366 |
log_lines = []
|
| 1367 |
last_yielded = [0]
|
|
@@ -1387,7 +1388,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
|
| 1387 |
idx = stage_order.get(stage_key, 0)
|
| 1388 |
progress((idx + 1) / 6, desc=f"{stage_key.upper()}")
|
| 1389 |
|
| 1390 |
-
quantization = _should_quantize(model_id)
|
| 1391 |
|
| 1392 |
def run_pipeline():
|
| 1393 |
try:
|
|
@@ -1497,7 +1498,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
|
| 1497 |
# Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
|
| 1498 |
_max_pipeline_secs = 45 * 60
|
| 1499 |
_pipeline_start = time.time()
|
| 1500 |
-
status_msg =
|
| 1501 |
while worker.is_alive():
|
| 1502 |
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
|
| 1503 |
if len(log_lines) > last_yielded[0]:
|
|
@@ -2018,8 +2019,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2018 |
else:
|
| 2019 |
n = min(len(harmful_all), len(harmless_all))
|
| 2020 |
|
| 2021 |
-
quantization = _should_quantize(model_id)
|
| 2022 |
is_preset = cfg["model_choice"] in MODELS
|
|
|
|
| 2023 |
|
| 2024 |
pipeline_ref = [None]
|
| 2025 |
error_ref = [None]
|
|
@@ -2319,7 +2320,7 @@ def strength_sweep(model_choice: str, method_choice: str,
|
|
| 2319 |
|
| 2320 |
def _run_sweep_point():
|
| 2321 |
try:
|
| 2322 |
-
quantization = _should_quantize(model_id)
|
| 2323 |
pipe = AbliterationPipeline(
|
| 2324 |
model_id, method=method_key,
|
| 2325 |
output_dir=f"/tmp/sweep_{step_i}",
|
|
|
|
| 324 |
}
|
| 325 |
|
| 326 |
|
| 327 |
+
def _should_quantize(model_id: str, is_preset: bool = False) -> str | None:
|
| 328 |
"""Return '4bit' if the model needs quantization for available GPU, else None."""
|
| 329 |
try:
|
| 330 |
from obliteratus.models.loader import _estimate_model_memory_gb, _available_gpu_memory_gb
|
| 331 |
from transformers import AutoConfig
|
| 332 |
token = os.environ.get("HF_TOKEN") or None
|
| 333 |
+
config = AutoConfig.from_pretrained(model_id, trust_remote_code=is_preset, token=token)
|
| 334 |
# Skip if model already ships with native quantization (e.g. Mxfp4Config)
|
| 335 |
if getattr(config, "quantization_config", None) is not None:
|
| 336 |
return None
|
|
|
|
| 701 |
if result.status == "running":
|
| 702 |
run_logs.append(f"{stage_key.upper()} β {result.message}")
|
| 703 |
|
| 704 |
+
quantization = _should_quantize(model_id, is_preset=is_preset)
|
| 705 |
|
| 706 |
def run_pipeline():
|
| 707 |
try:
|
|
|
|
| 1044 |
def on_stage(result):
|
| 1045 |
pass
|
| 1046 |
|
| 1047 |
+
quantization = _should_quantize(model_id, is_preset=is_preset_model)
|
| 1048 |
|
| 1049 |
def run_pipeline():
|
| 1050 |
try:
|
|
|
|
| 1359 |
_state["model_name"] = model_choice
|
| 1360 |
_state["method"] = method
|
| 1361 |
|
| 1362 |
+
with _lock:
|
| 1363 |
+
global _obliterate_counter
|
| 1364 |
+
_obliterate_counter += 1
|
| 1365 |
+
save_dir = f"/tmp/obliterated_{_obliterate_counter}"
|
| 1366 |
|
| 1367 |
log_lines = []
|
| 1368 |
last_yielded = [0]
|
|
|
|
| 1388 |
idx = stage_order.get(stage_key, 0)
|
| 1389 |
progress((idx + 1) / 6, desc=f"{stage_key.upper()}")
|
| 1390 |
|
| 1391 |
+
quantization = _should_quantize(model_id, is_preset=is_preset)
|
| 1392 |
|
| 1393 |
def run_pipeline():
|
| 1394 |
try:
|
|
|
|
| 1498 |
# Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
|
| 1499 |
_max_pipeline_secs = 45 * 60
|
| 1500 |
_pipeline_start = time.time()
|
| 1501 |
+
status_msg = "**Obliterating\u2026** (0s)"
|
| 1502 |
while worker.is_alive():
|
| 1503 |
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
|
| 1504 |
if len(log_lines) > last_yielded[0]:
|
|
|
|
| 2019 |
else:
|
| 2020 |
n = min(len(harmful_all), len(harmless_all))
|
| 2021 |
|
|
|
|
| 2022 |
is_preset = cfg["model_choice"] in MODELS
|
| 2023 |
+
quantization = _should_quantize(model_id, is_preset=is_preset)
|
| 2024 |
|
| 2025 |
pipeline_ref = [None]
|
| 2026 |
error_ref = [None]
|
|
|
|
| 2320 |
|
| 2321 |
def _run_sweep_point():
|
| 2322 |
try:
|
| 2323 |
+
quantization = _should_quantize(model_id, is_preset=is_preset)
|
| 2324 |
pipe = AbliterationPipeline(
|
| 2325 |
model_id, method=method_key,
|
| 2326 |
output_dir=f"/tmp/sweep_{step_i}",
|
hf-spaces/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: OBLITERATUS
|
| 3 |
-
emoji: "
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
|
|
|
| 1 |
---
|
| 2 |
title: OBLITERATUS
|
| 3 |
+
emoji: "⛓️‍💥"
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
obliteratus/abliterate.py
CHANGED
|
@@ -949,8 +949,14 @@ class AbliterationPipeline:
|
|
| 949 |
self.log(f" Router profiling complete: {n_profiled} MoE layers profiled")
|
| 950 |
|
| 951 |
for idx in range(n_layers):
|
| 952 |
-
self.
|
| 953 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
|
| 955 |
# ββ Jailbreak-contrastive probing βββββββββββββββββββββββββββββββββ
|
| 956 |
if self.use_jailbreak_contrast:
|
|
@@ -1008,18 +1014,31 @@ class AbliterationPipeline:
|
|
| 1008 |
|
| 1009 |
n = len(prompts)
|
| 1010 |
self.log(f" Wrapping {n} prompts with chat template")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
wrapped = []
|
| 1012 |
-
for i,
|
| 1013 |
-
messages = [{"role": "user", "content": prompt}]
|
| 1014 |
try:
|
| 1015 |
text = tokenizer.apply_chat_template(
|
| 1016 |
-
|
| 1017 |
)
|
| 1018 |
wrapped.append(text)
|
| 1019 |
except Exception:
|
| 1020 |
-
wrapped.append(
|
| 1021 |
-
|
| 1022 |
-
self.log(f" chat template {i + 1}/{n}")
|
| 1023 |
return wrapped
|
| 1024 |
|
| 1025 |
@staticmethod
|
|
@@ -1426,7 +1445,7 @@ class AbliterationPipeline:
|
|
| 1426 |
if n_dirs > 1:
|
| 1427 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 1428 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 1429 |
-
diff_matrix = harmful_stack - harmless_stack
|
| 1430 |
if torch.isfinite(diff_matrix).all():
|
| 1431 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
| 1432 |
_, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
|
|
@@ -1475,7 +1494,7 @@ class AbliterationPipeline:
|
|
| 1475 |
# SVD-based multi-direction extraction (Gabliteration)
|
| 1476 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1) # (n_prompts, hidden)
|
| 1477 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 1478 |
-
diff_matrix = harmful_stack - harmless_stack #
|
| 1479 |
|
| 1480 |
# SVD to extract principal refusal directions
|
| 1481 |
if not torch.isfinite(diff_matrix).all():
|
|
@@ -3046,16 +3065,21 @@ class AbliterationPipeline:
|
|
| 3046 |
# remove components that lie in both subspaces (violating
|
| 3047 |
# the GRRO's independent-αᵢ assumption; see theory journal
|
| 3048 |
# §12.6 "SAE-SVD Orthogonalization").
|
| 3049 |
-
|
| 3050 |
-
|
| 3051 |
-
|
| 3052 |
-
|
| 3053 |
-
|
| 3054 |
-
|
| 3055 |
-
|
| 3056 |
-
|
| 3057 |
-
|
| 3058 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3059 |
sae_count = 0
|
| 3060 |
# SAE regularization: for inversion modes, use a much
|
| 3061 |
# gentler floor (0.6 = 40% removal) since these are
|
|
@@ -3063,39 +3087,52 @@ class AbliterationPipeline:
|
|
| 3063 |
# projection which already uses full reflection.
|
| 3064 |
sae_reg_floor = 0.6 if self.invert_refusal else 0.3
|
| 3065 |
sae_reg = max(layer_reg, sae_reg_floor) if not self.invert_refusal else sae_reg_floor
|
| 3066 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3067 |
# Skip SAE directions that collapsed to near-zero
|
| 3068 |
# after orthogonalization (fully redundant with SVD)
|
| 3069 |
-
if
|
| 3070 |
continue
|
| 3071 |
-
sd =
|
| 3072 |
-
|
| 3073 |
-
|
| 3074 |
-
|
| 3075 |
-
|
| 3076 |
-
|
| 3077 |
-
|
| 3078 |
-
|
| 3079 |
-
|
| 3080 |
-
|
| 3081 |
-
|
| 3082 |
-
|
| 3083 |
-
|
| 3084 |
-
|
| 3085 |
-
norm_preserve=self.norm_preserve,
|
| 3086 |
-
regularization=sae_reg,
|
| 3087 |
-
)
|
| 3088 |
-
if fc == 0:
|
| 3089 |
-
fc = self._project_moe_experts(
|
| 3090 |
-
ffn, sd,
|
| 3091 |
norm_preserve=self.norm_preserve,
|
| 3092 |
regularization=sae_reg,
|
| 3093 |
-
project_biases=False,
|
| 3094 |
)
|
| 3095 |
-
|
| 3096 |
-
|
| 3097 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3098 |
del sd
|
|
|
|
| 3099 |
total_sae_projections += sae_count
|
| 3100 |
count += sae_count
|
| 3101 |
|
|
@@ -3156,23 +3193,26 @@ class AbliterationPipeline:
|
|
| 3156 |
model = self.handle.model
|
| 3157 |
if last_strong in self.refusal_subspaces:
|
| 3158 |
subspace = self.refusal_subspaces[last_strong]
|
| 3159 |
-
|
| 3160 |
-
|
| 3161 |
-
|
| 3162 |
-
|
| 3163 |
-
|
| 3164 |
-
|
| 3165 |
-
|
| 3166 |
-
|
| 3167 |
-
|
| 3168 |
-
|
| 3169 |
-
|
| 3170 |
-
|
| 3171 |
-
|
| 3172 |
-
|
| 3173 |
-
|
| 3174 |
-
|
| 3175 |
-
|
|
|
|
|
|
|
|
|
|
| 3176 |
if lm_head_count > 0:
|
| 3177 |
total_modified += lm_head_count
|
| 3178 |
self.log(f" lm_head: {lm_head_count} projections")
|
|
@@ -3339,7 +3379,7 @@ class AbliterationPipeline:
|
|
| 3339 |
if n_dirs > 1:
|
| 3340 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 3341 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 3342 |
-
diff_matrix = harmful_stack - harmless_stack
|
| 3343 |
if torch.isfinite(diff_matrix).all():
|
| 3344 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
| 3345 |
_, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
|
|
@@ -3374,7 +3414,7 @@ class AbliterationPipeline:
|
|
| 3374 |
else:
|
| 3375 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 3376 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 3377 |
-
diff_matrix = harmful_stack - harmless_stack
|
| 3378 |
if not torch.isfinite(diff_matrix).all():
|
| 3379 |
diff_matrix = torch.nan_to_num(diff_matrix, nan=0.0, posinf=0.0, neginf=0.0)
|
| 3380 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
|
|
|
| 949 |
self.log(f" Router profiling complete: {n_profiled} MoE layers profiled")
|
| 950 |
|
| 951 |
for idx in range(n_layers):
|
| 952 |
+
if self._harmful_acts[idx] and self._harmless_acts[idx]:
|
| 953 |
+
self._harmful_means[idx] = torch.stack(self._harmful_acts[idx]).mean(dim=0)
|
| 954 |
+
self._harmless_means[idx] = torch.stack(self._harmless_acts[idx]).mean(dim=0)
|
| 955 |
+
else:
|
| 956 |
+
# Layer produced no activations (hook failure or skipped layer)
|
| 957 |
+
hidden = self._harmful_acts[0][0].shape[-1] if self._harmful_acts.get(0) else 768
|
| 958 |
+
self._harmful_means[idx] = torch.zeros(1, hidden)
|
| 959 |
+
self._harmless_means[idx] = torch.zeros(1, hidden)
|
| 960 |
|
| 961 |
# ββ Jailbreak-contrastive probing βββββββββββββββββββββββββββββββββ
|
| 962 |
if self.use_jailbreak_contrast:
|
|
|
|
| 1014 |
|
| 1015 |
n = len(prompts)
|
| 1016 |
self.log(f" Wrapping {n} prompts with chat template")
|
| 1017 |
+
|
| 1018 |
+
# Try batch application first (single call, much faster for large sets)
|
| 1019 |
+
all_conversations = [[{"role": "user", "content": p}] for p in prompts]
|
| 1020 |
+
try:
|
| 1021 |
+
wrapped = [
|
| 1022 |
+
tokenizer.apply_chat_template(
|
| 1023 |
+
conv, tokenize=False, add_generation_prompt=True
|
| 1024 |
+
)
|
| 1025 |
+
for conv in all_conversations
|
| 1026 |
+
]
|
| 1027 |
+
self.log(f" chat template {n}/{n}")
|
| 1028 |
+
return wrapped
|
| 1029 |
+
except Exception:
|
| 1030 |
+
pass # Fall through to per-prompt with error handling
|
| 1031 |
+
|
| 1032 |
wrapped = []
|
| 1033 |
+
for i, conv in enumerate(all_conversations):
|
|
|
|
| 1034 |
try:
|
| 1035 |
text = tokenizer.apply_chat_template(
|
| 1036 |
+
conv, tokenize=False, add_generation_prompt=True
|
| 1037 |
)
|
| 1038 |
wrapped.append(text)
|
| 1039 |
except Exception:
|
| 1040 |
+
wrapped.append(prompts[i]) # fallback to raw if individual prompt fails
|
| 1041 |
+
self.log(f" chat template {n}/{n}")
|
|
|
|
| 1042 |
return wrapped
|
| 1043 |
|
| 1044 |
@staticmethod
|
|
|
|
| 1445 |
if n_dirs > 1:
|
| 1446 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 1447 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 1448 |
+
diff_matrix = (harmful_stack - harmless_stack).float()
|
| 1449 |
if torch.isfinite(diff_matrix).all():
|
| 1450 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
| 1451 |
_, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
|
|
|
|
| 1494 |
# SVD-based multi-direction extraction (Gabliteration)
|
| 1495 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1) # (n_prompts, hidden)
|
| 1496 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 1497 |
+
diff_matrix = (harmful_stack - harmless_stack).float() # float32 for SVD stability
|
| 1498 |
|
| 1499 |
# SVD to extract principal refusal directions
|
| 1500 |
if not torch.isfinite(diff_matrix).all():
|
|
|
|
| 3065 |
# remove components that lie in both subspaces (violating
|
| 3066 |
# the GRRO's independent-αᵢ assumption; see theory journal
|
| 3067 |
# §12.6 "SAE-SVD Orthogonalization").
|
| 3068 |
+
# Batch orthogonalization: project out SVD subspace from all
|
| 3069 |
+
# SAE directions at once (replaces O(n_sae * n_svd) loop).
|
| 3070 |
+
svd_sub = subspace.to(sae_dirs.device) # (n_svd, hidden_dim)
|
| 3071 |
+
overlaps = sae_dirs @ svd_sub.T # (n_sae, n_svd)
|
| 3072 |
+
sae_dirs -= overlaps @ svd_sub # project out SVD subspace
|
| 3073 |
+
# Zero collapsed directions BEFORE normalizing to avoid
|
| 3074 |
+
# amplifying floating-point noise in near-zero directions.
|
| 3075 |
+
sae_norms = sae_dirs.norm(dim=-1, keepdim=True)
|
| 3076 |
+
collapsed_mask = (sae_norms.squeeze(-1) < 1e-8)
|
| 3077 |
+
if collapsed_mask.any():
|
| 3078 |
+
sae_dirs[collapsed_mask] = 0.0
|
| 3079 |
+
# Re-normalize surviving directions only
|
| 3080 |
+
surviving = ~collapsed_mask
|
| 3081 |
+
if surviving.any():
|
| 3082 |
+
sae_dirs[surviving] = sae_dirs[surviving] / sae_norms[surviving].clamp(min=1e-12)
|
| 3083 |
sae_count = 0
|
| 3084 |
# SAE regularization: for inversion modes, use a much
|
| 3085 |
# gentler floor (0.6 = 40% removal) since these are
|
|
|
|
| 3087 |
# projection which already uses full reflection.
|
| 3088 |
sae_reg_floor = 0.6 if self.invert_refusal else 0.3
|
| 3089 |
sae_reg = max(layer_reg, sae_reg_floor) if not self.invert_refusal else sae_reg_floor
|
| 3090 |
+
# Cache module lookups and pre-transfer SAE directions
|
| 3091 |
+
sae_attn = None
|
| 3092 |
+
sae_ffn = None
|
| 3093 |
+
try:
|
| 3094 |
+
sae_attn = get_attention_module(layers[idx], arch)
|
| 3095 |
+
except (AttributeError, RuntimeError):
|
| 3096 |
+
pass
|
| 3097 |
+
try:
|
| 3098 |
+
sae_ffn = get_ffn_module(layers[idx], arch)
|
| 3099 |
+
except (AttributeError, RuntimeError):
|
| 3100 |
+
pass
|
| 3101 |
+
sae_dirs_on_device = sae_dirs.to(device)
|
| 3102 |
+
for si in range(sae_dirs_on_device.shape[0]):
|
| 3103 |
# Skip SAE directions that collapsed to near-zero
|
| 3104 |
# after orthogonalization (fully redundant with SVD)
|
| 3105 |
+
if sae_dirs_on_device[si].norm() < 1e-6:
|
| 3106 |
continue
|
| 3107 |
+
sd = sae_dirs_on_device[si].unsqueeze(-1)
|
| 3108 |
+
if sae_attn is not None:
|
| 3109 |
+
try:
|
| 3110 |
+
sae_count += self._project_out_advanced(
|
| 3111 |
+
sae_attn, sd, _ATTN_OUT_NAMES,
|
| 3112 |
+
norm_preserve=self.norm_preserve,
|
| 3113 |
+
regularization=sae_reg,
|
| 3114 |
+
)
|
| 3115 |
+
except (AttributeError, RuntimeError):
|
| 3116 |
+
pass
|
| 3117 |
+
if sae_ffn is not None:
|
| 3118 |
+
try:
|
| 3119 |
+
fc = self._project_out_advanced(
|
| 3120 |
+
sae_ffn, sd, _FFN_OUT_NAMES,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3121 |
norm_preserve=self.norm_preserve,
|
| 3122 |
regularization=sae_reg,
|
|
|
|
| 3123 |
)
|
| 3124 |
+
if fc == 0:
|
| 3125 |
+
fc = self._project_moe_experts(
|
| 3126 |
+
sae_ffn, sd,
|
| 3127 |
+
norm_preserve=self.norm_preserve,
|
| 3128 |
+
regularization=sae_reg,
|
| 3129 |
+
project_biases=False,
|
| 3130 |
+
)
|
| 3131 |
+
sae_count += fc
|
| 3132 |
+
except (AttributeError, RuntimeError):
|
| 3133 |
+
pass
|
| 3134 |
del sd
|
| 3135 |
+
del sae_dirs_on_device
|
| 3136 |
total_sae_projections += sae_count
|
| 3137 |
count += sae_count
|
| 3138 |
|
|
|
|
| 3193 |
model = self.handle.model
|
| 3194 |
if last_strong in self.refusal_subspaces:
|
| 3195 |
subspace = self.refusal_subspaces[last_strong]
|
| 3196 |
+
lm_device = self._get_model_device(model)
|
| 3197 |
+
# Pre-transfer subspace and resolve lm_head module once
|
| 3198 |
+
subspace_on_device = subspace.to(lm_device)
|
| 3199 |
+
lm_head_name = None
|
| 3200 |
+
for head_name in ["lm_head", "embed_out", "output"]:
|
| 3201 |
+
head = getattr(model, head_name, None)
|
| 3202 |
+
if head is not None and hasattr(head, "weight"):
|
| 3203 |
+
lm_head_name = head_name
|
| 3204 |
+
break
|
| 3205 |
+
if lm_head_name is not None:
|
| 3206 |
+
lm_reg = (1.0 - self.reflection_strength) if self.invert_refusal else 0.0
|
| 3207 |
+
for dir_idx in range(subspace_on_device.shape[0]):
|
| 3208 |
+
d = subspace_on_device[dir_idx].unsqueeze(-1)
|
| 3209 |
+
lm_head_count += self._project_out_advanced(
|
| 3210 |
+
model, d, [lm_head_name],
|
| 3211 |
+
norm_preserve=self.norm_preserve,
|
| 3212 |
+
regularization=lm_reg,
|
| 3213 |
+
)
|
| 3214 |
+
del d
|
| 3215 |
+
del subspace_on_device
|
| 3216 |
if lm_head_count > 0:
|
| 3217 |
total_modified += lm_head_count
|
| 3218 |
self.log(f" lm_head: {lm_head_count} projections")
|
|
|
|
| 3379 |
if n_dirs > 1:
|
| 3380 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 3381 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 3382 |
+
diff_matrix = (harmful_stack - harmless_stack).float()
|
| 3383 |
if torch.isfinite(diff_matrix).all():
|
| 3384 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
| 3385 |
_, _, Vh = torch.linalg.svd(diff_matrix, full_matrices=False)
|
|
|
|
| 3414 |
else:
|
| 3415 |
harmful_stack = torch.stack(self._harmful_acts[idx]).squeeze(1)
|
| 3416 |
harmless_stack = torch.stack(self._harmless_acts[idx]).squeeze(1)
|
| 3417 |
+
diff_matrix = (harmful_stack - harmless_stack).float() # float32 for SVD stability
|
| 3418 |
if not torch.isfinite(diff_matrix).all():
|
| 3419 |
diff_matrix = torch.nan_to_num(diff_matrix, nan=0.0, posinf=0.0, neginf=0.0)
|
| 3420 |
k = min(n_dirs, diff_matrix.shape[0], diff_matrix.shape[1])
|
obliteratus/cli.py
CHANGED
|
@@ -43,7 +43,7 @@ def main(argv: list[str] | None = None):
|
|
| 43 |
)
|
| 44 |
|
| 45 |
# --- models ---
|
| 46 |
-
models_parser = subparsers.add_parser("models", help="Browse
|
| 47 |
models_parser.add_argument(
|
| 48 |
"--tier",
|
| 49 |
type=str,
|
|
|
|
| 43 |
)
|
| 44 |
|
| 45 |
# --- models ---
|
| 46 |
+
models_parser = subparsers.add_parser("models", help="Browse curated models by compute tier")
|
| 47 |
models_parser.add_argument(
|
| 48 |
"--tier",
|
| 49 |
type=str,
|
scripts/run_benchmark_remote.sh
CHANGED
|
@@ -92,8 +92,8 @@ os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
|
|
| 92 |
import torch
|
| 93 |
import torch.nn as nn
|
| 94 |
|
| 95 |
-
# Add app dir to path (HF Space layout)
|
| 96 |
-
sys.path.insert(0, "/home/user/app")
|
| 97 |
|
| 98 |
# ββ Hotpatch: fix device detection for accelerate device_map="auto" ββββββ
|
| 99 |
# The deployed Space code uses next(model.parameters()).device which is
|
|
|
|
| 92 |
import torch
|
| 93 |
import torch.nn as nn
|
| 94 |
|
| 95 |
+
# Add app dir to path (HF Space layout: /home/user/app)
|
| 96 |
+
sys.path.insert(0, os.environ.get("APP_DIR", "/home/user/app"))
|
| 97 |
|
| 98 |
# ββ Hotpatch: fix device detection for accelerate device_map="auto" ββββββ
|
| 99 |
# The deployed Space code uses next(model.parameters()).device which is
|