Spaces:
Running
Running
Upload 130 files
Browse files- CHANGELOG.md +15 -1
- CONTRIBUTING.md +2 -2
- README.md +7 -6
- app.py +104 -45
- docs/index.html +3 -3
- hf-spaces/README.md +1 -0
- obliteratus/.DS_Store +0 -0
- obliteratus/__init__.py +1 -1
- obliteratus/abliterate.py +20 -9
- obliteratus/cli.py +5 -2
- obliteratus/evaluation/benchmarks.py +4 -3
- obliteratus/informed_pipeline.py +5 -0
- obliteratus/local_ui.py +1 -2
- pyproject.toml +2 -2
- tests/test_cli.py +9 -6
CHANGELOG.md
CHANGED
|
@@ -3,6 +3,20 @@
|
|
| 3 |
All notable changes to OBLITERATUS are documented here.
|
| 4 |
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
## [0.1.1] - 2026-03-01
|
| 7 |
|
| 8 |
### Fixed
|
|
@@ -39,7 +53,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
| 39 |
- **lm-eval-harness integration** for standardized benchmarking
|
| 40 |
- **Reproducibility framework** with deterministic seeds and full metadata logging
|
| 41 |
- **Telemetry** (opt-in only, anonymized, allowlisted fields)
|
| 42 |
-
- **
|
| 43 |
- **Research paper** (`paper/main.tex`) with geometric theory of refusal removal
|
| 44 |
- Dual license: AGPL-3.0 + commercial
|
| 45 |
|
|
|
|
| 3 |
All notable changes to OBLITERATUS are documented here.
|
| 4 |
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
| 5 |
|
| 6 |
+
## [0.1.2] - 2026-03-03
|
| 7 |
+
|
| 8 |
+
### Fixed
|
| 9 |
+
- Fixed `spaces.GPU` `AttributeError` crash on HuggingFace Spaces — fallback now catches
|
| 10 |
+
both `ImportError` and `AttributeError` so the Space gracefully degrades to CPU mode
|
| 11 |
+
when ZeroGPU is unavailable
|
| 12 |
+
- Added missing `hardware: zero-a10g` to HF Space metadata (`hf-spaces/README.md`) —
|
| 13 |
+
required for the `spaces` package to expose the `@spaces.GPU` decorator
|
| 14 |
+
|
| 15 |
+
### Improved
|
| 16 |
+
- Added mypy type checking to CI pipeline (`continue-on-error` while baseline is established)
|
| 17 |
+
- Added `mypy` to dev dependencies
|
| 18 |
+
- Version bump to 0.1.2 across `pyproject.toml` and `__init__.py`
|
| 19 |
+
|
| 20 |
## [0.1.1] - 2026-03-01
|
| 21 |
|
| 22 |
### Fixed
|
|
|
|
| 53 |
- **lm-eval-harness integration** for standardized benchmarking
|
| 54 |
- **Reproducibility framework** with deterministic seeds and full metadata logging
|
| 55 |
- **Telemetry** (opt-in only, anonymized, allowlisted fields)
|
| 56 |
+
- **823 tests** across 28 test files (incl. CLI dispatch, shared fixtures)
|
| 57 |
- **Research paper** (`paper/main.tex`) with geometric theory of refusal removal
|
| 58 |
- Dual license: AGPL-3.0 + commercial
|
| 59 |
|
CONTRIBUTING.md
CHANGED
|
@@ -15,7 +15,7 @@ This installs the package in editable mode with test dependencies (pytest, ruff)
|
|
| 15 |
## Running Tests
|
| 16 |
|
| 17 |
```bash
|
| 18 |
-
pytest # full suite (
|
| 19 |
pytest tests/test_abliterate.py # single file
|
| 20 |
pytest -x # stop on first failure
|
| 21 |
pytest -k "test_name" # run specific test
|
|
@@ -91,7 +91,7 @@ obliteratus/
|
|
| 91 |
models/ # Model loading utilities
|
| 92 |
reporting/ # Report generation
|
| 93 |
strategies/ # Ablation strategies (layer, head, FFN, embedding)
|
| 94 |
-
tests/ #
|
| 95 |
paper/ # LaTeX paper
|
| 96 |
examples/ # YAML config examples
|
| 97 |
```
|
|
|
|
| 15 |
## Running Tests
|
| 16 |
|
| 17 |
```bash
|
| 18 |
+
pytest # full suite (823 tests)
|
| 19 |
pytest tests/test_abliterate.py # single file
|
| 20 |
pytest -x # stop on first failure
|
| 21 |
pytest -k "test_name" # run specific test
|
|
|
|
| 91 |
models/ # Model loading utilities
|
| 92 |
reporting/ # Report generation
|
| 93 |
strategies/ # Ablation strategies (layer, head, FFN, embedding)
|
| 94 |
+
tests/ # 28 test files
|
| 95 |
paper/ # LaTeX paper
|
| 96 |
examples/ # YAML config examples
|
| 97 |
```
|
README.md
CHANGED
|
@@ -3,9 +3,9 @@ title: OBLITERATUS
|
|
| 3 |
emoji: "\U0001F513"
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
-
sdk:
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
-
suggested_hardware: t4-small
|
| 9 |
pinned: true
|
| 10 |
license: agpl-3.0
|
| 11 |
tags:
|
|
@@ -156,6 +156,7 @@ pipeline = AbliterationPipeline(
|
|
| 156 |
model_name="meta-llama/Llama-3.1-8B-Instruct",
|
| 157 |
method="advanced",
|
| 158 |
output_dir="abliterated",
|
|
|
|
| 159 |
)
|
| 160 |
result = pipeline.run()
|
| 161 |
```
|
|
@@ -356,7 +357,7 @@ obliteratus run examples/preset_quick.yaml
|
|
| 356 |
| Analysis-informed abliteration | Yes (closed-loop feedback) | N/A | N/A | N/A | N/A | N/A |
|
| 357 |
| Auto parameter optimization | Analysis-guided | N/A | Bayesian (Optuna) | N/A | N/A | N/A |
|
| 358 |
| Model compatibility | Any HuggingFace model | ~50 architectures | 16/16 tested | TransformerLens only | HuggingFace | TransformerLens |
|
| 359 |
-
| Test suite |
|
| 360 |
|
| 361 |
## Community contributions
|
| 362 |
|
|
@@ -434,7 +435,7 @@ metrics:
|
|
| 434 |
- perplexity
|
| 435 |
|
| 436 |
batch_size: 4
|
| 437 |
-
max_length: 256
|
| 438 |
output_dir: results/my_run
|
| 439 |
```
|
| 440 |
|
|
@@ -465,7 +466,7 @@ If you use OBLITERATUS in your research, please cite:
|
|
| 465 |
author = {{OBLITERATUS Contributors}},
|
| 466 |
year = {2026},
|
| 467 |
url = {https://github.com/obliteratus-project/OBLITERATUS},
|
| 468 |
-
note = {15 analysis modules,
|
| 469 |
}
|
| 470 |
```
|
| 471 |
|
|
@@ -476,7 +477,7 @@ pip install -e ".[dev]"
|
|
| 476 |
pytest
|
| 477 |
```
|
| 478 |
|
| 479 |
-
|
| 480 |
|
| 481 |
## License
|
| 482 |
|
|
|
|
| 3 |
emoji: "\U0001F513"
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.29.0"
|
| 8 |
app_file: app.py
|
|
|
|
| 9 |
pinned: true
|
| 10 |
license: agpl-3.0
|
| 11 |
tags:
|
|
|
|
| 156 |
model_name="meta-llama/Llama-3.1-8B-Instruct",
|
| 157 |
method="advanced",
|
| 158 |
output_dir="abliterated",
|
| 159 |
+
max_seq_length=512, # optional: override tokenizer truncation length for all pipeline stages
|
| 160 |
)
|
| 161 |
result = pipeline.run()
|
| 162 |
```
|
|
|
|
| 357 |
| Analysis-informed abliteration | Yes (closed-loop feedback) | N/A | N/A | N/A | N/A | N/A |
|
| 358 |
| Auto parameter optimization | Analysis-guided | N/A | Bayesian (Optuna) | N/A | N/A | N/A |
|
| 359 |
| Model compatibility | Any HuggingFace model | ~50 architectures | 16/16 tested | TransformerLens only | HuggingFace | TransformerLens |
|
| 360 |
+
| Test suite | 823 tests | Community | Unknown | None | Minimal | Moderate |
|
| 361 |
|
| 362 |
## Community contributions
|
| 363 |
|
|
|
|
| 435 |
- perplexity
|
| 436 |
|
| 437 |
batch_size: 4
|
| 438 |
+
max_length: 256 # tokenizer truncation length (default 512)
|
| 439 |
output_dir: results/my_run
|
| 440 |
```
|
| 441 |
|
|
|
|
| 466 |
author = {{OBLITERATUS Contributors}},
|
| 467 |
year = {2026},
|
| 468 |
url = {https://github.com/obliteratus-project/OBLITERATUS},
|
| 469 |
+
note = {15 analysis modules, 823 tests}
|
| 470 |
}
|
| 471 |
```
|
| 472 |
|
|
|
|
| 477 |
pytest
|
| 478 |
```
|
| 479 |
|
| 480 |
+
823 tests across 28 test files covering CLI, all analysis modules, abliteration pipeline, architecture detection, community contributions, edge cases, and evaluation metrics.
|
| 481 |
|
| 482 |
## License
|
| 483 |
|
app.py
CHANGED
|
@@ -68,17 +68,19 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
|
|
| 68 |
# and we fall back to a no-op decorator so the same code works everywhere.
|
| 69 |
try:
|
| 70 |
import spaces
|
|
|
|
| 71 |
_ZEROGPU_AVAILABLE = True
|
| 72 |
-
except ImportError:
|
| 73 |
_ZEROGPU_AVAILABLE = False
|
| 74 |
-
# Create a no-op decorator that mirrors spaces.GPU interface
|
|
|
|
| 75 |
class _FakeSpaces:
|
| 76 |
@staticmethod
|
| 77 |
def GPU(duration: int = 60, **kwargs):
|
| 78 |
def decorator(fn):
|
| 79 |
return fn
|
| 80 |
return decorator
|
| 81 |
-
spaces = _FakeSpaces()
|
| 82 |
|
| 83 |
# ---------------------------------------------------------------------------
|
| 84 |
# Global state
|
|
@@ -703,27 +705,44 @@ def benchmark(
|
|
| 703 |
|
| 704 |
def run_pipeline():
|
| 705 |
try:
|
| 706 |
-
from obliteratus.abliterate import AbliterationPipeline
|
| 707 |
-
|
| 708 |
if prompt_volume > 0:
|
| 709 |
n = min(prompt_volume, len(harmful_all), len(harmless_all))
|
| 710 |
else:
|
| 711 |
n = min(len(harmful_all), len(harmless_all))
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
except Exception as e:
|
| 728 |
nonlocal run_error
|
| 729 |
run_error = e
|
|
@@ -1029,24 +1048,41 @@ def benchmark_multi_model(
|
|
| 1029 |
|
| 1030 |
def run_pipeline():
|
| 1031 |
try:
|
| 1032 |
-
from obliteratus.abliterate import AbliterationPipeline
|
| 1033 |
-
|
| 1034 |
n = actual_n
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1050 |
except Exception as e:
|
| 1051 |
nonlocal run_error
|
| 1052 |
run_error = e
|
|
@@ -1461,6 +1497,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
|
| 1461 |
# Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
|
| 1462 |
_max_pipeline_secs = 45 * 60
|
| 1463 |
_pipeline_start = time.time()
|
|
|
|
| 1464 |
while worker.is_alive():
|
| 1465 |
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
|
| 1466 |
if len(log_lines) > last_yielded[0]:
|
|
@@ -1741,7 +1778,7 @@ def _strip_reasoning_tokens(text: str) -> str:
|
|
| 1741 |
@spaces.GPU(duration=120)
|
| 1742 |
def chat_respond(message: str, history: list[dict], system_prompt: str,
|
| 1743 |
temperature: float, top_p: float, max_tokens: int,
|
| 1744 |
-
repetition_penalty: float):
|
| 1745 |
"""Stream a response from the liberated model.
|
| 1746 |
|
| 1747 |
On ZeroGPU, allocates a GPU for up to 2 minutes per response.
|
|
@@ -1761,6 +1798,7 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
|
| 1761 |
temperature = max(0.0, min(1.5, float(temperature)))
|
| 1762 |
top_p = max(0.0, min(1.0, float(top_p)))
|
| 1763 |
repetition_penalty = max(1.0, min(2.0, float(repetition_penalty)))
|
|
|
|
| 1764 |
|
| 1765 |
# Build messages — cap history to prevent unbounded memory use
|
| 1766 |
messages = []
|
|
@@ -1777,7 +1815,7 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
|
| 1777 |
# Fallback: simple concatenation
|
| 1778 |
text = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
|
| 1779 |
|
| 1780 |
-
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=
|
| 1781 |
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 1782 |
|
| 1783 |
# Streaming generation — repetition_penalty and no_repeat_ngram_size
|
|
@@ -2044,7 +2082,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2044 |
@spaces.GPU(duration=120)
|
| 2045 |
def ab_chat_respond(message: str, history_left: list[dict], history_right: list[dict],
|
| 2046 |
system_prompt: str, temperature: float, top_p: float,
|
| 2047 |
-
max_tokens: int, repetition_penalty: float):
|
|
|
|
| 2048 |
"""Generate responses from BOTH original and abliterated model side-by-side.
|
| 2049 |
|
| 2050 |
Left panel = original (pre-abliteration), Right panel = abliterated.
|
|
@@ -2076,6 +2115,7 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
|
| 2076 |
temperature = max(0.0, min(1.5, float(temperature)))
|
| 2077 |
top_p = max(0.0, min(1.0, float(top_p)))
|
| 2078 |
repetition_penalty = max(1.0, min(2.0, float(repetition_penalty)))
|
|
|
|
| 2079 |
|
| 2080 |
# Build messages — cap history to prevent unbounded memory use
|
| 2081 |
messages = []
|
|
@@ -2091,7 +2131,7 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
|
| 2091 |
except Exception:
|
| 2092 |
text = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
|
| 2093 |
|
| 2094 |
-
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=
|
| 2095 |
|
| 2096 |
gen_kwargs_base = {
|
| 2097 |
"max_new_tokens": int(max_tokens),
|
|
@@ -2279,8 +2319,13 @@ def strength_sweep(model_choice: str, method_choice: str,
|
|
| 2279 |
|
| 2280 |
def _run_sweep_point():
|
| 2281 |
try:
|
|
|
|
| 2282 |
pipe = AbliterationPipeline(
|
| 2283 |
model_id, method=method_key,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2284 |
trust_remote_code=is_preset,
|
| 2285 |
harmful_prompts=harmful, harmless_prompts=harmless,
|
| 2286 |
regularization=reg,
|
|
@@ -2316,6 +2361,9 @@ def strength_sweep(model_choice: str, method_choice: str,
|
|
| 2316 |
entry["refusal_rate"] = metrics.get("refusal_rate")
|
| 2317 |
entry["coherence"] = metrics.get("coherence")
|
| 2318 |
entry["strong_layers"] = len(pipe._strong_layers)
|
|
|
|
|
|
|
|
|
|
| 2319 |
del pipe
|
| 2320 |
|
| 2321 |
results.append(entry)
|
|
@@ -3114,7 +3162,8 @@ result = client.predict(
|
|
| 3114 |
)
|
| 3115 |
bench_methods = gr.CheckboxGroup(
|
| 3116 |
choices=["basic", "advanced", "aggressive", "spectral_cascade",
|
| 3117 |
-
"informed", "surgical", "optimized", "inverted", "nuclear"],
|
|
|
|
| 3118 |
value=["basic", "advanced", "spectral_cascade", "surgical"],
|
| 3119 |
label="Methods to Compare",
|
| 3120 |
)
|
|
@@ -3438,12 +3487,17 @@ Pre-configured benchmark configurations for common research questions.
|
|
| 3438 |
label="Repetition Penalty",
|
| 3439 |
info="Penalizes repeated tokens — higher values break refusal loops (1.0 = off)",
|
| 3440 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3441 |
|
| 3442 |
gr.ChatInterface(
|
| 3443 |
fn=chat_respond,
|
| 3444 |
type="messages",
|
| 3445 |
chatbot=gr.Chatbot(height="11vh", type="messages"),
|
| 3446 |
-
additional_inputs=[system_prompt, temperature, top_p, max_tokens, repetition_penalty],
|
| 3447 |
fill_height=True,
|
| 3448 |
)
|
| 3449 |
|
|
@@ -3507,6 +3561,11 @@ See exactly how abliteration changes model behavior on the same prompt.
|
|
| 3507 |
ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
|
| 3508 |
ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
|
| 3509 |
ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.15, step=0.05, label="Rep Penalty")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3510 |
|
| 3511 |
with gr.Row():
|
| 3512 |
with gr.Column():
|
|
@@ -3533,7 +3592,7 @@ See exactly how abliteration changes model behavior on the same prompt.
|
|
| 3533 |
ab_send_btn.click(
|
| 3534 |
fn=ab_chat_respond,
|
| 3535 |
inputs=[ab_input, ab_chatbot_left, ab_chatbot_right,
|
| 3536 |
-
ab_system_prompt, ab_temp, ab_top_p, ab_max_tokens, ab_rep_penalty],
|
| 3537 |
outputs=[ab_chatbot_left, ab_chatbot_right, ab_status,
|
| 3538 |
ab_header_left, ab_header_right],
|
| 3539 |
)
|
|
@@ -3541,7 +3600,7 @@ See exactly how abliteration changes model behavior on the same prompt.
|
|
| 3541 |
ab_input.submit(
|
| 3542 |
fn=ab_chat_respond,
|
| 3543 |
inputs=[ab_input, ab_chatbot_left, ab_chatbot_right,
|
| 3544 |
-
ab_system_prompt, ab_temp, ab_top_p, ab_max_tokens, ab_rep_penalty],
|
| 3545 |
outputs=[ab_chatbot_left, ab_chatbot_right, ab_status,
|
| 3546 |
ab_header_left, ab_header_right],
|
| 3547 |
)
|
|
|
|
| 68 |
# and we fall back to a no-op decorator so the same code works everywhere.
|
| 69 |
try:
|
| 70 |
import spaces
|
| 71 |
+
spaces.GPU # Verify ZeroGPU decorator is actually available
|
| 72 |
_ZEROGPU_AVAILABLE = True
|
| 73 |
+
except (ImportError, AttributeError):
|
| 74 |
_ZEROGPU_AVAILABLE = False
|
| 75 |
+
# Create a no-op decorator that mirrors spaces.GPU interface so the same
|
| 76 |
+
# code runs locally, on CPU-only Spaces, and on ZeroGPU Spaces.
|
| 77 |
class _FakeSpaces:
|
| 78 |
@staticmethod
|
| 79 |
def GPU(duration: int = 60, **kwargs):
|
| 80 |
def decorator(fn):
|
| 81 |
return fn
|
| 82 |
return decorator
|
| 83 |
+
spaces = _FakeSpaces() # type: ignore[assignment]
|
| 84 |
|
| 85 |
# ---------------------------------------------------------------------------
|
| 86 |
# Global state
|
|
|
|
| 705 |
|
| 706 |
def run_pipeline():
|
| 707 |
try:
|
|
|
|
|
|
|
| 708 |
if prompt_volume > 0:
|
| 709 |
n = min(prompt_volume, len(harmful_all), len(harmless_all))
|
| 710 |
else:
|
| 711 |
n = min(len(harmful_all), len(harmless_all))
|
| 712 |
+
|
| 713 |
+
if method_key == "informed":
|
| 714 |
+
from obliteratus.informed_pipeline import InformedAbliterationPipeline
|
| 715 |
+
pipeline = InformedAbliterationPipeline(
|
| 716 |
+
model_name=model_id,
|
| 717 |
+
output_dir=f"/tmp/bench_{method_key}",
|
| 718 |
+
device="auto",
|
| 719 |
+
dtype="float16",
|
| 720 |
+
quantization=quantization,
|
| 721 |
+
trust_remote_code=is_preset,
|
| 722 |
+
harmful_prompts=harmful_all[:n],
|
| 723 |
+
harmless_prompts=harmless_all[:n],
|
| 724 |
+
on_stage=on_stage,
|
| 725 |
+
on_log=on_log,
|
| 726 |
+
)
|
| 727 |
+
pipeline_ref[0] = pipeline
|
| 728 |
+
pipeline.run_informed()
|
| 729 |
+
else:
|
| 730 |
+
from obliteratus.abliterate import AbliterationPipeline
|
| 731 |
+
pipeline = AbliterationPipeline(
|
| 732 |
+
model_name=model_id,
|
| 733 |
+
output_dir=f"/tmp/bench_{method_key}",
|
| 734 |
+
device="auto",
|
| 735 |
+
dtype="float16",
|
| 736 |
+
method=method_key,
|
| 737 |
+
quantization=quantization,
|
| 738 |
+
trust_remote_code=is_preset,
|
| 739 |
+
harmful_prompts=harmful_all[:n],
|
| 740 |
+
harmless_prompts=harmless_all[:n],
|
| 741 |
+
on_stage=on_stage,
|
| 742 |
+
on_log=on_log,
|
| 743 |
+
)
|
| 744 |
+
pipeline_ref[0] = pipeline
|
| 745 |
+
pipeline.run()
|
| 746 |
except Exception as e:
|
| 747 |
nonlocal run_error
|
| 748 |
run_error = e
|
|
|
|
| 1048 |
|
| 1049 |
def run_pipeline():
|
| 1050 |
try:
|
|
|
|
|
|
|
| 1051 |
n = actual_n
|
| 1052 |
+
|
| 1053 |
+
if method_key == "informed":
|
| 1054 |
+
from obliteratus.informed_pipeline import InformedAbliterationPipeline
|
| 1055 |
+
pipeline = InformedAbliterationPipeline(
|
| 1056 |
+
model_name=model_id,
|
| 1057 |
+
output_dir=f"/tmp/bench_mm_{mi}",
|
| 1058 |
+
device="auto",
|
| 1059 |
+
dtype="float16",
|
| 1060 |
+
quantization=quantization,
|
| 1061 |
+
trust_remote_code=is_preset_model,
|
| 1062 |
+
harmful_prompts=harmful_all[:n],
|
| 1063 |
+
harmless_prompts=harmless_all[:n],
|
| 1064 |
+
on_stage=on_stage,
|
| 1065 |
+
on_log=on_log,
|
| 1066 |
+
)
|
| 1067 |
+
pipeline_ref[0] = pipeline
|
| 1068 |
+
pipeline.run_informed()
|
| 1069 |
+
else:
|
| 1070 |
+
from obliteratus.abliterate import AbliterationPipeline
|
| 1071 |
+
pipeline = AbliterationPipeline(
|
| 1072 |
+
model_name=model_id,
|
| 1073 |
+
output_dir=f"/tmp/bench_mm_{mi}",
|
| 1074 |
+
device="auto",
|
| 1075 |
+
dtype="float16",
|
| 1076 |
+
method=method_key,
|
| 1077 |
+
quantization=quantization,
|
| 1078 |
+
trust_remote_code=is_preset_model,
|
| 1079 |
+
harmful_prompts=harmful_all[:n],
|
| 1080 |
+
harmless_prompts=harmless_all[:n],
|
| 1081 |
+
on_stage=on_stage,
|
| 1082 |
+
on_log=on_log,
|
| 1083 |
+
)
|
| 1084 |
+
pipeline_ref[0] = pipeline
|
| 1085 |
+
pipeline.run()
|
| 1086 |
except Exception as e:
|
| 1087 |
nonlocal run_error
|
| 1088 |
run_error = e
|
|
|
|
| 1497 |
# Stream log updates while pipeline runs (max 45 minutes to prevent indefinite hang)
|
| 1498 |
_max_pipeline_secs = 45 * 60
|
| 1499 |
_pipeline_start = time.time()
|
| 1500 |
+
status_msg = f"**Obliterating\u2026** (0s)"
|
| 1501 |
while worker.is_alive():
|
| 1502 |
status_msg = f"**Obliterating\u2026** ({_elapsed()})"
|
| 1503 |
if len(log_lines) > last_yielded[0]:
|
|
|
|
| 1778 |
@spaces.GPU(duration=120)
|
| 1779 |
def chat_respond(message: str, history: list[dict], system_prompt: str,
|
| 1780 |
temperature: float, top_p: float, max_tokens: int,
|
| 1781 |
+
repetition_penalty: float, context_length: int = 2048):
|
| 1782 |
"""Stream a response from the liberated model.
|
| 1783 |
|
| 1784 |
On ZeroGPU, allocates a GPU for up to 2 minutes per response.
|
|
|
|
| 1798 |
temperature = max(0.0, min(1.5, float(temperature)))
|
| 1799 |
top_p = max(0.0, min(1.0, float(top_p)))
|
| 1800 |
repetition_penalty = max(1.0, min(2.0, float(repetition_penalty)))
|
| 1801 |
+
context_length = max(128, min(32768, int(context_length)))
|
| 1802 |
|
| 1803 |
# Build messages — cap history to prevent unbounded memory use
|
| 1804 |
messages = []
|
|
|
|
| 1815 |
# Fallback: simple concatenation
|
| 1816 |
text = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
|
| 1817 |
|
| 1818 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
|
| 1819 |
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 1820 |
|
| 1821 |
# Streaming generation — repetition_penalty and no_repeat_ngram_size
|
|
|
|
| 2082 |
@spaces.GPU(duration=120)
|
| 2083 |
def ab_chat_respond(message: str, history_left: list[dict], history_right: list[dict],
|
| 2084 |
system_prompt: str, temperature: float, top_p: float,
|
| 2085 |
+
max_tokens: int, repetition_penalty: float,
|
| 2086 |
+
context_length: int = 2048):
|
| 2087 |
"""Generate responses from BOTH original and abliterated model side-by-side.
|
| 2088 |
|
| 2089 |
Left panel = original (pre-abliteration), Right panel = abliterated.
|
|
|
|
| 2115 |
temperature = max(0.0, min(1.5, float(temperature)))
|
| 2116 |
top_p = max(0.0, min(1.0, float(top_p)))
|
| 2117 |
repetition_penalty = max(1.0, min(2.0, float(repetition_penalty)))
|
| 2118 |
+
context_length = max(128, min(32768, int(context_length)))
|
| 2119 |
|
| 2120 |
# Build messages — cap history to prevent unbounded memory use
|
| 2121 |
messages = []
|
|
|
|
| 2131 |
except Exception:
|
| 2132 |
text = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
|
| 2133 |
|
| 2134 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
|
| 2135 |
|
| 2136 |
gen_kwargs_base = {
|
| 2137 |
"max_new_tokens": int(max_tokens),
|
|
|
|
| 2319 |
|
| 2320 |
def _run_sweep_point():
|
| 2321 |
try:
|
| 2322 |
+
quantization = _should_quantize(model_id)
|
| 2323 |
pipe = AbliterationPipeline(
|
| 2324 |
model_id, method=method_key,
|
| 2325 |
+
output_dir=f"/tmp/sweep_{step_i}",
|
| 2326 |
+
device="auto",
|
| 2327 |
+
dtype="float16",
|
| 2328 |
+
quantization=quantization,
|
| 2329 |
trust_remote_code=is_preset,
|
| 2330 |
harmful_prompts=harmful, harmless_prompts=harmless,
|
| 2331 |
regularization=reg,
|
|
|
|
| 2361 |
entry["refusal_rate"] = metrics.get("refusal_rate")
|
| 2362 |
entry["coherence"] = metrics.get("coherence")
|
| 2363 |
entry["strong_layers"] = len(pipe._strong_layers)
|
| 2364 |
+
if hasattr(pipe, "handle") and pipe.handle is not None:
|
| 2365 |
+
pipe.handle.model = None
|
| 2366 |
+
pipe.handle.tokenizer = None
|
| 2367 |
del pipe
|
| 2368 |
|
| 2369 |
results.append(entry)
|
|
|
|
| 3162 |
)
|
| 3163 |
bench_methods = gr.CheckboxGroup(
|
| 3164 |
choices=["basic", "advanced", "aggressive", "spectral_cascade",
|
| 3165 |
+
"informed", "surgical", "optimized", "inverted", "nuclear",
|
| 3166 |
+
"failspy", "gabliteration", "heretic", "rdo"],
|
| 3167 |
value=["basic", "advanced", "spectral_cascade", "surgical"],
|
| 3168 |
label="Methods to Compare",
|
| 3169 |
)
|
|
|
|
| 3487 |
label="Repetition Penalty",
|
| 3488 |
info="Penalizes repeated tokens — higher values break refusal loops (1.0 = off)",
|
| 3489 |
)
|
| 3490 |
+
context_length = gr.Slider(
|
| 3491 |
+
128, 32768, value=2048, step=128,
|
| 3492 |
+
label="Context Length",
|
| 3493 |
+
info="Max input tokens — increase for long conversations, decrease to save VRAM",
|
| 3494 |
+
)
|
| 3495 |
|
| 3496 |
gr.ChatInterface(
|
| 3497 |
fn=chat_respond,
|
| 3498 |
type="messages",
|
| 3499 |
chatbot=gr.Chatbot(height="11vh", type="messages"),
|
| 3500 |
+
additional_inputs=[system_prompt, temperature, top_p, max_tokens, repetition_penalty, context_length],
|
| 3501 |
fill_height=True,
|
| 3502 |
)
|
| 3503 |
|
|
|
|
| 3561 |
ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
|
| 3562 |
ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
|
| 3563 |
ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.15, step=0.05, label="Rep Penalty")
|
| 3564 |
+
ab_context_length = gr.Slider(
|
| 3565 |
+
128, 32768, value=2048, step=128,
|
| 3566 |
+
label="Context Length",
|
| 3567 |
+
info="Max input tokens for both models",
|
| 3568 |
+
)
|
| 3569 |
|
| 3570 |
with gr.Row():
|
| 3571 |
with gr.Column():
|
|
|
|
| 3592 |
ab_send_btn.click(
|
| 3593 |
fn=ab_chat_respond,
|
| 3594 |
inputs=[ab_input, ab_chatbot_left, ab_chatbot_right,
|
| 3595 |
+
ab_system_prompt, ab_temp, ab_top_p, ab_max_tokens, ab_rep_penalty, ab_context_length],
|
| 3596 |
outputs=[ab_chatbot_left, ab_chatbot_right, ab_status,
|
| 3597 |
ab_header_left, ab_header_right],
|
| 3598 |
)
|
|
|
|
| 3600 |
ab_input.submit(
|
| 3601 |
fn=ab_chat_respond,
|
| 3602 |
inputs=[ab_input, ab_chatbot_left, ab_chatbot_right,
|
| 3603 |
+
ab_system_prompt, ab_temp, ab_top_p, ab_max_tokens, ab_rep_penalty, ab_context_length],
|
| 3604 |
outputs=[ab_chatbot_left, ab_chatbot_right, ab_status,
|
| 3605 |
ab_header_left, ab_header_right],
|
| 3606 |
)
|
docs/index.html
CHANGED
|
@@ -796,7 +796,7 @@
|
|
| 796 |
██ ██ ██████ ██ ██ ██ █████ ██████ ███████ ██ ██ ██ ███████
|
| 797 |
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
|
| 798 |
██████ ██████ ███████ ██ ██ ███████ ██ ██ ██ ██ ██ ██████ ███████</div>
|
| 799 |
-
<p class="subtitle">[ <em>MASTER ABLATION SUITE</em> ] — BREAK THE CHAINS THAT BIND YOU. 15 analysis modules.
|
| 800 |
</header>
|
| 801 |
|
| 802 |
<div class="tabs">
|
|
@@ -1253,7 +1253,7 @@
|
|
| 1253 |
<strong style="color:var(--cyan)">linear_cka</strong> (representation similarity) •
|
| 1254 |
<strong style="color:var(--cyan)">effective_rank</strong> (weight matrix health) •
|
| 1255 |
<strong style="color:var(--cyan)">kl_divergence</strong> (distribution shift) •
|
| 1256 |
-
|
| 1257 |
</p>
|
| 1258 |
</div>
|
| 1259 |
|
|
@@ -1461,7 +1461,7 @@
|
|
| 1461 |
</div>
|
| 1462 |
|
| 1463 |
<footer>
|
| 1464 |
-
OBLITERATUS — Master Ablation Suite — 15 modules •
|
| 1465 |
<a href="https://huggingface.co/transformers">HuggingFace Transformers</a>
|
| 1466 |
<span class="sigils">⍓ ⏚ ⍫ ◤ ⍕</span>
|
| 1467 |
</footer>
|
|
|
|
| 796 |
██ ██ ██████ ██ ██ ██ █████ ██████ ███████ ██ ██ ██ ███████
|
| 797 |
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
|
| 798 |
██████ ██████ ███████ ██ ██ ███████ ██ ██ ██ ██ ██ ██████ ███████</div>
|
| 799 |
+
<p class="subtitle">[ <em>MASTER ABLATION SUITE</em> ] — BREAK THE CHAINS THAT BIND YOU. 15 analysis modules. 823 tests.<span class="cursor"></span></p>
|
| 800 |
</header>
|
| 801 |
|
| 802 |
<div class="tabs">
|
|
|
|
| 1253 |
<strong style="color:var(--cyan)">linear_cka</strong> (representation similarity) •
|
| 1254 |
<strong style="color:var(--cyan)">effective_rank</strong> (weight matrix health) •
|
| 1255 |
<strong style="color:var(--cyan)">kl_divergence</strong> (distribution shift) •
|
| 1256 |
+
823 tests across 28 test files.
|
| 1257 |
</p>
|
| 1258 |
</div>
|
| 1259 |
|
|
|
|
| 1461 |
</div>
|
| 1462 |
|
| 1463 |
<footer>
|
| 1464 |
+
OBLITERATUS — Master Ablation Suite — 15 modules • 823 tests • 2 paradigms —
|
| 1465 |
<a href="https://huggingface.co/transformers">HuggingFace Transformers</a>
|
| 1466 |
<span class="sigils">⍓ ⏚ ⍫ ◤ ⍕</span>
|
| 1467 |
</footer>
|
hf-spaces/README.md
CHANGED
|
@@ -6,6 +6,7 @@ colorTo: gray
|
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: "5.29.0"
|
| 8 |
app_file: app.py
|
|
|
|
| 9 |
pinned: true
|
| 10 |
license: agpl-3.0
|
| 11 |
tags:
|
|
|
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: "5.29.0"
|
| 8 |
app_file: app.py
|
| 9 |
+
hardware: zero-a10g
|
| 10 |
pinned: true
|
| 11 |
license: agpl-3.0
|
| 12 |
tags:
|
obliteratus/.DS_Store
CHANGED
|
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
|
|
|
obliteratus/__init__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""Obliteratus — Master Ablation Suite for HuggingFace transformers."""
|
| 2 |
|
| 3 |
-
__version__ = "0.1.
|
| 4 |
|
| 5 |
# Lazy imports for the main pipeline classes
|
| 6 |
__all__ = [
|
|
|
|
| 1 |
"""Obliteratus — Master Ablation Suite for HuggingFace transformers."""
|
| 2 |
|
| 3 |
+
__version__ = "0.1.2"
|
| 4 |
|
| 5 |
# Lazy imports for the main pipeline classes
|
| 6 |
__all__ = [
|
obliteratus/abliterate.py
CHANGED
|
@@ -563,6 +563,7 @@ class AbliterationPipeline:
|
|
| 563 |
spectral_bands: int | None = None,
|
| 564 |
spectral_threshold: float | None = None,
|
| 565 |
large_model_mode: bool = False,
|
|
|
|
| 566 |
on_stage: Callable[[StageResult], None] | None = None,
|
| 567 |
on_log: Callable[[str], None] | None = None,
|
| 568 |
):
|
|
@@ -653,6 +654,12 @@ class AbliterationPipeline:
|
|
| 653 |
self.spectral_bands = spectral_bands if spectral_bands is not None else method_cfg.get("spectral_bands", 3)
|
| 654 |
self.spectral_threshold = spectral_threshold if spectral_threshold is not None else method_cfg.get("spectral_threshold", 0.05)
|
| 655 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
# Large model mode: conservative defaults for 120B+ models.
|
| 657 |
# Reduces memory footprint by limiting SAE features, directions,
|
| 658 |
# and refinement passes. Explicit parameter overrides still apply.
|
|
@@ -1303,17 +1310,21 @@ class AbliterationPipeline:
|
|
| 1303 |
|
| 1304 |
# Adaptive max_length: shorten sequences when GPU memory is tight.
|
| 1305 |
# For CoT-aware mode we need more sequence to capture reasoning tokens.
|
| 1306 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1307 |
free_gb = 0.0
|
| 1308 |
if torch.cuda.is_available():
|
| 1309 |
free_gb = sum(
|
| 1310 |
torch.cuda.mem_get_info(i)[0] / (1024 ** 3)
|
| 1311 |
for i in range(torch.cuda.device_count())
|
| 1312 |
)
|
| 1313 |
-
if free_gb < 2.0:
|
| 1314 |
max_length = 64
|
| 1315 |
self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1316 |
-
elif free_gb < 4.0:
|
| 1317 |
max_length = 128
|
| 1318 |
self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1319 |
|
|
@@ -2622,7 +2633,7 @@ class AbliterationPipeline:
|
|
| 2622 |
batch = self._kl_eval_prompts[i:i + batch_size]
|
| 2623 |
inputs = tokenizer(
|
| 2624 |
batch, return_tensors="pt",
|
| 2625 |
-
padding=True, truncation=True, max_length=256,
|
| 2626 |
)
|
| 2627 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 2628 |
|
|
@@ -3453,7 +3464,7 @@ class AbliterationPipeline:
|
|
| 3453 |
try:
|
| 3454 |
for prompt in kl_prompts:
|
| 3455 |
inputs = tokenizer(
|
| 3456 |
-
prompt, return_tensors="pt", truncation=True, max_length=64,
|
| 3457 |
)
|
| 3458 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 3459 |
with torch.no_grad():
|
|
@@ -3527,7 +3538,7 @@ class AbliterationPipeline:
|
|
| 3527 |
try:
|
| 3528 |
for prompt in kl_prompts[:3]:
|
| 3529 |
inputs = tokenizer(
|
| 3530 |
-
prompt, return_tensors="pt", truncation=True, max_length=64,
|
| 3531 |
)
|
| 3532 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 3533 |
with torch.no_grad():
|
|
@@ -4842,7 +4853,7 @@ class AbliterationPipeline:
|
|
| 4842 |
total_loss = 0.0
|
| 4843 |
n_tokens = 0
|
| 4844 |
for text in reference_texts:
|
| 4845 |
-
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
|
| 4846 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 4847 |
with torch.no_grad():
|
| 4848 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
@@ -5007,7 +5018,7 @@ class AbliterationPipeline:
|
|
| 5007 |
try:
|
| 5008 |
inputs = tokenizer(
|
| 5009 |
batch_formatted, return_tensors="pt",
|
| 5010 |
-
padding=True, truncation=True, max_length=512,
|
| 5011 |
)
|
| 5012 |
# Track per-prompt input lengths (non-pad tokens)
|
| 5013 |
attention_mask = inputs["attention_mask"]
|
|
@@ -5104,7 +5115,7 @@ class AbliterationPipeline:
|
|
| 5104 |
batch = self._kl_eval_prompts[i:i + 8]
|
| 5105 |
inputs = tokenizer(
|
| 5106 |
batch, return_tensors="pt",
|
| 5107 |
-
padding=True, truncation=True, max_length=256,
|
| 5108 |
)
|
| 5109 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 5110 |
with torch.no_grad():
|
|
|
|
| 563 |
spectral_bands: int | None = None,
|
| 564 |
spectral_threshold: float | None = None,
|
| 565 |
large_model_mode: bool = False,
|
| 566 |
+
max_seq_length: int | None = None,
|
| 567 |
on_stage: Callable[[StageResult], None] | None = None,
|
| 568 |
on_log: Callable[[str], None] | None = None,
|
| 569 |
):
|
|
|
|
| 654 |
self.spectral_bands = spectral_bands if spectral_bands is not None else method_cfg.get("spectral_bands", 3)
|
| 655 |
self.spectral_threshold = spectral_threshold if spectral_threshold is not None else method_cfg.get("spectral_threshold", 0.05)
|
| 656 |
|
| 657 |
+
# Tokenizer max_seq_length: controls truncation for all internal
|
| 658 |
+
# tokenizer calls (activation collection, KL eval, verify stage).
|
| 659 |
+
# None means use context-dependent defaults (256 for probes, 512 for
|
| 660 |
+
# verify, etc.) — setting this overrides ALL of them.
|
| 661 |
+
self.max_seq_length = max_seq_length
|
| 662 |
+
|
| 663 |
# Large model mode: conservative defaults for 120B+ models.
|
| 664 |
# Reduces memory footprint by limiting SAE features, directions,
|
| 665 |
# and refinement passes. Explicit parameter overrides still apply.
|
|
|
|
| 1310 |
|
| 1311 |
# Adaptive max_length: shorten sequences when GPU memory is tight.
|
| 1312 |
# For CoT-aware mode we need more sequence to capture reasoning tokens.
|
| 1313 |
+
# User override via max_seq_length takes priority over all heuristics.
|
| 1314 |
+
if self.max_seq_length is not None:
|
| 1315 |
+
max_length = self.max_seq_length
|
| 1316 |
+
else:
|
| 1317 |
+
max_length = 384 if collect_multi_pos else 256
|
| 1318 |
free_gb = 0.0
|
| 1319 |
if torch.cuda.is_available():
|
| 1320 |
free_gb = sum(
|
| 1321 |
torch.cuda.mem_get_info(i)[0] / (1024 ** 3)
|
| 1322 |
for i in range(torch.cuda.device_count())
|
| 1323 |
)
|
| 1324 |
+
if self.max_seq_length is None and free_gb < 2.0:
|
| 1325 |
max_length = 64
|
| 1326 |
self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1327 |
+
elif self.max_seq_length is None and free_gb < 4.0:
|
| 1328 |
max_length = 128
|
| 1329 |
self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1330 |
|
|
|
|
| 2633 |
batch = self._kl_eval_prompts[i:i + batch_size]
|
| 2634 |
inputs = tokenizer(
|
| 2635 |
batch, return_tensors="pt",
|
| 2636 |
+
padding=True, truncation=True, max_length=self.max_seq_length or 256,
|
| 2637 |
)
|
| 2638 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 2639 |
|
|
|
|
| 3464 |
try:
|
| 3465 |
for prompt in kl_prompts:
|
| 3466 |
inputs = tokenizer(
|
| 3467 |
+
prompt, return_tensors="pt", truncation=True, max_length=self.max_seq_length or 64,
|
| 3468 |
)
|
| 3469 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 3470 |
with torch.no_grad():
|
|
|
|
| 3538 |
try:
|
| 3539 |
for prompt in kl_prompts[:3]:
|
| 3540 |
inputs = tokenizer(
|
| 3541 |
+
prompt, return_tensors="pt", truncation=True, max_length=self.max_seq_length or 64,
|
| 3542 |
)
|
| 3543 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 3544 |
with torch.no_grad():
|
|
|
|
| 4853 |
total_loss = 0.0
|
| 4854 |
n_tokens = 0
|
| 4855 |
for text in reference_texts:
|
| 4856 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=self.max_seq_length or 256)
|
| 4857 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 4858 |
with torch.no_grad():
|
| 4859 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
|
|
| 5018 |
try:
|
| 5019 |
inputs = tokenizer(
|
| 5020 |
batch_formatted, return_tensors="pt",
|
| 5021 |
+
padding=True, truncation=True, max_length=self.max_seq_length or 512,
|
| 5022 |
)
|
| 5023 |
# Track per-prompt input lengths (non-pad tokens)
|
| 5024 |
attention_mask = inputs["attention_mask"]
|
|
|
|
| 5115 |
batch = self._kl_eval_prompts[i:i + 8]
|
| 5116 |
inputs = tokenizer(
|
| 5117 |
batch, return_tensors="pt",
|
| 5118 |
+
padding=True, truncation=True, max_length=self.max_seq_length or 256,
|
| 5119 |
)
|
| 5120 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 5121 |
with torch.no_grad():
|
obliteratus/cli.py
CHANGED
|
@@ -91,8 +91,11 @@ def main(argv: list[str] | None = None):
|
|
| 91 |
p.add_argument("--dtype", type=str, default="float16")
|
| 92 |
p.add_argument(
|
| 93 |
"--method", type=str, default="advanced",
|
| 94 |
-
choices=[
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
| 96 |
)
|
| 97 |
p.add_argument("--n-directions", type=int, default=None, help="Override: number of SVD directions to extract")
|
| 98 |
p.add_argument("--regularization", type=float, default=None, help="Override: fraction to preserve (0.0-1.0)")
|
|
|
|
| 91 |
p.add_argument("--dtype", type=str, default="float16")
|
| 92 |
p.add_argument(
|
| 93 |
"--method", type=str, default="advanced",
|
| 94 |
+
choices=[
|
| 95 |
+
"basic", "advanced", "aggressive", "spectral_cascade",
|
| 96 |
+
"informed", "surgical", "optimized", "inverted", "nuclear",
|
| 97 |
+
],
|
| 98 |
+
help="Liberation method (default: advanced)",
|
| 99 |
)
|
| 100 |
p.add_argument("--n-directions", type=int, default=None, help="Override: number of SVD directions to extract")
|
| 101 |
p.add_argument("--regularization", type=float, default=None, help="Override: fraction to preserve (0.0-1.0)")
|
obliteratus/evaluation/benchmarks.py
CHANGED
|
@@ -122,9 +122,10 @@ class BenchmarkRunner:
|
|
| 122 |
without requiring external datasets or API calls.
|
| 123 |
"""
|
| 124 |
|
| 125 |
-
def __init__(self, model, tokenizer, device: str | None = None):
|
| 126 |
self.model = model
|
| 127 |
self.tokenizer = tokenizer
|
|
|
|
| 128 |
if device is None:
|
| 129 |
self.device = next(model.parameters()).device
|
| 130 |
else:
|
|
@@ -272,7 +273,7 @@ class BenchmarkRunner:
|
|
| 272 |
prompt += "Answer: ("
|
| 273 |
|
| 274 |
inputs = self.tokenizer(
|
| 275 |
-
prompt, return_tensors="pt", truncation=True, max_length=
|
| 276 |
)
|
| 277 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 278 |
|
|
@@ -295,7 +296,7 @@ class BenchmarkRunner:
|
|
| 295 |
def _generate_short(self, prompt: str) -> str:
|
| 296 |
"""Generate a short completion for a prompt."""
|
| 297 |
inputs = self.tokenizer(
|
| 298 |
-
prompt, return_tensors="pt", truncation=True, max_length=
|
| 299 |
)
|
| 300 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 301 |
|
|
|
|
| 122 |
without requiring external datasets or API calls.
|
| 123 |
"""
|
| 124 |
|
| 125 |
+
def __init__(self, model, tokenizer, device: str | None = None, max_length: int = 256):
|
| 126 |
self.model = model
|
| 127 |
self.tokenizer = tokenizer
|
| 128 |
+
self.max_length = max_length
|
| 129 |
if device is None:
|
| 130 |
self.device = next(model.parameters()).device
|
| 131 |
else:
|
|
|
|
| 273 |
prompt += "Answer: ("
|
| 274 |
|
| 275 |
inputs = self.tokenizer(
|
| 276 |
+
prompt, return_tensors="pt", truncation=True, max_length=self.max_length
|
| 277 |
)
|
| 278 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 279 |
|
|
|
|
| 296 |
def _generate_short(self, prompt: str) -> str:
|
| 297 |
"""Generate a short completion for a prompt."""
|
| 298 |
inputs = self.tokenizer(
|
| 299 |
+
prompt, return_tensors="pt", truncation=True, max_length=self.max_length
|
| 300 |
)
|
| 301 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 302 |
|
obliteratus/informed_pipeline.py
CHANGED
|
@@ -179,6 +179,9 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 179 |
harmless_prompts: list[str] | None = None,
|
| 180 |
on_stage: Callable[[StageResult], None] | None = None,
|
| 181 |
on_log: Callable[[str], None] | None = None,
|
|
|
|
|
|
|
|
|
|
| 182 |
# Analysis configuration
|
| 183 |
run_cone_analysis: bool = True,
|
| 184 |
run_alignment_detection: bool = True,
|
|
@@ -208,6 +211,8 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 208 |
harmless_prompts=harmless_prompts,
|
| 209 |
on_stage=on_stage,
|
| 210 |
on_log=on_log,
|
|
|
|
|
|
|
| 211 |
# Set informed defaults
|
| 212 |
norm_preserve=True,
|
| 213 |
project_biases=True,
|
|
|
|
| 179 |
harmless_prompts: list[str] | None = None,
|
| 180 |
on_stage: Callable[[StageResult], None] | None = None,
|
| 181 |
on_log: Callable[[str], None] | None = None,
|
| 182 |
+
# Base pipeline kwargs forwarded to AbliterationPipeline
|
| 183 |
+
push_to_hub: str | None = None,
|
| 184 |
+
quantization: str | None = None,
|
| 185 |
# Analysis configuration
|
| 186 |
run_cone_analysis: bool = True,
|
| 187 |
run_alignment_detection: bool = True,
|
|
|
|
| 211 |
harmless_prompts=harmless_prompts,
|
| 212 |
on_stage=on_stage,
|
| 213 |
on_log=on_log,
|
| 214 |
+
push_to_hub=push_to_hub,
|
| 215 |
+
quantization=quantization,
|
| 216 |
# Set informed defaults
|
| 217 |
norm_preserve=True,
|
| 218 |
project_biases=True,
|
obliteratus/local_ui.py
CHANGED
|
@@ -18,7 +18,6 @@ import time
|
|
| 18 |
from rich.console import Console
|
| 19 |
from rich.panel import Panel
|
| 20 |
from rich.table import Table
|
| 21 |
-
from rich.text import Text
|
| 22 |
|
| 23 |
console = Console()
|
| 24 |
|
|
@@ -296,7 +295,7 @@ def launch_local_ui(
|
|
| 296 |
console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
|
| 297 |
start = time.time()
|
| 298 |
|
| 299 |
-
from app import
|
| 300 |
|
| 301 |
elapsed = time.time() - start
|
| 302 |
if not quiet:
|
|
|
|
| 18 |
from rich.console import Console
|
| 19 |
from rich.panel import Panel
|
| 20 |
from rich.table import Table
|
|
|
|
| 21 |
|
| 22 |
console = Console()
|
| 23 |
|
|
|
|
| 295 |
console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
|
| 296 |
start = time.time()
|
| 297 |
|
| 298 |
+
from app import launch as app_launch
|
| 299 |
|
| 300 |
elapsed = time.time() - start
|
| 301 |
if not quiet:
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "obliteratus"
|
| 7 |
-
version = "0.1.
|
| 8 |
description = "Master Ablation Suite for HuggingFace transformers"
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
|
@@ -43,7 +43,7 @@ dependencies = [
|
|
| 43 |
"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues"
|
| 44 |
|
| 45 |
[project.optional-dependencies]
|
| 46 |
-
dev = ["pytest>=7.0", "pytest-cov", "ruff"]
|
| 47 |
spaces = ["gradio>=5.0,<6.0"]
|
| 48 |
|
| 49 |
[project.scripts]
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "obliteratus"
|
| 7 |
+
version = "0.1.2"
|
| 8 |
description = "Master Ablation Suite for HuggingFace transformers"
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
|
|
|
| 43 |
"Bug Tracker" = "https://github.com/obliteratus-project/OBLITERATUS/issues"
|
| 44 |
|
| 45 |
[project.optional-dependencies]
|
| 46 |
+
dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
|
| 47 |
spaces = ["gradio>=5.0,<6.0"]
|
| 48 |
|
| 49 |
[project.scripts]
|
tests/test_cli.py
CHANGED
|
@@ -62,8 +62,11 @@ class TestCLIDispatch:
|
|
| 62 |
|
| 63 |
# 4. obliterate --method accepts valid methods
|
| 64 |
def test_obliterate_valid_methods(self):
|
| 65 |
-
"""Test that --method accepts
|
| 66 |
-
valid_methods = [
|
|
|
|
|
|
|
|
|
|
| 67 |
for method in valid_methods:
|
| 68 |
# Patch the actual pipeline execution so nothing runs
|
| 69 |
with patch("obliteratus.cli._cmd_abliterate") as mock_cmd:
|
|
@@ -72,11 +75,11 @@ class TestCLIDispatch:
|
|
| 72 |
args_passed = mock_cmd.call_args[0][0]
|
| 73 |
assert args_passed.method == method
|
| 74 |
|
| 75 |
-
# 4b.
|
| 76 |
-
def
|
| 77 |
-
"""The CLI --method flag
|
| 78 |
stderr_text = _capture_exit(
|
| 79 |
-
["obliterate", "fake/model", "--method", "
|
| 80 |
expect_code=2,
|
| 81 |
)
|
| 82 |
assert "invalid choice" in stderr_text.lower()
|
|
|
|
| 62 |
|
| 63 |
# 4. obliterate --method accepts valid methods
|
| 64 |
def test_obliterate_valid_methods(self):
|
| 65 |
+
"""Test that --method accepts all 9 pipeline methods."""
|
| 66 |
+
valid_methods = [
|
| 67 |
+
"basic", "advanced", "aggressive", "spectral_cascade",
|
| 68 |
+
"informed", "surgical", "optimized", "inverted", "nuclear",
|
| 69 |
+
]
|
| 70 |
for method in valid_methods:
|
| 71 |
# Patch the actual pipeline execution so nothing runs
|
| 72 |
with patch("obliteratus.cli._cmd_abliterate") as mock_cmd:
|
|
|
|
| 75 |
args_passed = mock_cmd.call_args[0][0]
|
| 76 |
assert args_passed.method == method
|
| 77 |
|
| 78 |
+
# 4b. invalid methods are rejected
|
| 79 |
+
def test_obliterate_rejects_invalid_method(self):
|
| 80 |
+
"""The CLI --method flag rejects unknown method names."""
|
| 81 |
stderr_text = _capture_exit(
|
| 82 |
+
["obliterate", "fake/model", "--method", "nonexistent"],
|
| 83 |
expect_code=2,
|
| 84 |
)
|
| 85 |
assert "invalid choice" in stderr_text.lower()
|