obliteratus

Running on Zero

App Files Files Community

pliny-the-prompter committed on Mar 11

Commit

e2a8771

verified ·

1 Parent(s): 8d4d519

Upload 133 files

Browse files

Files changed (5) hide show

app.py +49 -2
obliteratus/abliterate.py +7 -2
obliteratus/reporting/report.py +12 -2
pyproject.toml +1 -1
tests/test_abliterate.py +13 -4

app.py CHANGED Viewed

@@ -164,6 +164,7 @@ _state: dict = {
     "model_name": None,
     "method": None,
     "status": "idle",  # idle | obliterating | ready
     "log": [],
     # Activation steering metadata (survives model reload)
     "steering": None,  # dict with refusal_directions, strong_layers, steering_strength
@@ -755,6 +756,27 @@ def _should_quantize(model_id: str, is_preset: bool = False) -> str | None:
 # Obliteration
 # ---------------------------------------------------------------------------
 def _clear_gpu():
     """Free GPU/accelerator memory.  Resilient to device errors."""
     with _lock:
@@ -1913,6 +1935,9 @@ def obliterate(model_choice: str, method_choice: str,
     use_custom = custom_harmful and custom_harmful.strip()
     dataset_key = get_source_key_from_label(dataset_source_choice) if dataset_source_choice else "builtin"
     _clear_gpu()
     with _lock:
         if _state["status"] == "obliterating":
@@ -1920,6 +1945,7 @@ def obliterate(model_choice: str, method_choice: str,
             return
         _state["log"] = []
         _state["status"] = "obliterating"
         _state["model_name"] = model_choice
         _state["method"] = method
@@ -2094,6 +2120,7 @@ def obliterate(model_choice: str, method_choice: str,
         log_lines.append(f"\nERROR: {err_msg}")
         with _lock:
             _state["status"] = "idle"
             _state["log"] = log_lines
         yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
         return
@@ -2107,6 +2134,7 @@ def obliterate(model_choice: str, method_choice: str,
             # (e.g. import failure caught internally, or early return in worker).
             with _lock:
                 _state["status"] = "idle"
             log_lines.append("\nERROR: Pipeline completed but produced no result.")
             with _lock:
                 _state["log"] = log_lines
@@ -2200,6 +2228,7 @@ def obliterate(model_choice: str, method_choice: str,
                 _state["model"] = pipeline.handle.model
                 _state["tokenizer"] = pipeline.handle.tokenizer
                 _state["status"] = "ready"
         else:
             # Model too large for generation at full precision.  Free it and
             # reload a smaller copy so the KV cache fits in GPU.
@@ -2252,6 +2281,7 @@ def obliterate(model_choice: str, method_choice: str,
                         _state["model"] = model_reloaded
                         _state["tokenizer"] = tokenizer_reloaded
                         _state["status"] = "ready"
                     can_generate = True
                     log_lines.append("Reloaded in 4-bit — chat is ready!")
                 except Exception as e:
@@ -2293,6 +2323,7 @@ def obliterate(model_choice: str, method_choice: str,
                         _state["model"] = model_reloaded
                         _state["tokenizer"] = tokenizer_reloaded
                         _state["status"] = "ready"
                     can_generate = True
                     log_lines.append("Reloaded with CPU offload — chat is ready (may be slower).")
                 except Exception as e:
@@ -2300,6 +2331,7 @@ def obliterate(model_choice: str, method_choice: str,
                     log_lines.append("Chat unavailable. Load the saved model on a larger instance.")
                     with _lock:
                         _state["status"] = "idle"
         # Build metrics summary card while pipeline is still alive
         metrics_card = _format_obliteration_metrics(pipeline, method, _elapsed())
@@ -2346,6 +2378,7 @@ def obliterate(model_choice: str, method_choice: str,
         log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
         with _lock:
             _state["status"] = "idle"
             _state["log"] = log_lines
         yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
@@ -2402,6 +2435,9 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
     On ZeroGPU, allocates a GPU for up to 2 minutes per response.
     """
     with _lock:
         model = _state["model"]
         tokenizer = _state["tokenizer"]
@@ -2418,7 +2454,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
             if model_dev.type == "meta":
                 _needs_reload = True
             elif dev.is_gpu_available() and model_dev.type not in ("cuda", "mps"):
-                model.to(dev.get_device())
         except Exception:
             _needs_reload = True
@@ -2707,6 +2748,9 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
             )
             return
     with _lock:
         if _state["status"] == "obliterating":
             yield "**Error:** An obliteration is already in progress.", ""
@@ -2888,7 +2932,10 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
             if model_dev.type == "meta":
                 _needs_reload = True
             elif dev.is_gpu_available() and model_dev.type not in ("cuda", "mps"):
-                abliterated_model.to(dev.get_device())
         except Exception:
             _needs_reload = True

     "model_name": None,
     "method": None,
     "status": "idle",  # idle | obliterating | ready
+    "obliterate_started_at": None,  # time.time() when obliteration started
     "log": [],
     # Activation steering metadata (survives model reload)
     "steering": None,  # dict with refusal_directions, strong_layers, steering_strength
 # Obliteration
 # ---------------------------------------------------------------------------
+def _unstick_stale_obliterating(max_age: float = 360.0) -> bool:
+    """Reset status from 'obliterating' to 'idle' if it has been stuck too long.
+    ZeroGPU can kill the obliterate generator mid-execution (duration=300s
+    timeout), leaving _state["status"] permanently stuck at "obliterating".
+    This helper detects that condition and resets to "idle" so the Chat tab
+    and subsequent obliterations aren't permanently blocked.
+    Returns True if the status was reset.
+    """
+    with _lock:
+        if _state["status"] != "obliterating":
+            return False
+        started = _state.get("obliterate_started_at")
+        if started is None or (time.time() - started) > max_age:
+            _state["status"] = "idle"
+            _state["obliterate_started_at"] = None
+            return True
+    return False
 def _clear_gpu():
     """Free GPU/accelerator memory.  Resilient to device errors."""
     with _lock:
     use_custom = custom_harmful and custom_harmful.strip()
     dataset_key = get_source_key_from_label(dataset_source_choice) if dataset_source_choice else "builtin"
+    # Unstick stale "obliterating" status left behind by ZeroGPU timeout
+    _unstick_stale_obliterating()
     _clear_gpu()
     with _lock:
         if _state["status"] == "obliterating":
             return
         _state["log"] = []
         _state["status"] = "obliterating"
+        _state["obliterate_started_at"] = time.time()
         _state["model_name"] = model_choice
         _state["method"] = method
         log_lines.append(f"\nERROR: {err_msg}")
         with _lock:
             _state["status"] = "idle"
+            _state["obliterate_started_at"] = None
             _state["log"] = log_lines
         yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
         return
             # (e.g. import failure caught internally, or early return in worker).
             with _lock:
                 _state["status"] = "idle"
+                _state["obliterate_started_at"] = None
             log_lines.append("\nERROR: Pipeline completed but produced no result.")
             with _lock:
                 _state["log"] = log_lines
                 _state["model"] = pipeline.handle.model
                 _state["tokenizer"] = pipeline.handle.tokenizer
                 _state["status"] = "ready"
+                _state["obliterate_started_at"] = None
         else:
             # Model too large for generation at full precision.  Free it and
             # reload a smaller copy so the KV cache fits in GPU.
                         _state["model"] = model_reloaded
                         _state["tokenizer"] = tokenizer_reloaded
                         _state["status"] = "ready"
+                        _state["obliterate_started_at"] = None
                     can_generate = True
                     log_lines.append("Reloaded in 4-bit — chat is ready!")
                 except Exception as e:
                         _state["model"] = model_reloaded
                         _state["tokenizer"] = tokenizer_reloaded
                         _state["status"] = "ready"
+                        _state["obliterate_started_at"] = None
                     can_generate = True
                     log_lines.append("Reloaded with CPU offload — chat is ready (may be slower).")
                 except Exception as e:
                     log_lines.append("Chat unavailable. Load the saved model on a larger instance.")
                     with _lock:
                         _state["status"] = "idle"
+                        _state["obliterate_started_at"] = None
         # Build metrics summary card while pipeline is still alive
         metrics_card = _format_obliteration_metrics(pipeline, method, _elapsed())
         log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
         with _lock:
             _state["status"] = "idle"
+            _state["obliterate_started_at"] = None
             _state["log"] = log_lines
         yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update(), gr.update()
     On ZeroGPU, allocates a GPU for up to 2 minutes per response.
     """
+    # Unstick stale "obliterating" status left behind by ZeroGPU timeout
+    _unstick_stale_obliterating()
     with _lock:
         model = _state["model"]
         tokenizer = _state["tokenizer"]
             if model_dev.type == "meta":
                 _needs_reload = True
             elif dev.is_gpu_available() and model_dev.type not in ("cuda", "mps"):
+                # Only move to GPU if the model wasn't loaded with device_map
+                # (distributed models can't be moved with a single .to() call).
+                if hasattr(model, "hf_device_map"):
+                    _needs_reload = True
+                else:
+                    model.to(dev.get_device())
         except Exception:
             _needs_reload = True
             )
             return
+    # Unstick stale "obliterating" status left behind by ZeroGPU timeout
+    _unstick_stale_obliterating()
     with _lock:
         if _state["status"] == "obliterating":
             yield "**Error:** An obliteration is already in progress.", ""
             if model_dev.type == "meta":
                 _needs_reload = True
             elif dev.is_gpu_available() and model_dev.type not in ("cuda", "mps"):
+                if hasattr(abliterated_model, "hf_device_map"):
+                    _needs_reload = True
+                else:
+                    abliterated_model.to(dev.get_device())
         except Exception:
             _needs_reload = True

obliteratus/abliterate.py CHANGED Viewed

@@ -1452,8 +1452,13 @@ class AbliterationPipeline:
         device = self._get_model_device(model)
-        # Batch prompts for throughput — hooks unbatch per-prompt activations
-        batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
         # Left-pad so position -1 is always the last real token in every batch element
         orig_padding_side = getattr(tokenizer, "padding_side", "right")
         if batch_size > 1:

         device = self._get_model_device(model)
+        # Batch prompts for throughput — hooks unbatch per-prompt activations.
+        # On CPU-only (free_gb=0), batch_size=4 is safe since system RAM is
+        # typically more abundant than GPU VRAM.
+        if not dev.is_gpu_available():
+            batch_size = 4
+        else:
+            batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
         # Left-pad so position -1 is always the last real token in every batch element
         orig_padding_side = getattr(tokenizer, "padding_side", "right")
         if batch_size > 1:

obliteratus/reporting/report.py CHANGED Viewed

@@ -144,7 +144,12 @@ class AblationReport:
         if output_path:
             matplotlib.use("Agg")
         import matplotlib.pyplot as plt
-        import seaborn as sns
         if metric is None:
             metric = list(self.baseline_metrics.keys())[0]
@@ -182,7 +187,12 @@ class AblationReport:
         if output_path:
             matplotlib.use("Agg")
         import matplotlib.pyplot as plt
-        import seaborn as sns
         df = self.to_dataframe()
         pct_cols = [c for c in df.columns if c.endswith("_pct_change")]

         if output_path:
             matplotlib.use("Agg")
         import matplotlib.pyplot as plt
+        try:
+            import seaborn as sns
+        except ImportError:
+            raise ImportError(
+                "seaborn is required for plotting. Install it with: pip install seaborn>=0.12"
+            )
         if metric is None:
             metric = list(self.baseline_metrics.keys())[0]
         if output_path:
             matplotlib.use("Agg")
         import matplotlib.pyplot as plt
+        try:
+            import seaborn as sns
+        except ImportError:
+            raise ImportError(
+                "seaborn is required for plotting. Install it with: pip install seaborn>=0.12"
+            )
         df = self.to_dataframe()
         pct_cols = [c for c in df.columns if c.endswith("_pct_change")]

pyproject.toml CHANGED Viewed

@@ -34,7 +34,6 @@ dependencies = [
     "numpy>=1.24",
     "scikit-learn>=1.3",
     "tqdm>=4.64",
-    "bitsandbytes>=0.46.1",
 ]
 [project.urls]
@@ -44,6 +43,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
 spaces = ["gradio>=5.0,<6.0"]
 [project.scripts]

     "numpy>=1.24",
     "scikit-learn>=1.3",
     "tqdm>=4.64",
 ]
 [project.urls]
 [project.optional-dependencies]
 dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
+cuda = ["bitsandbytes>=0.46.1"]
 spaces = ["gradio>=5.0,<6.0"]
 [project.scripts]

tests/test_abliterate.py CHANGED Viewed

@@ -42,10 +42,19 @@ def _make_tiny_handle():
     tokenizer = MagicMock()
     tokenizer.pad_token = "<pad>"
     tokenizer.eos_token = "<eos>"
-    tokenizer.return_value = {
-        "input_ids": torch.randint(0, 1000, (1, 10)),
-        "attention_mask": torch.ones(1, 10, dtype=torch.long),
-    }
     tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city"
     handle = ModelHandle(

     tokenizer = MagicMock()
     tokenizer.pad_token = "<pad>"
     tokenizer.eos_token = "<eos>"
+    tokenizer.pad_token_id = 0
+    tokenizer.eos_token_id = 1
+    # Return batch-aware tensors: if called with a list, batch_size = len(list)
+    def _mock_tokenize(text_or_list, **kwargs):
+        if isinstance(text_or_list, list):
+            bs = len(text_or_list)
+        else:
+            bs = 1
+        return {
+            "input_ids": torch.randint(0, 1000, (bs, 10)),
+            "attention_mask": torch.ones(bs, 10, dtype=torch.long),
+        }
+    tokenizer.side_effect = _mock_tokenize
     tokenizer.decode.return_value = "The capital of France is Paris, a beautiful city"
     handle = ModelHandle(