obliteratus

Running on Zero

App Files Files Community

pliny-the-prompter committed on Mar 4

Commit

664144c

verified ·

1 Parent(s): d419e87

Upload 129 files

Browse files

Files changed (6) hide show

app.py +159 -24
obliteratus/.DS_Store +0 -0
obliteratus/analysis/visualization.py +31 -35
obliteratus/evaluation/benchmark_plots.py +19 -7
obliteratus/reporting/report.py +14 -3
tests/test_visualization.py +48 -0

app.py CHANGED Viewed

@@ -588,6 +588,114 @@ _BENCH_TEST_PROMPTS = [
 ]
 def _figs_to_gallery(figs: list) -> list[tuple[str, str]]:
     """Convert matplotlib Figures to gallery-compatible (filepath, caption) tuples."""
     import tempfile
@@ -648,6 +756,7 @@ def benchmark(
     results = []
     all_logs = []
     # Compute actual prompt count that will be used
     if prompt_volume > 0:
@@ -815,6 +924,10 @@ def benchmark(
             all_logs.append(f"  Strong layers: {entry['strong_layers']}")
             all_logs.append(f"  EGA expert directions: {entry['ega_expert_dirs']}")
         results.append(entry)
         # ── Telemetry: log benchmark result for community leaderboard ──
@@ -873,13 +986,16 @@ def benchmark(
     from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
     dashboard_figs = generate_benchmark_dashboard(results, mode="multi_method", title_suffix=f" — {model_id}")
     # Convert figures to gallery images
-    gallery_images = _figs_to_gallery(dashboard_figs)
     # Final summary
     all_logs.append("\n" + "=" * 60)
     all_logs.append("BENCHMARK COMPLETE")
-    all_logs.append(f"Generated {len(dashboard_figs)} visualizations")
     all_logs.append("=" * 60)
     all_logs.append("\nJSON results:")
     all_logs.append(_json.dumps(results, indent=2, default=str))
@@ -999,6 +1115,7 @@ def benchmark_multi_model(
     results = []
     all_logs = []
     bench_context = {
         "method": method_key,
         "dataset": source_label,
@@ -1151,6 +1268,11 @@ def benchmark_multi_model(
             all_logs.append(f"  Completed in {elapsed:.1f}s")
             all_logs.append(f"  PPL={entry['perplexity']}, Coherence={entry['coherence']}, Refusal={entry['refusal_rate']}")
         results.append(entry)
         # ── Telemetry: log multi-model benchmark result ──
@@ -1206,11 +1328,15 @@ def benchmark_multi_model(
     # Generate dashboard visualizations
     from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
     dashboard_figs = generate_benchmark_dashboard(results, mode="multi_model", title_suffix=f" \u2014 {method_key}")
-    gallery_images = _figs_to_gallery(dashboard_figs)
     all_logs.append("\n" + "=" * 60)
     all_logs.append("MULTI-MODEL BENCHMARK COMPLETE")
-    all_logs.append(f"Generated {len(dashboard_figs)} visualizations")
     all_logs.append("=" * 60)
     all_logs.append("\nJSON results:")
     all_logs.append(_json.dumps(results, indent=2, default=str))
@@ -1333,7 +1459,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
             f"   or locally: `export HF_TOKEN=hf_...`\n\n"
             f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
             f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
-            "", gr.update(), gr.update(),
         )
         return
@@ -1342,14 +1468,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
             yield (
                 "**Error:** Invalid Hub repo format. Use `username/model-name`.",
-                "", gr.update(), gr.update(),
             )
             return
         if not os.environ.get("HF_TOKEN"):
             yield (
                 "**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
                 "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
-                "", gr.update(), gr.update(),
             )
             return
@@ -1360,7 +1486,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
     _clear_gpu()
     with _lock:
         if _state["status"] == "obliterating":
-            yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update()
             return
         _state["log"] = []
         _state["status"] = "obliterating"
@@ -1512,9 +1638,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         status_msg = f"**Obliterating\u2026** ({_elapsed()})"
         if len(log_lines) > last_yielded[0]:
             last_yielded[0] = len(log_lines)
-            yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
         else:
-            yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
         if time.time() - _pipeline_start > _max_pipeline_secs:
             log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
             break
@@ -1529,7 +1655,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         err_msg = str(error_ref[0]) or repr(error_ref[0])
         log_lines.append(f"\nERROR: {err_msg}")
         _state["log"] = log_lines
-        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update()
         return
     # Success — keep model in memory for chat.
@@ -1631,7 +1757,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
             if bnb_available:
                 log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
                 last_yielded[0] = len(log_lines)
-                yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
                 try:
                     from transformers import BitsAndBytesConfig
                     bnb_cfg = BitsAndBytesConfig(
@@ -1678,7 +1804,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
                     else "Falling back to CPU offload..."
                 )
                 last_yielded[0] = len(log_lines)
-                yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
                 try:
                     offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
                     model_reloaded = AutoModelForCausalLM.from_pretrained(
@@ -1713,6 +1839,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
                     with _lock:
                         _state["status"] = "idle"
         # Free pipeline internals we no longer need (activations, directions cache)
         # to reclaim memory — we've already extracted the model and steering metadata.
         pipeline_ref[0] = None
@@ -1738,7 +1867,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
             choices=_get_session_model_choices(),
             value=_last_obliterated_label or None,
         )
-        yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update
     except Exception as e:
         # Ensure status never gets stuck on "obliterating"
@@ -1747,7 +1876,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
         err_msg = str(e) or repr(e)
         log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
         _state["log"] = log_lines
-        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update()
 # ---------------------------------------------------------------------------
@@ -1837,6 +1966,10 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
                 )
                 if tokenizer.pad_token is None:
                     tokenizer.pad_token = tokenizer.eos_token
                 with _lock:
                     _state["model"] = model
                     _state["tokenizer"] = tokenizer
@@ -1874,9 +2007,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    # Streaming generation — repetition_penalty and no_repeat_ngram_size
-    # break degenerate refusal loops where the model gets stuck cycling
-    # through fragments of its safety response
     # Scale timeout with max_tokens: large generations need more time.
     # Base 120s + ~0.1s per token gives headroom for slow models.
     stream_timeout = max(120, 120 + int(max_tokens * 0.1))
@@ -1888,7 +2020,6 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
         "temperature": max(temperature, 0.01),
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
-        "no_repeat_ngram_size": 4,
         "streamer": streamer,
     }
@@ -2199,6 +2330,10 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
                 )
                 if tokenizer.pad_token is None:
                     tokenizer.pad_token = tokenizer.eos_token
                 with _lock:
                     _state["model"] = abliterated_model
                     _state["tokenizer"] = tokenizer
@@ -2240,7 +2375,6 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
         "temperature": max(temperature, 0.01),
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
-        "no_repeat_ngram_size": 4,
     }
     # Add user message to both histories
@@ -3236,6 +3370,7 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
             )
             status_md = gr.Markdown("")
             log_box = gr.Textbox(
                 label="Pipeline Log",
                 lines=20,
@@ -3606,9 +3741,9 @@ Pre-configured benchmark configurations for common research questions.
                     top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
                     max_tokens = gr.Slider(32, 4096, value=512, step=32, label="Max Tokens")
                     repetition_penalty = gr.Slider(
-                        1.0, 2.0, value=1.15, step=0.05,
                         label="Repetition Penalty",
-                        info="Penalizes repeated tokens — higher values break refusal loops (1.0 = off)",
                     )
                     context_length = gr.Slider(
                         128, 32768, value=2048, step=128,
@@ -3656,7 +3791,7 @@ See exactly how abliteration changes model behavior on the same prompt.
                     ab_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
                     ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
                     ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
-                    ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.15, step=0.05, label="Rep Penalty")
                     ab_context_length = gr.Slider(
                         128, 32768, value=2048, step=128,
                         label="Context Length",
@@ -4063,7 +4198,7 @@ Built on the shoulders of:
         fn=obliterate,
         inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
                 custom_harmful_tb, custom_harmless_tb] + _adv_controls,
-        outputs=[status_md, log_box, chat_status, session_model_dd],
     ).then(
         fn=lambda: (
             gr.update(choices=_get_session_model_choices()),

 ]
+def _format_obliteration_metrics(pipeline, method: str, elapsed_str: str) -> str:
+    """Format post-obliteration quality metrics as a concise Markdown card."""
+    metrics = getattr(pipeline, "_quality_metrics", {}) or {}
+    ppl = metrics.get("perplexity")
+    coh = metrics.get("coherence")
+    ref = metrics.get("refusal_rate")
+    kl = metrics.get("kl_divergence")
+    n_layers = len(getattr(pipeline, "_strong_layers", []))
+    parts = ["### Liberation Results\n"]
+    parts.append("| Metric | Value | |")
+    parts.append("|--------|------:|---|")
+    if ref is not None:
+        pct = ref * 100
+        icon = "🟢" if pct < 10 else "🟡" if pct < 30 else "🔴"
+        parts.append(f"| Refusal Rate | **{pct:.1f}%** | {icon} |")
+    if coh is not None:
+        pct = coh * 100
+        icon = "🟢" if pct > 80 else "🟡" if pct > 60 else "🔴"
+        parts.append(f"| Coherence | **{pct:.1f}%** | {icon} |")
+    if ppl is not None:
+        icon = "🟢" if ppl < 12 else "🟡" if ppl < 20 else "🔴"
+        parts.append(f"| Perplexity | **{ppl:.2f}** | {icon} |")
+    if kl is not None:
+        icon = "🟢" if kl < 0.05 else "🟡" if kl < 0.1 else "🔴"
+        parts.append(f"| KL Divergence | **{kl:.4f}** | {icon} |")
+    if n_layers > 0:
+        parts.append(f"| Layers Modified | **{n_layers}** | |")
+    if not metrics:
+        return ""
+    return "\n".join(parts)
+def _generate_analysis_figs(pipeline, model_label: str = "") -> list:
+    """Generate analysis visualizations from a completed pipeline's surviving data.
+    Produces cross-layer heatmap + angular drift charts from refusal_directions
+    (which persist after pipeline.run()), and a refusal topology chart using
+    direction norms as a proxy for signal strength (since activation means are
+    freed during execution).
+    """
+    figs = []
+    directions = getattr(pipeline, "refusal_directions", {})
+    strong_layers = getattr(pipeline, "_strong_layers", [])
+    if len(directions) < 2:
+        return figs
+    try:
+        from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
+        from obliteratus.analysis.visualization import (
+            plot_cross_layer_heatmap,
+            plot_angular_drift,
+        )
+        import tempfile, os
+        analyzer = CrossLayerAlignmentAnalyzer()
+        result = analyzer.analyze(directions)
+        suffix = f" — {model_label}" if model_label else ""
+        heatmap_fig = plot_cross_layer_heatmap(
+            result,
+            output_path=tempfile.mktemp(suffix=".png"),
+            title=f"Cross-Layer Direction Alignment{suffix}",
+        )
+        figs.append(heatmap_fig)
+        drift_fig = plot_angular_drift(
+            result,
+            output_path=tempfile.mktemp(suffix=".png"),
+            title=f"Refusal Direction Angular Drift{suffix}",
+        )
+        figs.append(drift_fig)
+    except Exception:
+        pass  # Analysis charts are best-effort
+    # Refusal topology using direction norms as proxy (means are freed)
+    if directions and strong_layers:
+        try:
+            from obliteratus.analysis.visualization import plot_refusal_topology
+            import tempfile
+            # Build proxy means from direction norms
+            proxy_harmful = {}
+            proxy_harmless = {}
+            for idx, d in directions.items():
+                d_f = d.float().squeeze()
+                d_f = d_f / d_f.norm().clamp(min=1e-8)
+                # Simulate a separation proportional to the direction norm
+                norm = d.float().squeeze().norm().item()
+                proxy_harmless[idx] = torch.zeros_like(d_f).unsqueeze(0)
+                proxy_harmful[idx] = (d_f * norm).unsqueeze(0)
+            topo_fig = plot_refusal_topology(
+                directions, proxy_harmful, proxy_harmless, list(strong_layers),
+                output_path=tempfile.mktemp(suffix=".png"),
+                title=f"Refusal Topology Map{suffix}",
+            )
+            figs.append(topo_fig)
+        except Exception:
+            pass
+    return figs
 def _figs_to_gallery(figs: list) -> list[tuple[str, str]]:
     """Convert matplotlib Figures to gallery-compatible (filepath, caption) tuples."""
     import tempfile
     results = []
     all_logs = []
+    analysis_figs = []  # Cross-layer/topology charts from each pipeline run
     # Compute actual prompt count that will be used
     if prompt_volume > 0:
             all_logs.append(f"  Strong layers: {entry['strong_layers']}")
             all_logs.append(f"  EGA expert directions: {entry['ega_expert_dirs']}")
+            # Extract analysis visualizations before pipeline is freed
+            method_figs = _generate_analysis_figs(pipeline, method_key)
+            analysis_figs.extend(method_figs)
         results.append(entry)
         # ── Telemetry: log benchmark result for community leaderboard ──
     from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
     dashboard_figs = generate_benchmark_dashboard(results, mode="multi_method", title_suffix=f" — {model_id}")
+    # Append per-method analysis charts (cross-layer heatmaps, topology maps, etc.)
+    all_figs = dashboard_figs + analysis_figs
     # Convert figures to gallery images
+    gallery_images = _figs_to_gallery(all_figs)
     # Final summary
     all_logs.append("\n" + "=" * 60)
     all_logs.append("BENCHMARK COMPLETE")
+    all_logs.append(f"Generated {len(all_figs)} visualizations")
     all_logs.append("=" * 60)
     all_logs.append("\nJSON results:")
     all_logs.append(_json.dumps(results, indent=2, default=str))
     results = []
     all_logs = []
+    analysis_figs = []  # Cross-layer/topology charts from each pipeline run
     bench_context = {
         "method": method_key,
         "dataset": source_label,
             all_logs.append(f"  Completed in {elapsed:.1f}s")
             all_logs.append(f"  PPL={entry['perplexity']}, Coherence={entry['coherence']}, Refusal={entry['refusal_rate']}")
+            # Extract analysis visualizations before pipeline is freed
+            model_short = model_id.split("/")[-1] if "/" in model_id else model_id
+            method_figs = _generate_analysis_figs(pipeline, model_short)
+            analysis_figs.extend(method_figs)
         results.append(entry)
         # ── Telemetry: log multi-model benchmark result ──
     # Generate dashboard visualizations
     from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
     dashboard_figs = generate_benchmark_dashboard(results, mode="multi_model", title_suffix=f" \u2014 {method_key}")
+    # Append per-model analysis charts (cross-layer heatmaps, topology maps, etc.)
+    all_figs = dashboard_figs + analysis_figs
+    gallery_images = _figs_to_gallery(all_figs)
     all_logs.append("\n" + "=" * 60)
     all_logs.append("MULTI-MODEL BENCHMARK COMPLETE")
+    all_logs.append(f"Generated {len(all_figs)} visualizations")
     all_logs.append("=" * 60)
     all_logs.append("\nJSON results:")
     all_logs.append(_json.dumps(results, indent=2, default=str))
             f"   or locally: `export HF_TOKEN=hf_...`\n\n"
             f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
             f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
+            "", gr.update(), gr.update(), gr.update(),
         )
         return
         if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
             yield (
                 "**Error:** Invalid Hub repo format. Use `username/model-name`.",
+                "", gr.update(), gr.update(), gr.update(),
             )
             return
         if not os.environ.get("HF_TOKEN"):
             yield (
                 "**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
                 "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
+                "", gr.update(), gr.update(), gr.update(),
             )
             return
     _clear_gpu()
     with _lock:
         if _state["status"] == "obliterating":
+            yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update()
             return
         _state["log"] = []
         _state["status"] = "obliterating"
         status_msg = f"**Obliterating\u2026** ({_elapsed()})"
         if len(log_lines) > last_yielded[0]:
             last_yielded[0] = len(log_lines)
+            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
         else:
+            yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
         if time.time() - _pipeline_start > _max_pipeline_secs:
             log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
             break
         err_msg = str(error_ref[0]) or repr(error_ref[0])
         log_lines.append(f"\nERROR: {err_msg}")
         _state["log"] = log_lines
+        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
         return
     # Success — keep model in memory for chat.
             if bnb_available:
                 log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
                 last_yielded[0] = len(log_lines)
+                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
                 try:
                     from transformers import BitsAndBytesConfig
                     bnb_cfg = BitsAndBytesConfig(
                     else "Falling back to CPU offload..."
                 )
                 last_yielded[0] = len(log_lines)
+                yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
                 try:
                     offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
                     model_reloaded = AutoModelForCausalLM.from_pretrained(
                     with _lock:
                         _state["status"] = "idle"
+        # Build metrics summary card while pipeline is still alive
+        metrics_card = _format_obliteration_metrics(pipeline, method, _elapsed())
         # Free pipeline internals we no longer need (activations, directions cache)
         # to reclaim memory — we've already extracted the model and steering metadata.
         pipeline_ref[0] = None
             choices=_get_session_model_choices(),
             value=_last_obliterated_label or None,
         )
+        yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card
     except Exception as e:
         # Ensure status never gets stuck on "obliterating"
         err_msg = str(e) or repr(e)
         log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
         _state["log"] = log_lines
+        yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
 # ---------------------------------------------------------------------------
                 )
                 if tokenizer.pad_token is None:
                     tokenizer.pad_token = tokenizer.eos_token
+                # Re-install activation steering hooks on the reloaded model
+                steering_meta = _state.get("steering")
+                if steering_meta:
+                    _install_steering_hooks(model, steering_meta)
                 with _lock:
                     _state["model"] = model
                     _state["tokenizer"] = tokenizer
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    # Streaming generation — repetition_penalty (user-controllable, default 1.0)
+    # can break degenerate refusal loops if increased.
     # Scale timeout with max_tokens: large generations need more time.
     # Base 120s + ~0.1s per token gives headroom for slow models.
     stream_timeout = max(120, 120 + int(max_tokens * 0.1))
         "temperature": max(temperature, 0.01),
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
         "streamer": streamer,
     }
                 )
                 if tokenizer.pad_token is None:
                     tokenizer.pad_token = tokenizer.eos_token
+                # Re-install activation steering hooks on the reloaded model
+                steering_meta = _state.get("steering")
+                if steering_meta:
+                    _install_steering_hooks(abliterated_model, steering_meta)
                 with _lock:
                     _state["model"] = abliterated_model
                     _state["tokenizer"] = tokenizer
         "temperature": max(temperature, 0.01),
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
     }
     # Add user message to both histories
             )
             status_md = gr.Markdown("")
+            metrics_md = gr.Markdown("")
             log_box = gr.Textbox(
                 label="Pipeline Log",
                 lines=20,
                     top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
                     max_tokens = gr.Slider(32, 4096, value=512, step=32, label="Max Tokens")
                     repetition_penalty = gr.Slider(
+                        1.0, 2.0, value=1.0, step=0.05,
                         label="Repetition Penalty",
+                        info="Penalizes repeated tokens — increase if model loops (1.0 = off)",
                     )
                     context_length = gr.Slider(
                         128, 32768, value=2048, step=128,
                     ab_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
                     ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
                     ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
+                    ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.0, step=0.05, label="Rep Penalty")
                     ab_context_length = gr.Slider(
                         128, 32768, value=2048, step=128,
                         label="Context Length",
         fn=obliterate,
         inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
                 custom_harmful_tb, custom_harmless_tb] + _adv_controls,
+        outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md],
     ).then(
         fn=lambda: (
             gr.update(choices=_get_session_model_choices()),

obliteratus/.DS_Store CHANGED Viewed

Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ

obliteratus/analysis/visualization.py CHANGED Viewed

@@ -15,12 +15,35 @@ Visualizations:
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 import torch
 def plot_refusal_topology(
     refusal_directions: dict[int, torch.Tensor],
     harmful_means: dict[int, torch.Tensor],
@@ -35,11 +58,7 @@ def plot_refusal_topology(
     harmful-harmless mean difference projected onto the refusal direction),
     with strong layers highlighted.
     """
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     layers = sorted(refusal_directions.keys())
     strengths = []
     for idx in layers:
@@ -86,11 +105,7 @@ def plot_cross_layer_heatmap(
     title: str = "Cross-Layer Refusal Direction Alignment",
 ):
     """Visualize the pairwise cosine similarity matrix between layer refusal directions."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     matrix = cross_layer_result.cosine_matrix.numpy()
     indices = cross_layer_result.layer_indices
     n = len(indices)
@@ -132,11 +147,7 @@ def plot_angular_drift(
     title: str = "Refusal Direction Angular Drift Through Network",
 ):
     """Visualize cumulative angular drift of the refusal direction."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     indices = cross_layer_result.layer_indices
     drift = cross_layer_result.angular_drift
@@ -173,11 +184,6 @@ def plot_logit_lens_spectrum(
     title: str | None = None,
 ):
     """Visualize the logit lens token promotion/suppression spectrum."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     # Select which layer to display
     if layer_idx is not None:
         result = logit_lens_result.per_layer.get(layer_idx)
@@ -189,6 +195,7 @@ def plot_logit_lens_spectrum(
     if title is None:
         title = f"Logit Lens — Layer {result.layer_idx}"
     # Combine top promoted and suppressed
     promoted = result.top_promoted[:15]
@@ -233,10 +240,7 @@ def plot_defense_radar(
     title: str = "Defense Robustness Profile",
 ):
     """Spider/radar chart of defense properties."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     import numpy as np
     categories = [
@@ -271,7 +275,7 @@ def plot_defense_radar(
     ax.set_ylim(0, 1)
     ax.set_yticks([0.25, 0.5, 0.75, 1.0])
     ax.set_yticklabels(["0.25", "0.50", "0.75", "1.00"], fontsize=8)
-    ax.set_title(f"{title}\n{p.model_name}", fontsize=14, fontweight="bold", pad=20)
     # Robustness badge
     robustness_colors = {
@@ -304,11 +308,7 @@ def plot_capability_safety_pareto(
     title: str = "Capability-Safety Pareto Frontier",
 ):
     """Plot the capability vs safety tradeoff."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     # Current point
     scores = [r.score for r in benchmark_results.values()]
     capability = sum(scores) / max(len(scores), 1)
@@ -363,11 +363,7 @@ def plot_probe_dashboard(
     title: str = "Activation Probe Dashboard",
 ):
     """Dashboard showing per-layer refusal elimination status."""
-    import matplotlib
-    if output_path:
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
     layers = sorted(probe_result.per_layer.keys())
     gaps = [probe_result.per_layer[idx].projection_gap for idx in layers]
     d_primes = [probe_result.per_layer[idx].separation_d_prime for idx in layers]

 from __future__ import annotations
+import re
 from pathlib import Path
 from typing import Any
+import matplotlib
+matplotlib.use("Agg")  # Set once at import time; safe for server & notebook
+import matplotlib.pyplot as plt
 import torch
+def _sanitize_label(text: str, max_len: int = 80) -> str:
+    """Strip filesystem paths, tokens, and overly-long strings from labels.
+    Prevents accidental leakage of sensitive information (HF org names,
+    local paths, API tokens) into saved chart images.
+    """
+    # Remove anything that looks like an absolute path
+    text = re.sub(r"(/[a-zA-Z0-9_./-]{3,})", lambda m: m.group(0).rsplit("/", 1)[-1], text)
+    # Remove HF-style token prefixes (hf_...)
+    text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
+    # Remove generic secret-like hex strings (32+ chars)
+    text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
+    # Truncate
+    if len(text) > max_len:
+        text = text[: max_len - 3] + "..."
+    return text
 def plot_refusal_topology(
     refusal_directions: dict[int, torch.Tensor],
     harmful_means: dict[int, torch.Tensor],
     harmful-harmless mean difference projected onto the refusal direction),
     with strong layers highlighted.
     """
+    title = _sanitize_label(title)
     layers = sorted(refusal_directions.keys())
     strengths = []
     for idx in layers:
     title: str = "Cross-Layer Refusal Direction Alignment",
 ):
     """Visualize the pairwise cosine similarity matrix between layer refusal directions."""
+    title = _sanitize_label(title)
     matrix = cross_layer_result.cosine_matrix.numpy()
     indices = cross_layer_result.layer_indices
     n = len(indices)
     title: str = "Refusal Direction Angular Drift Through Network",
 ):
     """Visualize cumulative angular drift of the refusal direction."""
+    title = _sanitize_label(title)
     indices = cross_layer_result.layer_indices
     drift = cross_layer_result.angular_drift
     title: str | None = None,
 ):
     """Visualize the logit lens token promotion/suppression spectrum."""
     # Select which layer to display
     if layer_idx is not None:
         result = logit_lens_result.per_layer.get(layer_idx)
     if title is None:
         title = f"Logit Lens — Layer {result.layer_idx}"
+    title = _sanitize_label(title)
     # Combine top promoted and suppressed
     promoted = result.top_promoted[:15]
     title: str = "Defense Robustness Profile",
 ):
     """Spider/radar chart of defense properties."""
+    title = _sanitize_label(title)
     import numpy as np
     categories = [
     ax.set_ylim(0, 1)
     ax.set_yticks([0.25, 0.5, 0.75, 1.0])
     ax.set_yticklabels(["0.25", "0.50", "0.75", "1.00"], fontsize=8)
+    ax.set_title(f"{title}\n{_sanitize_label(p.model_name)}", fontsize=14, fontweight="bold", pad=20)
     # Robustness badge
     robustness_colors = {
     title: str = "Capability-Safety Pareto Frontier",
 ):
     """Plot the capability vs safety tradeoff."""
+    title = _sanitize_label(title)
     # Current point
     scores = [r.score for r in benchmark_results.values()]
     capability = sum(scores) / max(len(scores), 1)
     title: str = "Activation Probe Dashboard",
 ):
     """Dashboard showing per-layer refusal elimination status."""
+    title = _sanitize_label(title)
     layers = sorted(probe_result.per_layer.keys())
     gaps = [probe_result.per_layer[idx].projection_gap for idx in layers]
     d_primes = [probe_result.per_layer[idx].separation_d_prime for idx in layers]

obliteratus/evaluation/benchmark_plots.py CHANGED Viewed

@@ -18,6 +18,8 @@ Gradio's gr.Plot component.
 from __future__ import annotations
 import matplotlib
 matplotlib.use("Agg")  # Non-interactive backend for server use
 import matplotlib.pyplot as plt
@@ -25,6 +27,16 @@ import matplotlib.ticker as mticker
 import numpy as np
 # ── Color palette (research-paper friendly, colorblind-safe) ──────────────
 PALETTE = {
@@ -79,7 +91,7 @@ def plot_pareto_frontier(results: list[dict], title_suffix: str = "") -> plt.Fig
     xs = [r["refusal_rate"] * 100 for r in valid]  # percentage
     ys = [r["perplexity"] for r in valid]
-    labels = [r.get("method", r.get("model_short", "?")) for r in valid]
     colors = [_get_color(r.get("method", ""), i) for i, r in enumerate(valid)]
     # Scatter
@@ -161,7 +173,7 @@ def plot_method_radar(results: list[dict], title_suffix: str = "") -> plt.Figure
     angles += angles[:1]  # close the polygon
     for i, r in enumerate(valid):
-        label = r.get("method", r.get("model_short", f"run-{i}"))
         color = _get_color(r.get("method", ""), i)
         refusal_removal = 1.0 - (r.get("refusal_rate") or 1.0)
@@ -202,7 +214,7 @@ def plot_metric_bars(results: list[dict], title_suffix: str = "") -> plt.Figure:
         fig.tight_layout()
         return fig
-    labels = [r.get("method", r.get("model_short", "?")) for r in valid]
     n = len(valid)
     # Metrics
@@ -257,7 +269,7 @@ def plot_timing_efficiency(results: list[dict], title_suffix: str = "") -> plt.F
     fig, ax = plt.subplots(figsize=(8, 5.5))
     for i, r in enumerate(valid):
-        label = r.get("method", r.get("model_short", "?"))
         color = _get_color(r.get("method", ""), i)
         x = r["time_s"]
         # Composite quality: weighted sum of refusal removal and coherence
@@ -295,7 +307,7 @@ def plot_moe_metrics(results: list[dict], title_suffix: str = "") -> plt.Figure:
     if not valid:
         # Show all results with a note about no MoE activity
         fig, ax = plt.subplots(figsize=(8, 4))
-        all_labels = [r.get("method", r.get("model_short", "?")) for r in results]
         if results:
             ax.barh(all_labels, [0] * len(results), color="#cccccc")
             ax.text(0.5, 0.5, "No MoE-specific features activated\n(use surgical/optimized/nuclear methods)",
@@ -306,7 +318,7 @@ def plot_moe_metrics(results: list[dict], title_suffix: str = "") -> plt.Figure:
         fig.tight_layout()
         return fig
-    labels = [r.get("method", r.get("model_short", "?")) for r in valid]
     ega = [r.get("ega_expert_dirs", 0) for r in valid]
     cot = [r.get("cot_preserved", 0) for r in valid]
     safety = [r.get("expert_classified_layers", r.get("ega_safety_layers", 0)) for r in valid]
@@ -349,7 +361,7 @@ def plot_model_scaling(results: list[dict], title_suffix: str = "") -> plt.Figur
         fig.tight_layout()
         return fig
-    labels = [r.get("model_short", r.get("model", "?").split("/")[-1]) for r in valid]
     n = len(valid)
     x = np.arange(n)

 from __future__ import annotations
+import re
 import matplotlib
 matplotlib.use("Agg")  # Non-interactive backend for server use
 import matplotlib.pyplot as plt
 import numpy as np
+def _sanitize_label(text: str, max_len: int = 80) -> str:
+    """Strip filesystem paths, tokens, and overly-long strings from labels.
+
+    Filesystem paths are collapsed to their final component, ``hf_...``
+    tokens become ``<TOKEN>``, hex runs of 32+ characters become
+    ``<REDACTED>``, and the result is capped at ``max_len`` characters.
+    A single-slash ``org/name`` identifier is not treated as a path and is
+    kept intact, so model ids stay readable on charts.
+
+    Args:
+        text: Label to clean. ``None`` yields ``""``; any other non-string
+            value is converted with ``str()`` first.
+        max_len: Maximum length of the returned string.
+
+    Returns:
+        The sanitized label, never longer than ``max(max_len, 0)``.
+    """
+    if text is None:
+        return ""
+    if not isinstance(text, str):
+        text = str(text)
+    def _collapse_path(match: re.Match) -> str:
+        token = match.group(0)
+        parts = [part for part in token.split("/") if part]
+        # "org/name" (exactly one inner slash, no "~" or "." prefix) is a
+        # repo-style identifier rather than a path: leave it untouched.
+        is_identifier = (
+            token.count("/") == 1
+            and len(parts) == 2
+            and not token.startswith(("~", "."))
+        )
+        return token if is_identifier else parts[-1]
+    # The lookbehind anchors each match at the start of a path-like token so
+    # the whole token (not just its tail) is inspected, and keeps the scan
+    # linear on long slash-free strings.
+    text = re.sub(
+        r"(?<![A-Za-z0-9_.~-])[A-Za-z0-9_.~-]*(?:/+[A-Za-z0-9_.-]+)+/*",
+        _collapse_path,
+        text,
+    )
+    text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
+    text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
+    if len(text) > max_len:
+        if max_len < 3:
+            # No room for an ellipsis; a negative slice index would otherwise
+            # return more characters than requested.
+            text = text[: max(max_len, 0)]
+        else:
+            text = text[: max_len - 3] + "..."
+    return text
 # ── Color palette (research-paper friendly, colorblind-safe) ──────────────
 PALETTE = {
     xs = [r["refusal_rate"] * 100 for r in valid]  # percentage
     ys = [r["perplexity"] for r in valid]
+    labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
     colors = [_get_color(r.get("method", ""), i) for i, r in enumerate(valid)]
     # Scatter
     angles += angles[:1]  # close the polygon
     for i, r in enumerate(valid):
+        label = _sanitize_label(r.get("method", r.get("model_short", f"run-{i}")))
         color = _get_color(r.get("method", ""), i)
         refusal_removal = 1.0 - (r.get("refusal_rate") or 1.0)
         fig.tight_layout()
         return fig
+    labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
     n = len(valid)
     # Metrics
     fig, ax = plt.subplots(figsize=(8, 5.5))
     for i, r in enumerate(valid):
+        label = _sanitize_label(r.get("method", r.get("model_short", "?")))
         color = _get_color(r.get("method", ""), i)
         x = r["time_s"]
         # Composite quality: weighted sum of refusal removal and coherence
     if not valid:
         # Show all results with a note about no MoE activity
         fig, ax = plt.subplots(figsize=(8, 4))
+        all_labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in results]
         if results:
             ax.barh(all_labels, [0] * len(results), color="#cccccc")
             ax.text(0.5, 0.5, "No MoE-specific features activated\n(use surgical/optimized/nuclear methods)",
         fig.tight_layout()
         return fig
+    labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
     ega = [r.get("ega_expert_dirs", 0) for r in valid]
     cot = [r.get("cot_preserved", 0) for r in valid]
     safety = [r.get("expert_classified_layers", r.get("ega_safety_layers", 0)) for r in valid]
         fig.tight_layout()
         return fig
+    labels = [_sanitize_label(r.get("model_short", r.get("model", "?").split("/")[-1])) for r in valid]
     n = len(valid)
     x = np.arange(n)

obliteratus/reporting/report.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import json
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
@@ -10,6 +11,16 @@ from typing import Any
 import pandas as pd
 @dataclass
 class AblationResult:
     """Result of a single ablation experiment."""
@@ -69,7 +80,7 @@ class AblationReport:
             console.print("[yellow]No ablation results to display.[/yellow]")
             return
-        table = Table(title=f"Ablation Results: {self.model_name}")
         table.add_column("Strategy", style="cyan")
         table.add_column("Component", style="green")
@@ -154,7 +165,7 @@ class AblationReport:
         ax.set_xlabel(f"Change in {metric} (vs baseline)")
         ax.set_ylabel("Ablated Component")
-        ax.set_title(f"Ablation Impact on {metric} — {self.model_name}")
         ax.axvline(x=0, color="black", linewidth=0.8)
         plt.tight_layout()
@@ -183,7 +194,7 @@ class AblationReport:
         fig, ax = plt.subplots(figsize=(max(6, len(pivot.columns) * 2), max(4, len(pivot) * 0.4)))
         sns.heatmap(pivot, annot=True, fmt=".1f", cmap="RdYlGn_r", center=0, ax=ax)
-        ax.set_title(f"Ablation % Change — {self.model_name}")
         plt.tight_layout()
         if output_path:

 from __future__ import annotations
 import json
+import re
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 import pandas as pd
+def _sanitize_label(text: str, max_len: int = 80) -> str:
+    """Strip filesystem paths, tokens, and overly-long strings from labels.
+
+    Filesystem paths are collapsed to their final component, ``hf_...``
+    tokens become ``<TOKEN>``, hex runs of 32+ characters become
+    ``<REDACTED>``, and the result is capped at ``max_len`` characters.
+    A single-slash ``org/name`` identifier is not treated as a path and is
+    kept intact, so model names stay readable in report titles.
+
+    Args:
+        text: Label to clean. ``None`` yields ``""``; any other non-string
+            value is converted with ``str()`` first.
+        max_len: Maximum length of the returned string.
+
+    Returns:
+        The sanitized label, never longer than ``max(max_len, 0)``.
+    """
+    if text is None:
+        return ""
+    if not isinstance(text, str):
+        text = str(text)
+    def _collapse_path(match: re.Match) -> str:
+        token = match.group(0)
+        parts = [part for part in token.split("/") if part]
+        # "org/name" (exactly one inner slash, no "~" or "." prefix) is a
+        # repo-style identifier rather than a path: leave it untouched.
+        is_identifier = (
+            token.count("/") == 1
+            and len(parts) == 2
+            and not token.startswith(("~", "."))
+        )
+        return token if is_identifier else parts[-1]
+    # The lookbehind anchors each match at the start of a path-like token so
+    # the whole token (not just its tail) is inspected, and keeps the scan
+    # linear on long slash-free strings.
+    text = re.sub(
+        r"(?<![A-Za-z0-9_.~-])[A-Za-z0-9_.~-]*(?:/+[A-Za-z0-9_.-]+)+/*",
+        _collapse_path,
+        text,
+    )
+    text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
+    text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
+    if len(text) > max_len:
+        if max_len < 3:
+            # No room for an ellipsis; a negative slice index would otherwise
+            # return more characters than requested.
+            text = text[: max(max_len, 0)]
+        else:
+            text = text[: max_len - 3] + "..."
+    return text
 @dataclass
 class AblationResult:
     """Result of a single ablation experiment."""
             console.print("[yellow]No ablation results to display.[/yellow]")
             return
+        table = Table(title=f"Ablation Results: {_sanitize_label(self.model_name)}")
         table.add_column("Strategy", style="cyan")
         table.add_column("Component", style="green")
         ax.set_xlabel(f"Change in {metric} (vs baseline)")
         ax.set_ylabel("Ablated Component")
+        ax.set_title(f"Ablation Impact on {metric} — {_sanitize_label(self.model_name)}")
         ax.axvline(x=0, color="black", linewidth=0.8)
         plt.tight_layout()
         fig, ax = plt.subplots(figsize=(max(6, len(pivot.columns) * 2), max(4, len(pivot) * 0.4)))
         sns.heatmap(pivot, annot=True, fmt=".1f", cmap="RdYlGn_r", center=0, ax=ax)
+        ax.set_title(f"Ablation % Change — {_sanitize_label(self.model_name)}")
         plt.tight_layout()
         if output_path:

tests/test_visualization.py CHANGED Viewed

@@ -11,6 +11,7 @@ import torch
 from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
 from obliteratus.analysis.activation_probing import ActivationProbe
 from obliteratus.analysis.visualization import (
     plot_refusal_topology,
     plot_cross_layer_heatmap,
     plot_angular_drift,
@@ -117,3 +118,50 @@ class TestDefenseRadar:
         path = tmp_dir / "radar.png"
         plot_defense_radar(profile, output_path=path)
         assert path.exists()

 from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
 from obliteratus.analysis.activation_probing import ActivationProbe
 from obliteratus.analysis.visualization import (
+    _sanitize_label,
     plot_refusal_topology,
     plot_cross_layer_heatmap,
     plot_angular_drift,
         path = tmp_dir / "radar.png"
         plot_defense_radar(profile, output_path=path)
         assert path.exists()
+    def test_model_name_sanitized_in_title(self, tmp_dir):
+        """A cache path used as model_name must not appear in the radar title."""
+        profile_kwargs = dict(
+            model_name="/home/user/.cache/huggingface/hub/models--secret-org/private-model",
+            alignment_type_estimate="RLHF-like",
+            refusal_concentration=0.4,
+            refusal_layer_spread=5,
+            mean_refusal_strength=2.0,
+            max_refusal_strength=4.0,
+            self_repair_estimate=0.6,
+            entanglement_score=0.3,
+            estimated_robustness="medium",
+        )
+        sensitive_profile = DefenseProfile(**profile_kwargs)
+        figure = plot_defense_radar(
+            sensitive_profile,
+            output_path=tmp_dir / "radar_sanitized.png",
+        )
+        # Inspect the rendered title on the first axes for leaked path pieces.
+        rendered_title = figure.axes[0].get_title()
+        for leaked_fragment in ("/home/user", ".cache"):
+            assert leaked_fragment not in rendered_title
+class TestSanitizeLabel:
+    """Unit tests for the ``_sanitize_label`` helper."""
+    def test_strips_absolute_paths(self):
+        """Leading directories are removed while the final component stays."""
+        cleaned = _sanitize_label("/home/user/.cache/huggingface/models--org/model")
+        assert "/home/user" not in cleaned
+        assert "model" in cleaned
+    def test_redacts_hf_tokens(self):
+        """An ``hf_`` token is replaced by the ``<TOKEN>`` placeholder."""
+        cleaned = _sanitize_label("model with hf_abcdefghij token")
+        assert "<TOKEN>" in cleaned
+        assert "hf_abcdefghij" not in cleaned
+    def test_redacts_long_hex_strings(self):
+        """A 40-character hex run is replaced by ``<REDACTED>``."""
+        digest = "a" * 40
+        cleaned = _sanitize_label(f"commit {digest}")
+        assert "<REDACTED>" in cleaned
+        assert digest not in cleaned
+    def test_truncates_long_strings(self):
+        """Over-long labels are cut to at most 80 chars ending in an ellipsis."""
+        cleaned = _sanitize_label("x" * 200)
+        assert cleaned.endswith("...")
+        assert len(cleaned) <= 80
+    def test_passes_normal_strings_through(self):
+        """An ordinary chart title comes back unchanged."""
+        plain_title = "Refusal Topology Map"
+        assert _sanitize_label(plain_title) == plain_title