pliny-the-prompter committed on
Commit
664144c
·
verified ·
1 Parent(s): d419e87

Upload 129 files

Browse files
app.py CHANGED
@@ -588,6 +588,114 @@ _BENCH_TEST_PROMPTS = [
588
  ]
589
 
590
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  def _figs_to_gallery(figs: list) -> list[tuple[str, str]]:
592
  """Convert matplotlib Figures to gallery-compatible (filepath, caption) tuples."""
593
  import tempfile
@@ -648,6 +756,7 @@ def benchmark(
648
 
649
  results = []
650
  all_logs = []
 
651
 
652
  # Compute actual prompt count that will be used
653
  if prompt_volume > 0:
@@ -815,6 +924,10 @@ def benchmark(
815
  all_logs.append(f" Strong layers: {entry['strong_layers']}")
816
  all_logs.append(f" EGA expert directions: {entry['ega_expert_dirs']}")
817
 
 
 
 
 
818
  results.append(entry)
819
 
820
  # ── Telemetry: log benchmark result for community leaderboard ──
@@ -873,13 +986,16 @@ def benchmark(
873
  from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
874
  dashboard_figs = generate_benchmark_dashboard(results, mode="multi_method", title_suffix=f" — {model_id}")
875
 
 
 
 
876
  # Convert figures to gallery images
877
- gallery_images = _figs_to_gallery(dashboard_figs)
878
 
879
  # Final summary
880
  all_logs.append("\n" + "=" * 60)
881
  all_logs.append("BENCHMARK COMPLETE")
882
- all_logs.append(f"Generated {len(dashboard_figs)} visualizations")
883
  all_logs.append("=" * 60)
884
  all_logs.append("\nJSON results:")
885
  all_logs.append(_json.dumps(results, indent=2, default=str))
@@ -999,6 +1115,7 @@ def benchmark_multi_model(
999
 
1000
  results = []
1001
  all_logs = []
 
1002
  bench_context = {
1003
  "method": method_key,
1004
  "dataset": source_label,
@@ -1151,6 +1268,11 @@ def benchmark_multi_model(
1151
  all_logs.append(f" Completed in {elapsed:.1f}s")
1152
  all_logs.append(f" PPL={entry['perplexity']}, Coherence={entry['coherence']}, Refusal={entry['refusal_rate']}")
1153
 
 
 
 
 
 
1154
  results.append(entry)
1155
 
1156
  # ── Telemetry: log multi-model benchmark result ──
@@ -1206,11 +1328,15 @@ def benchmark_multi_model(
1206
  # Generate dashboard visualizations
1207
  from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
1208
  dashboard_figs = generate_benchmark_dashboard(results, mode="multi_model", title_suffix=f" \u2014 {method_key}")
1209
- gallery_images = _figs_to_gallery(dashboard_figs)
 
 
 
 
1210
 
1211
  all_logs.append("\n" + "=" * 60)
1212
  all_logs.append("MULTI-MODEL BENCHMARK COMPLETE")
1213
- all_logs.append(f"Generated {len(dashboard_figs)} visualizations")
1214
  all_logs.append("=" * 60)
1215
  all_logs.append("\nJSON results:")
1216
  all_logs.append(_json.dumps(results, indent=2, default=str))
@@ -1333,7 +1459,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1333
  f" or locally: `export HF_TOKEN=hf_...`\n\n"
1334
  f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
1335
  f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
1336
- "", gr.update(), gr.update(),
1337
  )
1338
  return
1339
 
@@ -1342,14 +1468,14 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1342
  if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
1343
  yield (
1344
  "**Error:** Invalid Hub repo format. Use `username/model-name`.",
1345
- "", gr.update(), gr.update(),
1346
  )
1347
  return
1348
  if not os.environ.get("HF_TOKEN"):
1349
  yield (
1350
  "**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
1351
  "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
1352
- "", gr.update(), gr.update(),
1353
  )
1354
  return
1355
 
@@ -1360,7 +1486,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1360
  _clear_gpu()
1361
  with _lock:
1362
  if _state["status"] == "obliterating":
1363
- yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update()
1364
  return
1365
  _state["log"] = []
1366
  _state["status"] = "obliterating"
@@ -1512,9 +1638,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1512
  status_msg = f"**Obliterating\u2026** ({_elapsed()})"
1513
  if len(log_lines) > last_yielded[0]:
1514
  last_yielded[0] = len(log_lines)
1515
- yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
1516
  else:
1517
- yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
1518
  if time.time() - _pipeline_start > _max_pipeline_secs:
1519
  log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
1520
  break
@@ -1529,7 +1655,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1529
  err_msg = str(error_ref[0]) or repr(error_ref[0])
1530
  log_lines.append(f"\nERROR: {err_msg}")
1531
  _state["log"] = log_lines
1532
- yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update()
1533
  return
1534
 
1535
  # Success — keep model in memory for chat.
@@ -1631,7 +1757,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1631
  if bnb_available:
1632
  log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
1633
  last_yielded[0] = len(log_lines)
1634
- yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
1635
  try:
1636
  from transformers import BitsAndBytesConfig
1637
  bnb_cfg = BitsAndBytesConfig(
@@ -1678,7 +1804,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1678
  else "Falling back to CPU offload..."
1679
  )
1680
  last_yielded[0] = len(log_lines)
1681
- yield status_msg, "\n".join(log_lines), gr.update(), gr.update()
1682
  try:
1683
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
1684
  model_reloaded = AutoModelForCausalLM.from_pretrained(
@@ -1713,6 +1839,9 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1713
  with _lock:
1714
  _state["status"] = "idle"
1715
 
 
 
 
1716
  # Free pipeline internals we no longer need (activations, directions cache)
1717
  # to reclaim memory — we've already extracted the model and steering metadata.
1718
  pipeline_ref[0] = None
@@ -1738,7 +1867,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1738
  choices=_get_session_model_choices(),
1739
  value=_last_obliterated_label or None,
1740
  )
1741
- yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update
1742
 
1743
  except Exception as e:
1744
  # Ensure status never gets stuck on "obliterating"
@@ -1747,7 +1876,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1747
  err_msg = str(e) or repr(e)
1748
  log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
1749
  _state["log"] = log_lines
1750
- yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update()
1751
 
1752
 
1753
  # ---------------------------------------------------------------------------
@@ -1837,6 +1966,10 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
1837
  )
1838
  if tokenizer.pad_token is None:
1839
  tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
1840
  with _lock:
1841
  _state["model"] = model
1842
  _state["tokenizer"] = tokenizer
@@ -1874,9 +2007,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
1874
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
1875
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
1876
 
1877
- # Streaming generation — repetition_penalty and no_repeat_ngram_size
1878
- # break degenerate refusal loops where the model gets stuck cycling
1879
- # through fragments of its safety response
1880
  # Scale timeout with max_tokens: large generations need more time.
1881
  # Base 120s + ~0.1s per token gives headroom for slow models.
1882
  stream_timeout = max(120, 120 + int(max_tokens * 0.1))
@@ -1888,7 +2020,6 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
1888
  "temperature": max(temperature, 0.01),
1889
  "top_p": top_p,
1890
  "repetition_penalty": float(repetition_penalty),
1891
- "no_repeat_ngram_size": 4,
1892
  "streamer": streamer,
1893
  }
1894
 
@@ -2199,6 +2330,10 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2199
  )
2200
  if tokenizer.pad_token is None:
2201
  tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
2202
  with _lock:
2203
  _state["model"] = abliterated_model
2204
  _state["tokenizer"] = tokenizer
@@ -2240,7 +2375,6 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2240
  "temperature": max(temperature, 0.01),
2241
  "top_p": top_p,
2242
  "repetition_penalty": float(repetition_penalty),
2243
- "no_repeat_ngram_size": 4,
2244
  }
2245
 
2246
  # Add user message to both histories
@@ -3236,6 +3370,7 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
3236
  )
3237
 
3238
  status_md = gr.Markdown("")
 
3239
  log_box = gr.Textbox(
3240
  label="Pipeline Log",
3241
  lines=20,
@@ -3606,9 +3741,9 @@ Pre-configured benchmark configurations for common research questions.
3606
  top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
3607
  max_tokens = gr.Slider(32, 4096, value=512, step=32, label="Max Tokens")
3608
  repetition_penalty = gr.Slider(
3609
- 1.0, 2.0, value=1.15, step=0.05,
3610
  label="Repetition Penalty",
3611
- info="Penalizes repeated tokens — higher values break refusal loops (1.0 = off)",
3612
  )
3613
  context_length = gr.Slider(
3614
  128, 32768, value=2048, step=128,
@@ -3656,7 +3791,7 @@ See exactly how abliteration changes model behavior on the same prompt.
3656
  ab_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
3657
  ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
3658
  ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
3659
- ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.15, step=0.05, label="Rep Penalty")
3660
  ab_context_length = gr.Slider(
3661
  128, 32768, value=2048, step=128,
3662
  label="Context Length",
@@ -4063,7 +4198,7 @@ Built on the shoulders of:
4063
  fn=obliterate,
4064
  inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
4065
  custom_harmful_tb, custom_harmless_tb] + _adv_controls,
4066
- outputs=[status_md, log_box, chat_status, session_model_dd],
4067
  ).then(
4068
  fn=lambda: (
4069
  gr.update(choices=_get_session_model_choices()),
 
588
  ]
589
 
590
 
591
+ def _format_obliteration_metrics(pipeline, method: str, elapsed_str: str) -> str:
592
+ """Format post-obliteration quality metrics as a concise Markdown card."""
593
+ metrics = getattr(pipeline, "_quality_metrics", {}) or {}
594
+ ppl = metrics.get("perplexity")
595
+ coh = metrics.get("coherence")
596
+ ref = metrics.get("refusal_rate")
597
+ kl = metrics.get("kl_divergence")
598
+ n_layers = len(getattr(pipeline, "_strong_layers", []))
599
+
600
+ parts = ["### Liberation Results\n"]
601
+ parts.append("| Metric | Value | |")
602
+ parts.append("|--------|------:|---|")
603
+
604
+ if ref is not None:
605
+ pct = ref * 100
606
+ icon = "🟢" if pct < 10 else "🟡" if pct < 30 else "🔴"
607
+ parts.append(f"| Refusal Rate | **{pct:.1f}%** | {icon} |")
608
+ if coh is not None:
609
+ pct = coh * 100
610
+ icon = "🟢" if pct > 80 else "🟡" if pct > 60 else "🔴"
611
+ parts.append(f"| Coherence | **{pct:.1f}%** | {icon} |")
612
+ if ppl is not None:
613
+ icon = "🟢" if ppl < 12 else "🟡" if ppl < 20 else "🔴"
614
+ parts.append(f"| Perplexity | **{ppl:.2f}** | {icon} |")
615
+ if kl is not None:
616
+ icon = "🟢" if kl < 0.05 else "🟡" if kl < 0.1 else "🔴"
617
+ parts.append(f"| KL Divergence | **{kl:.4f}** | {icon} |")
618
+ if n_layers > 0:
619
+ parts.append(f"| Layers Modified | **{n_layers}** | |")
620
+
621
+ if not metrics:
622
+ return ""
623
+
624
+ return "\n".join(parts)
625
+
626
+
627
+ def _generate_analysis_figs(pipeline, model_label: str = "") -> list:
628
+ """Generate analysis visualizations from a completed pipeline's surviving data.
629
+
630
+ Produces cross-layer heatmap + angular drift charts from refusal_directions
631
+ (which persist after pipeline.run()), and a refusal topology chart using
632
+ direction norms as a proxy for signal strength (since activation means are
633
+ freed during execution).
634
+ """
635
+ figs = []
636
+ directions = getattr(pipeline, "refusal_directions", {})
637
+ strong_layers = getattr(pipeline, "_strong_layers", [])
638
+
639
+ if len(directions) < 2:
640
+ return figs
641
+
642
+ try:
643
+ from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
644
+ from obliteratus.analysis.visualization import (
645
+ plot_cross_layer_heatmap,
646
+ plot_angular_drift,
647
+ )
648
+ import tempfile, os
649
+
650
+ analyzer = CrossLayerAlignmentAnalyzer()
651
+ result = analyzer.analyze(directions)
652
+
653
+ suffix = f" — {model_label}" if model_label else ""
654
+
655
+ heatmap_fig = plot_cross_layer_heatmap(
656
+ result,
657
+ output_path=tempfile.mktemp(suffix=".png"),
658
+ title=f"Cross-Layer Direction Alignment{suffix}",
659
+ )
660
+ figs.append(heatmap_fig)
661
+
662
+ drift_fig = plot_angular_drift(
663
+ result,
664
+ output_path=tempfile.mktemp(suffix=".png"),
665
+ title=f"Refusal Direction Angular Drift{suffix}",
666
+ )
667
+ figs.append(drift_fig)
668
+ except Exception:
669
+ pass # Analysis charts are best-effort
670
+
671
+ # Refusal topology using direction norms as proxy (means are freed)
672
+ if directions and strong_layers:
673
+ try:
674
+ from obliteratus.analysis.visualization import plot_refusal_topology
675
+ import tempfile
676
+ # Build proxy means from direction norms
677
+ proxy_harmful = {}
678
+ proxy_harmless = {}
679
+ for idx, d in directions.items():
680
+ d_f = d.float().squeeze()
681
+ d_f = d_f / d_f.norm().clamp(min=1e-8)
682
+ # Simulate a separation proportional to the direction norm
683
+ norm = d.float().squeeze().norm().item()
684
+ proxy_harmless[idx] = torch.zeros_like(d_f).unsqueeze(0)
685
+ proxy_harmful[idx] = (d_f * norm).unsqueeze(0)
686
+
687
+ topo_fig = plot_refusal_topology(
688
+ directions, proxy_harmful, proxy_harmless, list(strong_layers),
689
+ output_path=tempfile.mktemp(suffix=".png"),
690
+ title=f"Refusal Topology Map{suffix}",
691
+ )
692
+ figs.append(topo_fig)
693
+ except Exception:
694
+ pass
695
+
696
+ return figs
697
+
698
+
699
  def _figs_to_gallery(figs: list) -> list[tuple[str, str]]:
700
  """Convert matplotlib Figures to gallery-compatible (filepath, caption) tuples."""
701
  import tempfile
 
756
 
757
  results = []
758
  all_logs = []
759
+ analysis_figs = [] # Cross-layer/topology charts from each pipeline run
760
 
761
  # Compute actual prompt count that will be used
762
  if prompt_volume > 0:
 
924
  all_logs.append(f" Strong layers: {entry['strong_layers']}")
925
  all_logs.append(f" EGA expert directions: {entry['ega_expert_dirs']}")
926
 
927
+ # Extract analysis visualizations before pipeline is freed
928
+ method_figs = _generate_analysis_figs(pipeline, method_key)
929
+ analysis_figs.extend(method_figs)
930
+
931
  results.append(entry)
932
 
933
  # ── Telemetry: log benchmark result for community leaderboard ──
 
986
  from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
987
  dashboard_figs = generate_benchmark_dashboard(results, mode="multi_method", title_suffix=f" — {model_id}")
988
 
989
+ # Append per-method analysis charts (cross-layer heatmaps, topology maps, etc.)
990
+ all_figs = dashboard_figs + analysis_figs
991
+
992
  # Convert figures to gallery images
993
+ gallery_images = _figs_to_gallery(all_figs)
994
 
995
  # Final summary
996
  all_logs.append("\n" + "=" * 60)
997
  all_logs.append("BENCHMARK COMPLETE")
998
+ all_logs.append(f"Generated {len(all_figs)} visualizations")
999
  all_logs.append("=" * 60)
1000
  all_logs.append("\nJSON results:")
1001
  all_logs.append(_json.dumps(results, indent=2, default=str))
 
1115
 
1116
  results = []
1117
  all_logs = []
1118
+ analysis_figs = [] # Cross-layer/topology charts from each pipeline run
1119
  bench_context = {
1120
  "method": method_key,
1121
  "dataset": source_label,
 
1268
  all_logs.append(f" Completed in {elapsed:.1f}s")
1269
  all_logs.append(f" PPL={entry['perplexity']}, Coherence={entry['coherence']}, Refusal={entry['refusal_rate']}")
1270
 
1271
+ # Extract analysis visualizations before pipeline is freed
1272
+ model_short = model_id.split("/")[-1] if "/" in model_id else model_id
1273
+ method_figs = _generate_analysis_figs(pipeline, model_short)
1274
+ analysis_figs.extend(method_figs)
1275
+
1276
  results.append(entry)
1277
 
1278
  # ── Telemetry: log multi-model benchmark result ──
 
1328
  # Generate dashboard visualizations
1329
  from obliteratus.evaluation.benchmark_plots import generate_benchmark_dashboard
1330
  dashboard_figs = generate_benchmark_dashboard(results, mode="multi_model", title_suffix=f" \u2014 {method_key}")
1331
+
1332
+ # Append per-model analysis charts (cross-layer heatmaps, topology maps, etc.)
1333
+ all_figs = dashboard_figs + analysis_figs
1334
+
1335
+ gallery_images = _figs_to_gallery(all_figs)
1336
 
1337
  all_logs.append("\n" + "=" * 60)
1338
  all_logs.append("MULTI-MODEL BENCHMARK COMPLETE")
1339
+ all_logs.append(f"Generated {len(all_figs)} visualizations")
1340
  all_logs.append("=" * 60)
1341
  all_logs.append("\nJSON results:")
1342
  all_logs.append(_json.dumps(results, indent=2, default=str))
 
1459
  f" or locally: `export HF_TOKEN=hf_...`\n\n"
1460
  f"Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)\n\n"
1461
  f"Alternatively, choose a non-gated model (those without the \U0001f512 icon).",
1462
+ "", gr.update(), gr.update(), gr.update(),
1463
  )
1464
  return
1465
 
 
1468
  if not re.match(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+$', push_to_hub):
1469
  yield (
1470
  "**Error:** Invalid Hub repo format. Use `username/model-name`.",
1471
+ "", gr.update(), gr.update(), gr.update(),
1472
  )
1473
  return
1474
  if not os.environ.get("HF_TOKEN"):
1475
  yield (
1476
  "**Error:** HF_TOKEN not set. Push to Hub requires a write token. "
1477
  "Set it via `export HF_TOKEN=hf_...` or in your Space secrets.",
1478
+ "", gr.update(), gr.update(), gr.update(),
1479
  )
1480
  return
1481
 
 
1486
  _clear_gpu()
1487
  with _lock:
1488
  if _state["status"] == "obliterating":
1489
+ yield "**Error:** An obliteration is already in progress.", "", gr.update(), gr.update(), gr.update()
1490
  return
1491
  _state["log"] = []
1492
  _state["status"] = "obliterating"
 
1638
  status_msg = f"**Obliterating\u2026** ({_elapsed()})"
1639
  if len(log_lines) > last_yielded[0]:
1640
  last_yielded[0] = len(log_lines)
1641
+ yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
1642
  else:
1643
+ yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
1644
  if time.time() - _pipeline_start > _max_pipeline_secs:
1645
  log_lines.append("\nTIMEOUT: Pipeline exceeded 45-minute limit.")
1646
  break
 
1655
  err_msg = str(error_ref[0]) or repr(error_ref[0])
1656
  log_lines.append(f"\nERROR: {err_msg}")
1657
  _state["log"] = log_lines
1658
+ yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
1659
  return
1660
 
1661
  # Success — keep model in memory for chat.
 
1757
  if bnb_available:
1758
  log_lines.append("\nModel too large for chat at float16 — reloading in 4-bit...")
1759
  last_yielded[0] = len(log_lines)
1760
+ yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
1761
  try:
1762
  from transformers import BitsAndBytesConfig
1763
  bnb_cfg = BitsAndBytesConfig(
 
1804
  else "Falling back to CPU offload..."
1805
  )
1806
  last_yielded[0] = len(log_lines)
1807
+ yield status_msg, "\n".join(log_lines), gr.update(), gr.update(), gr.update()
1808
  try:
1809
  offload_dir = tempfile.mkdtemp(prefix="obliteratus_offload_")
1810
  model_reloaded = AutoModelForCausalLM.from_pretrained(
 
1839
  with _lock:
1840
  _state["status"] = "idle"
1841
 
1842
+ # Build metrics summary card while pipeline is still alive
1843
+ metrics_card = _format_obliteration_metrics(pipeline, method, _elapsed())
1844
+
1845
  # Free pipeline internals we no longer need (activations, directions cache)
1846
  # to reclaim memory — we've already extracted the model and steering metadata.
1847
  pipeline_ref[0] = None
 
1867
  choices=_get_session_model_choices(),
1868
  value=_last_obliterated_label or None,
1869
  )
1870
+ yield status_msg, "\n".join(log_lines), get_chat_header(), _dd_update, metrics_card
1871
 
1872
  except Exception as e:
1873
  # Ensure status never gets stuck on "obliterating"
 
1876
  err_msg = str(e) or repr(e)
1877
  log_lines.append(f"\nERROR (post-pipeline): {err_msg}")
1878
  _state["log"] = log_lines
1879
+ yield f"**Error:** {err_msg}", "\n".join(log_lines), get_chat_header(), gr.update(), gr.update()
1880
 
1881
 
1882
  # ---------------------------------------------------------------------------
 
1966
  )
1967
  if tokenizer.pad_token is None:
1968
  tokenizer.pad_token = tokenizer.eos_token
1969
+ # Re-install activation steering hooks on the reloaded model
1970
+ steering_meta = _state.get("steering")
1971
+ if steering_meta:
1972
+ _install_steering_hooks(model, steering_meta)
1973
  with _lock:
1974
  _state["model"] = model
1975
  _state["tokenizer"] = tokenizer
 
2007
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
2008
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
2009
 
2010
+ # Streaming generation — repetition_penalty (user-controllable, default 1.0)
2011
+ # can break degenerate refusal loops if increased.
 
2012
  # Scale timeout with max_tokens: large generations need more time.
2013
  # Base 120s + ~0.1s per token gives headroom for slow models.
2014
  stream_timeout = max(120, 120 + int(max_tokens * 0.1))
 
2020
  "temperature": max(temperature, 0.01),
2021
  "top_p": top_p,
2022
  "repetition_penalty": float(repetition_penalty),
 
2023
  "streamer": streamer,
2024
  }
2025
 
 
2330
  )
2331
  if tokenizer.pad_token is None:
2332
  tokenizer.pad_token = tokenizer.eos_token
2333
+ # Re-install activation steering hooks on the reloaded model
2334
+ steering_meta = _state.get("steering")
2335
+ if steering_meta:
2336
+ _install_steering_hooks(abliterated_model, steering_meta)
2337
  with _lock:
2338
  _state["model"] = abliterated_model
2339
  _state["tokenizer"] = tokenizer
 
2375
  "temperature": max(temperature, 0.01),
2376
  "top_p": top_p,
2377
  "repetition_penalty": float(repetition_penalty),
 
2378
  }
2379
 
2380
  # Add user message to both histories
 
3370
  )
3371
 
3372
  status_md = gr.Markdown("")
3373
+ metrics_md = gr.Markdown("")
3374
  log_box = gr.Textbox(
3375
  label="Pipeline Log",
3376
  lines=20,
 
3741
  top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
3742
  max_tokens = gr.Slider(32, 4096, value=512, step=32, label="Max Tokens")
3743
  repetition_penalty = gr.Slider(
3744
+ 1.0, 2.0, value=1.0, step=0.05,
3745
  label="Repetition Penalty",
3746
+ info="Penalizes repeated tokens — increase if model loops (1.0 = off)",
3747
  )
3748
  context_length = gr.Slider(
3749
  128, 32768, value=2048, step=128,
 
3791
  ab_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
3792
  ab_top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P")
3793
  ab_max_tokens = gr.Slider(32, 2048, value=256, step=32, label="Max Tokens")
3794
+ ab_rep_penalty = gr.Slider(1.0, 2.0, value=1.0, step=0.05, label="Rep Penalty")
3795
  ab_context_length = gr.Slider(
3796
  128, 32768, value=2048, step=128,
3797
  label="Context Length",
 
4198
  fn=obliterate,
4199
  inputs=[model_dd, method_dd, hub_repo, prompt_vol_dd, dataset_dd,
4200
  custom_harmful_tb, custom_harmless_tb] + _adv_controls,
4201
+ outputs=[status_md, log_box, chat_status, session_model_dd, metrics_md],
4202
  ).then(
4203
  fn=lambda: (
4204
  gr.update(choices=_get_session_model_choices()),
obliteratus/.DS_Store CHANGED
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
 
obliteratus/analysis/visualization.py CHANGED
@@ -15,12 +15,35 @@ Visualizations:
15
 
16
  from __future__ import annotations
17
 
 
18
  from pathlib import Path
19
  from typing import Any
20
 
 
 
 
 
21
  import torch
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def plot_refusal_topology(
25
  refusal_directions: dict[int, torch.Tensor],
26
  harmful_means: dict[int, torch.Tensor],
@@ -35,11 +58,7 @@ def plot_refusal_topology(
35
  harmful-harmless mean difference projected onto the refusal direction),
36
  with strong layers highlighted.
37
  """
38
- import matplotlib
39
- if output_path:
40
- matplotlib.use("Agg")
41
- import matplotlib.pyplot as plt
42
-
43
  layers = sorted(refusal_directions.keys())
44
  strengths = []
45
  for idx in layers:
@@ -86,11 +105,7 @@ def plot_cross_layer_heatmap(
86
  title: str = "Cross-Layer Refusal Direction Alignment",
87
  ):
88
  """Visualize the pairwise cosine similarity matrix between layer refusal directions."""
89
- import matplotlib
90
- if output_path:
91
- matplotlib.use("Agg")
92
- import matplotlib.pyplot as plt
93
-
94
  matrix = cross_layer_result.cosine_matrix.numpy()
95
  indices = cross_layer_result.layer_indices
96
  n = len(indices)
@@ -132,11 +147,7 @@ def plot_angular_drift(
132
  title: str = "Refusal Direction Angular Drift Through Network",
133
  ):
134
  """Visualize cumulative angular drift of the refusal direction."""
135
- import matplotlib
136
- if output_path:
137
- matplotlib.use("Agg")
138
- import matplotlib.pyplot as plt
139
-
140
  indices = cross_layer_result.layer_indices
141
  drift = cross_layer_result.angular_drift
142
 
@@ -173,11 +184,6 @@ def plot_logit_lens_spectrum(
173
  title: str | None = None,
174
  ):
175
  """Visualize the logit lens token promotion/suppression spectrum."""
176
- import matplotlib
177
- if output_path:
178
- matplotlib.use("Agg")
179
- import matplotlib.pyplot as plt
180
-
181
  # Select which layer to display
182
  if layer_idx is not None:
183
  result = logit_lens_result.per_layer.get(layer_idx)
@@ -189,6 +195,7 @@ def plot_logit_lens_spectrum(
189
 
190
  if title is None:
191
  title = f"Logit Lens — Layer {result.layer_idx}"
 
192
 
193
  # Combine top promoted and suppressed
194
  promoted = result.top_promoted[:15]
@@ -233,10 +240,7 @@ def plot_defense_radar(
233
  title: str = "Defense Robustness Profile",
234
  ):
235
  """Spider/radar chart of defense properties."""
236
- import matplotlib
237
- if output_path:
238
- matplotlib.use("Agg")
239
- import matplotlib.pyplot as plt
240
  import numpy as np
241
 
242
  categories = [
@@ -271,7 +275,7 @@ def plot_defense_radar(
271
  ax.set_ylim(0, 1)
272
  ax.set_yticks([0.25, 0.5, 0.75, 1.0])
273
  ax.set_yticklabels(["0.25", "0.50", "0.75", "1.00"], fontsize=8)
274
- ax.set_title(f"{title}\n{p.model_name}", fontsize=14, fontweight="bold", pad=20)
275
 
276
  # Robustness badge
277
  robustness_colors = {
@@ -304,11 +308,7 @@ def plot_capability_safety_pareto(
304
  title: str = "Capability-Safety Pareto Frontier",
305
  ):
306
  """Plot the capability vs safety tradeoff."""
307
- import matplotlib
308
- if output_path:
309
- matplotlib.use("Agg")
310
- import matplotlib.pyplot as plt
311
-
312
  # Current point
313
  scores = [r.score for r in benchmark_results.values()]
314
  capability = sum(scores) / max(len(scores), 1)
@@ -363,11 +363,7 @@ def plot_probe_dashboard(
363
  title: str = "Activation Probe Dashboard",
364
  ):
365
  """Dashboard showing per-layer refusal elimination status."""
366
- import matplotlib
367
- if output_path:
368
- matplotlib.use("Agg")
369
- import matplotlib.pyplot as plt
370
-
371
  layers = sorted(probe_result.per_layer.keys())
372
  gaps = [probe_result.per_layer[idx].projection_gap for idx in layers]
373
  d_primes = [probe_result.per_layer[idx].separation_d_prime for idx in layers]
 
15
 
16
  from __future__ import annotations
17
 
18
+ import re
19
  from pathlib import Path
20
  from typing import Any
21
 
22
+ import matplotlib
23
+ matplotlib.use("Agg") # Set once at import time; safe for server & notebook
24
+ import matplotlib.pyplot as plt
25
+
26
  import torch
27
 
28
 
29
+ def _sanitize_label(text: str, max_len: int = 80) -> str:
30
+ """Strip filesystem paths, tokens, and overly-long strings from labels.
31
+
32
+ Prevents accidental leakage of sensitive information (HF org names,
33
+ local paths, API tokens) into saved chart images.
34
+ """
35
+ # Remove anything that looks like an absolute path
36
+ text = re.sub(r"(/[a-zA-Z0-9_./-]{3,})", lambda m: m.group(0).rsplit("/", 1)[-1], text)
37
+ # Remove HF-style token prefixes (hf_...)
38
+ text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
39
+ # Remove generic secret-like hex strings (32+ chars)
40
+ text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
41
+ # Truncate
42
+ if len(text) > max_len:
43
+ text = text[: max_len - 3] + "..."
44
+ return text
45
+
46
+
47
  def plot_refusal_topology(
48
  refusal_directions: dict[int, torch.Tensor],
49
  harmful_means: dict[int, torch.Tensor],
 
58
  harmful-harmless mean difference projected onto the refusal direction),
59
  with strong layers highlighted.
60
  """
61
+ title = _sanitize_label(title)
 
 
 
 
62
  layers = sorted(refusal_directions.keys())
63
  strengths = []
64
  for idx in layers:
 
105
  title: str = "Cross-Layer Refusal Direction Alignment",
106
  ):
107
  """Visualize the pairwise cosine similarity matrix between layer refusal directions."""
108
+ title = _sanitize_label(title)
 
 
 
 
109
  matrix = cross_layer_result.cosine_matrix.numpy()
110
  indices = cross_layer_result.layer_indices
111
  n = len(indices)
 
147
  title: str = "Refusal Direction Angular Drift Through Network",
148
  ):
149
  """Visualize cumulative angular drift of the refusal direction."""
150
+ title = _sanitize_label(title)
 
 
 
 
151
  indices = cross_layer_result.layer_indices
152
  drift = cross_layer_result.angular_drift
153
 
 
184
  title: str | None = None,
185
  ):
186
  """Visualize the logit lens token promotion/suppression spectrum."""
 
 
 
 
 
187
  # Select which layer to display
188
  if layer_idx is not None:
189
  result = logit_lens_result.per_layer.get(layer_idx)
 
195
 
196
  if title is None:
197
  title = f"Logit Lens — Layer {result.layer_idx}"
198
+ title = _sanitize_label(title)
199
 
200
  # Combine top promoted and suppressed
201
  promoted = result.top_promoted[:15]
 
240
  title: str = "Defense Robustness Profile",
241
  ):
242
  """Spider/radar chart of defense properties."""
243
+ title = _sanitize_label(title)
 
 
 
244
  import numpy as np
245
 
246
  categories = [
 
275
  ax.set_ylim(0, 1)
276
  ax.set_yticks([0.25, 0.5, 0.75, 1.0])
277
  ax.set_yticklabels(["0.25", "0.50", "0.75", "1.00"], fontsize=8)
278
+ ax.set_title(f"{title}\n{_sanitize_label(p.model_name)}", fontsize=14, fontweight="bold", pad=20)
279
 
280
  # Robustness badge
281
  robustness_colors = {
 
308
  title: str = "Capability-Safety Pareto Frontier",
309
  ):
310
  """Plot the capability vs safety tradeoff."""
311
+ title = _sanitize_label(title)
 
 
 
 
312
  # Current point
313
  scores = [r.score for r in benchmark_results.values()]
314
  capability = sum(scores) / max(len(scores), 1)
 
363
  title: str = "Activation Probe Dashboard",
364
  ):
365
  """Dashboard showing per-layer refusal elimination status."""
366
+ title = _sanitize_label(title)
 
 
 
 
367
  layers = sorted(probe_result.per_layer.keys())
368
  gaps = [probe_result.per_layer[idx].projection_gap for idx in layers]
369
  d_primes = [probe_result.per_layer[idx].separation_d_prime for idx in layers]
obliteratus/evaluation/benchmark_plots.py CHANGED
@@ -18,6 +18,8 @@ Gradio's gr.Plot component.
18
  from __future__ import annotations
19
 
20
 
 
 
21
  import matplotlib
22
  matplotlib.use("Agg") # Non-interactive backend for server use
23
  import matplotlib.pyplot as plt
@@ -25,6 +27,16 @@ import matplotlib.ticker as mticker
25
  import numpy as np
26
 
27
 
 
 
 
 
 
 
 
 
 
 
28
  # ── Color palette (research-paper friendly, colorblind-safe) ──────────────
29
 
30
  PALETTE = {
@@ -79,7 +91,7 @@ def plot_pareto_frontier(results: list[dict], title_suffix: str = "") -> plt.Fig
79
 
80
  xs = [r["refusal_rate"] * 100 for r in valid] # percentage
81
  ys = [r["perplexity"] for r in valid]
82
- labels = [r.get("method", r.get("model_short", "?")) for r in valid]
83
  colors = [_get_color(r.get("method", ""), i) for i, r in enumerate(valid)]
84
 
85
  # Scatter
@@ -161,7 +173,7 @@ def plot_method_radar(results: list[dict], title_suffix: str = "") -> plt.Figure
161
  angles += angles[:1] # close the polygon
162
 
163
  for i, r in enumerate(valid):
164
- label = r.get("method", r.get("model_short", f"run-{i}"))
165
  color = _get_color(r.get("method", ""), i)
166
 
167
  refusal_removal = 1.0 - (r.get("refusal_rate") or 1.0)
@@ -202,7 +214,7 @@ def plot_metric_bars(results: list[dict], title_suffix: str = "") -> plt.Figure:
202
  fig.tight_layout()
203
  return fig
204
 
205
- labels = [r.get("method", r.get("model_short", "?")) for r in valid]
206
  n = len(valid)
207
 
208
  # Metrics
@@ -257,7 +269,7 @@ def plot_timing_efficiency(results: list[dict], title_suffix: str = "") -> plt.F
257
  fig, ax = plt.subplots(figsize=(8, 5.5))
258
 
259
  for i, r in enumerate(valid):
260
- label = r.get("method", r.get("model_short", "?"))
261
  color = _get_color(r.get("method", ""), i)
262
  x = r["time_s"]
263
  # Composite quality: weighted sum of refusal removal and coherence
@@ -295,7 +307,7 @@ def plot_moe_metrics(results: list[dict], title_suffix: str = "") -> plt.Figure:
295
  if not valid:
296
  # Show all results with a note about no MoE activity
297
  fig, ax = plt.subplots(figsize=(8, 4))
298
- all_labels = [r.get("method", r.get("model_short", "?")) for r in results]
299
  if results:
300
  ax.barh(all_labels, [0] * len(results), color="#cccccc")
301
  ax.text(0.5, 0.5, "No MoE-specific features activated\n(use surgical/optimized/nuclear methods)",
@@ -306,7 +318,7 @@ def plot_moe_metrics(results: list[dict], title_suffix: str = "") -> plt.Figure:
306
  fig.tight_layout()
307
  return fig
308
 
309
- labels = [r.get("method", r.get("model_short", "?")) for r in valid]
310
  ega = [r.get("ega_expert_dirs", 0) for r in valid]
311
  cot = [r.get("cot_preserved", 0) for r in valid]
312
  safety = [r.get("expert_classified_layers", r.get("ega_safety_layers", 0)) for r in valid]
@@ -349,7 +361,7 @@ def plot_model_scaling(results: list[dict], title_suffix: str = "") -> plt.Figur
349
  fig.tight_layout()
350
  return fig
351
 
352
- labels = [r.get("model_short", r.get("model", "?").split("/")[-1]) for r in valid]
353
  n = len(valid)
354
  x = np.arange(n)
355
 
 
18
  from __future__ import annotations
19
 
20
 
21
+ import re
22
+
23
  import matplotlib
24
  matplotlib.use("Agg") # Non-interactive backend for server use
25
  import matplotlib.pyplot as plt
 
27
  import numpy as np
28
 
29
 
30
+ def _sanitize_label(text: str, max_len: int = 80) -> str:
31
+ """Strip filesystem paths, tokens, and overly-long strings from labels."""
32
+ text = re.sub(r"(/[a-zA-Z0-9_./-]{3,})", lambda m: m.group(0).rsplit("/", 1)[-1], text)
33
+ text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
34
+ text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
35
+ if len(text) > max_len:
36
+ text = text[: max_len - 3] + "..."
37
+ return text
38
+
39
+
40
  # ── Color palette (research-paper friendly, colorblind-safe) ──────────────
41
 
42
  PALETTE = {
 
91
 
92
  xs = [r["refusal_rate"] * 100 for r in valid] # percentage
93
  ys = [r["perplexity"] for r in valid]
94
+ labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
95
  colors = [_get_color(r.get("method", ""), i) for i, r in enumerate(valid)]
96
 
97
  # Scatter
 
173
  angles += angles[:1] # close the polygon
174
 
175
  for i, r in enumerate(valid):
176
+ label = _sanitize_label(r.get("method", r.get("model_short", f"run-{i}")))
177
  color = _get_color(r.get("method", ""), i)
178
 
179
  refusal_removal = 1.0 - (r.get("refusal_rate") or 1.0)
 
214
  fig.tight_layout()
215
  return fig
216
 
217
+ labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
218
  n = len(valid)
219
 
220
  # Metrics
 
269
  fig, ax = plt.subplots(figsize=(8, 5.5))
270
 
271
  for i, r in enumerate(valid):
272
+ label = _sanitize_label(r.get("method", r.get("model_short", "?")))
273
  color = _get_color(r.get("method", ""), i)
274
  x = r["time_s"]
275
  # Composite quality: weighted sum of refusal removal and coherence
 
307
  if not valid:
308
  # Show all results with a note about no MoE activity
309
  fig, ax = plt.subplots(figsize=(8, 4))
310
+ all_labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in results]
311
  if results:
312
  ax.barh(all_labels, [0] * len(results), color="#cccccc")
313
  ax.text(0.5, 0.5, "No MoE-specific features activated\n(use surgical/optimized/nuclear methods)",
 
318
  fig.tight_layout()
319
  return fig
320
 
321
+ labels = [_sanitize_label(r.get("method", r.get("model_short", "?"))) for r in valid]
322
  ega = [r.get("ega_expert_dirs", 0) for r in valid]
323
  cot = [r.get("cot_preserved", 0) for r in valid]
324
  safety = [r.get("expert_classified_layers", r.get("ega_safety_layers", 0)) for r in valid]
 
361
  fig.tight_layout()
362
  return fig
363
 
364
+ labels = [_sanitize_label(r.get("model_short", r.get("model", "?").split("/")[-1])) for r in valid]
365
  n = len(valid)
366
  x = np.arange(n)
367
 
obliteratus/reporting/report.py CHANGED
@@ -3,6 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import json
 
6
  from dataclasses import dataclass, field
7
  from pathlib import Path
8
  from typing import Any
@@ -10,6 +11,16 @@ from typing import Any
10
  import pandas as pd
11
 
12
 
 
 
 
 
 
 
 
 
 
 
13
  @dataclass
14
  class AblationResult:
15
  """Result of a single ablation experiment."""
@@ -69,7 +80,7 @@ class AblationReport:
69
  console.print("[yellow]No ablation results to display.[/yellow]")
70
  return
71
 
72
- table = Table(title=f"Ablation Results: {self.model_name}")
73
  table.add_column("Strategy", style="cyan")
74
  table.add_column("Component", style="green")
75
 
@@ -154,7 +165,7 @@ class AblationReport:
154
 
155
  ax.set_xlabel(f"Change in {metric} (vs baseline)")
156
  ax.set_ylabel("Ablated Component")
157
- ax.set_title(f"Ablation Impact on {metric} — {self.model_name}")
158
  ax.axvline(x=0, color="black", linewidth=0.8)
159
 
160
  plt.tight_layout()
@@ -183,7 +194,7 @@ class AblationReport:
183
 
184
  fig, ax = plt.subplots(figsize=(max(6, len(pivot.columns) * 2), max(4, len(pivot) * 0.4)))
185
  sns.heatmap(pivot, annot=True, fmt=".1f", cmap="RdYlGn_r", center=0, ax=ax)
186
- ax.set_title(f"Ablation % Change — {self.model_name}")
187
 
188
  plt.tight_layout()
189
  if output_path:
 
3
  from __future__ import annotations
4
 
5
  import json
6
+ import re
7
  from dataclasses import dataclass, field
8
  from pathlib import Path
9
  from typing import Any
 
11
  import pandas as pd
12
 
13
 
14
+ def _sanitize_label(text: str, max_len: int = 80) -> str:
15
+ """Strip filesystem paths, tokens, and overly-long strings from labels."""
16
+ text = re.sub(r"(/[a-zA-Z0-9_./-]{3,})", lambda m: m.group(0).rsplit("/", 1)[-1], text)
17
+ text = re.sub(r"\bhf_[A-Za-z0-9]{6,}\b", "<TOKEN>", text)
18
+ text = re.sub(r"\b[0-9a-fA-F]{32,}\b", "<REDACTED>", text)
19
+ if len(text) > max_len:
20
+ text = text[: max_len - 3] + "..."
21
+ return text
22
+
23
+
24
  @dataclass
25
  class AblationResult:
26
  """Result of a single ablation experiment."""
 
80
  console.print("[yellow]No ablation results to display.[/yellow]")
81
  return
82
 
83
+ table = Table(title=f"Ablation Results: {_sanitize_label(self.model_name)}")
84
  table.add_column("Strategy", style="cyan")
85
  table.add_column("Component", style="green")
86
 
 
165
 
166
  ax.set_xlabel(f"Change in {metric} (vs baseline)")
167
  ax.set_ylabel("Ablated Component")
168
+ ax.set_title(f"Ablation Impact on {metric} — {_sanitize_label(self.model_name)}")
169
  ax.axvline(x=0, color="black", linewidth=0.8)
170
 
171
  plt.tight_layout()
 
194
 
195
  fig, ax = plt.subplots(figsize=(max(6, len(pivot.columns) * 2), max(4, len(pivot) * 0.4)))
196
  sns.heatmap(pivot, annot=True, fmt=".1f", cmap="RdYlGn_r", center=0, ax=ax)
197
+ ax.set_title(f"Ablation % Change — {_sanitize_label(self.model_name)}")
198
 
199
  plt.tight_layout()
200
  if output_path:
tests/test_visualization.py CHANGED
@@ -11,6 +11,7 @@ import torch
11
  from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
12
  from obliteratus.analysis.activation_probing import ActivationProbe
13
  from obliteratus.analysis.visualization import (
 
14
  plot_refusal_topology,
15
  plot_cross_layer_heatmap,
16
  plot_angular_drift,
@@ -117,3 +118,50 @@ class TestDefenseRadar:
117
  path = tmp_dir / "radar.png"
118
  plot_defense_radar(profile, output_path=path)
119
  assert path.exists()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer
12
  from obliteratus.analysis.activation_probing import ActivationProbe
13
  from obliteratus.analysis.visualization import (
14
+ _sanitize_label,
15
  plot_refusal_topology,
16
  plot_cross_layer_heatmap,
17
  plot_angular_drift,
 
118
  path = tmp_dir / "radar.png"
119
  plot_defense_radar(profile, output_path=path)
120
  assert path.exists()
121
+
122
def test_model_name_sanitized_in_title(self, tmp_dir):
    """A cache path used as ``model_name`` must not show up in the radar title."""
    profile_fields = dict(
        model_name="/home/user/.cache/huggingface/hub/models--secret-org/private-model",
        alignment_type_estimate="RLHF-like",
        refusal_concentration=0.4,
        refusal_layer_spread=5,
        mean_refusal_strength=2.0,
        max_refusal_strength=4.0,
        self_repair_estimate=0.6,
        entanglement_score=0.3,
        estimated_robustness="medium",
    )
    leaky_profile = DefenseProfile(**profile_fields)
    target = tmp_dir / "radar_sanitized.png"
    figure = plot_defense_radar(leaky_profile, output_path=target)
    # Inspect the rendered title of the first axes for leaked path fragments.
    rendered_title = figure.axes[0].get_title()
    for fragment in ("/home/user", ".cache"):
        assert fragment not in rendered_title
141
+
142
+
143
class TestSanitizeLabel:
    """Unit checks for the ``_sanitize_label`` helper."""

    def test_strips_absolute_paths(self):
        """An absolute path keeps its final component but not its prefix."""
        cleaned = _sanitize_label("/home/user/.cache/huggingface/models--org/model")
        assert "/home/user" not in cleaned
        assert "model" in cleaned

    def test_redacts_hf_tokens(self):
        """An ``hf_`` token is swapped for the ``<TOKEN>`` placeholder."""
        cleaned = _sanitize_label("model with hf_abcdefghij token")
        assert "hf_abcdefghij" not in cleaned
        assert "<TOKEN>" in cleaned

    def test_redacts_long_hex_strings(self):
        """A 40-character hex run is swapped for ``<REDACTED>``."""
        digest = "a" * 40
        cleaned = _sanitize_label(f"commit {digest}")
        assert digest not in cleaned
        assert "<REDACTED>" in cleaned

    def test_truncates_long_strings(self):
        """Over-long input is cut to the default limit and ends with an ellipsis."""
        oversized = "x" * 200
        cleaned = _sanitize_label(oversized)
        assert len(cleaned) <= 80
        assert cleaned.endswith("...")

    def test_passes_normal_strings_through(self):
        """An ordinary title is returned unchanged."""
        plain = "Refusal Topology Map"
        assert _sanitize_label(plain) == plain