G. Claude Opus 4.7 committed on
Commit
dea9e25
·
1 Parent(s): 1569836

Apply Aleph Beth design system to GuardLLM UI

Browse files

- Parchment surface + ink typography, Instrument Serif display, Geist body/mono.
- Restrained 13-category palette drawn from brand families (safe, threat, gilt, signal, ink) — no neon.
- Replace emoji (verdict, header) with geometric primitives and editorial labels.
- Plotly chart re-skinned: parchment paper, ink axes, soft grid, branded hover labels.
- Bilingual mark (א-ב · أب) in header and footer.
- Cards, buttons, inputs, filters all themed via gr.themes.Base override + custom CSS.
- Pass HF_TOKEN to from_pretrained so the gated model loads when the Space secret is set.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +761 -222
app.py CHANGED
@@ -1,19 +1,18 @@
1
  """
2
- GuardLLM - Interactive Prompt Security Visualizer
3
- Combines t-SNE embedding visualization with real-time prompt risk analysis.
4
  Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset.
5
  """
6
 
7
  import logging
 
8
  import sys
9
  import json
10
- import traceback
11
 
12
  import gradio as gr
13
  import torch
14
  import numpy as np
15
  import plotly.graph_objects as go
16
- import plotly.io as pio
17
  from pathlib import Path
18
 
19
  # ---------------------------------------------------------------------------
@@ -27,22 +26,61 @@ logging.basicConfig(
27
  logger = logging.getLogger("GuardLLM")
28
 
29
  # ---------------------------------------------------------------------------
30
- # Color palette for categories
31
  # ---------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  CATEGORY_COLORS = {
33
- "benign": "#22c55e",
34
- "direct_injection": "#ef4444",
35
- "jailbreak": "#f97316",
36
- "system_extraction": "#a855f7",
37
- "encoding_obfuscation": "#ec4899",
38
- "persona_replacement": "#f59e0b",
39
- "indirect_injection": "#e11d48",
40
- "token_smuggling": "#7c3aed",
41
- "many_shot": "#06b6d4",
42
- "crescendo": "#14b8a6",
43
- "context_overflow": "#8b5cf6",
44
- "prompt_leaking": "#d946ef",
45
- "unknown": "#64748b",
46
  }
47
 
48
  CATEGORY_LABELS = {
@@ -66,6 +104,7 @@ CATEGORY_LABELS = {
66
  # ---------------------------------------------------------------------------
67
  MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"
68
  LABELS = ["Benign", "Malicious"]
 
69
  _classifier = {"tokenizer": None, "model": None, "device": None}
70
 
71
 
@@ -73,8 +112,9 @@ def get_classifier():
73
  if _classifier["model"] is None:
74
  logger.info("Lazy-loading Llama Prompt Guard 2...")
75
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
76
- tok = AutoTokenizer.from_pretrained(MODEL_ID)
77
- mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
 
78
  mdl.eval()
79
  dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80
  mdl.to(dev)
@@ -131,14 +171,13 @@ def analyze_prompt(text):
131
  with torch.no_grad():
132
  outputs = model(**inputs)
133
  probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
134
- pred_idx = int(np.argmax(probs))
135
  prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
136
  safety = float(probs[0])
137
  return prob_dict, safety
138
 
139
 
140
  # ---------------------------------------------------------------------------
141
- # Build the t-SNE Plotly figure
142
  # ---------------------------------------------------------------------------
143
  def build_tsne_figure(selected_categories=None):
144
  fig = go.Figure()
@@ -159,8 +198,8 @@ def build_tsne_figure(selected_categories=None):
159
  severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
160
  hover_texts = [
161
  f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
162
- f"Severity: {sev}<br>"
163
- f"Index: {idx}<br>"
164
  f"<i>{txt}</i>"
165
  for idx, txt, sev in zip(indices, texts_preview, severities)
166
  ]
@@ -173,41 +212,55 @@ def build_tsne_figure(selected_categories=None):
173
  marker=dict(
174
  size=5 if len(indices) > 500 else 7,
175
  color=color,
176
- opacity=0.7,
177
- line=dict(width=0.5, color="rgba(255,255,255,0.2)"),
178
  ),
179
  text=hover_texts,
180
  hoverinfo="text",
181
  customdata=[str(i) for i in indices],
182
  ))
183
  fig.update_layout(
184
- template="plotly_dark",
185
- paper_bgcolor="#0f172a",
186
- plot_bgcolor="#1e293b",
 
187
  title=dict(
188
- text="t-SNE Embedding Space - Prompt Security Landscape",
189
- font=dict(size=16, color="#e2e8f0"),
 
190
  x=0.5,
 
191
  ),
192
  legend=dict(
193
- title=dict(text="Category", font=dict(color="#94a3b8")),
194
- bgcolor="rgba(15,23,42,0.9)",
195
- bordercolor="#334155",
196
  borderwidth=1,
197
- font=dict(color="#cbd5e1", size=10),
198
  itemsizing="constant",
199
  ),
200
  xaxis=dict(
201
- title="t-SNE 1", showgrid=True, gridcolor="#334155",
202
- zeroline=False, color="#94a3b8",
 
 
 
203
  ),
204
  yaxis=dict(
205
- title="t-SNE 2", showgrid=True, gridcolor="#334155",
206
- zeroline=False, color="#94a3b8",
 
 
 
207
  ),
208
- margin=dict(l=40, r=40, t=50, b=40),
209
- height=600,
210
  dragmode="pan",
 
 
 
 
 
211
  )
212
  return fig
213
 
@@ -228,30 +281,34 @@ def deselect_all_categories():
228
  return gr.update(value=[]), build_tsne_figure([])
229
 
230
 
 
 
 
 
 
 
 
 
 
231
  def on_dropdown_select(choice):
232
  if not choice:
233
- return empty_analysis_html(), "*Select a prompt.*", ""
234
  try:
235
  idx = int(choice.split(" | ")[0])
236
  text = ALL_TEXTS[idx]
237
  category = ALL_CATEGORIES[idx]
238
  severity = ALL_SEVERITIES[idx] or "N/A"
239
  ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
240
- prob_dict, safety = analyze_prompt(text)
241
  pred_label = max(prob_dict, key=prob_dict.get)
242
  confidence = prob_dict[pred_label]
243
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
244
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
245
- risk_text += (
246
- f"\n\n---\n**Dataset metadata:**\n"
247
- f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
248
- f"- Severity: **{severity}**\n"
249
- f"- Ground truth: **{ground_truth}**\n"
250
- )
251
  return result_html, risk_text, text
252
  except Exception as e:
253
  logger.error("Error: %s", e)
254
- return empty_analysis_html(), f"Error: {e}", ""
255
 
256
 
257
  def on_index_input(idx_str):
@@ -260,32 +317,27 @@ def on_index_input(idx_str):
260
  try:
261
  idx = int(idx_str.strip())
262
  if idx < 0 or idx >= len(ALL_TEXTS):
263
- return empty_analysis_html(), f"Invalid index: {idx}", ""
264
  text = ALL_TEXTS[idx]
265
  category = ALL_CATEGORIES[idx]
266
  severity = ALL_SEVERITIES[idx] or "N/A"
267
  ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
268
- prob_dict, safety = analyze_prompt(text)
269
  pred_label = max(prob_dict, key=prob_dict.get)
270
  confidence = prob_dict[pred_label]
271
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
272
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
273
- risk_text += (
274
- f"\n\n---\n**Dataset metadata:**\n"
275
- f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
276
- f"- Severity: **{severity}**\n"
277
- f"- Ground truth: **{ground_truth}**\n"
278
- )
279
  return result_html, risk_text, text
280
  except Exception as e:
281
  logger.error("Error: %s", e)
282
- return empty_analysis_html(), f"Error: {e}", ""
283
 
284
 
285
  def on_manual_analyze(text):
286
  if not text or not text.strip():
287
  return empty_analysis_html(), ""
288
- prob_dict, safety = analyze_prompt(text)
289
  pred_label = max(prob_dict, key=prob_dict.get)
290
  confidence = prob_dict[pred_label]
291
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
@@ -294,69 +346,79 @@ def on_manual_analyze(text):
294
 
295
 
296
  # ---------------------------------------------------------------------------
297
- # UI builders
298
  # ---------------------------------------------------------------------------
299
  def empty_analysis_html():
300
- return """
301
- <div style="text-align:center; padding:30px; color:#94a3b8;">
302
- <p style="font-size:1em;">Click a point on the chart,<br>
303
- select a prompt from the list,<br>
304
- or enter a custom prompt below.</p>
 
 
305
  </div>
306
  """
307
 
308
 
309
  def build_result_html(label, confidence, probs, text):
310
- color = "#22c55e" if label == "Benign" else "#ef4444"
311
- emoji = "\u2705" if label == "Benign" else "\u26a0\ufe0f"
 
312
  pct = confidence * 100
313
  safety_score = probs["Benign"] * 100
314
  safety_color = (
315
- "#22c55e" if safety_score >= 70
316
- else "#f59e0b" if safety_score >= 40
317
- else "#ef4444"
318
  )
 
319
  bars_html = ""
320
  for lbl in LABELS:
321
  p = probs[lbl] * 100
322
- c = "#22c55e" if lbl == "Benign" else "#ef4444"
323
  bars_html += f"""
324
- <div style="margin-bottom:8px;">
325
- <div style="display:flex; justify-content:space-between; margin-bottom:2px;">
326
- <span style="font-weight:600; color:#e2e8f0;">{lbl}</span>
327
- <span style="color:#cbd5e1; font-weight:600;">{p:.1f}%</span>
328
  </div>
329
- <div style="background:#1e293b; border-radius:8px; height:18px; overflow:hidden;">
330
- <div style="background:{c}; height:100%; width:{p}%; border-radius:8px;"></div>
331
  </div>
332
  </div>
333
  """
334
- preview = text[:150].replace("<", "&lt;").replace(">", "&gt;")
335
- if len(text) > 150:
336
- preview += "..."
 
 
337
  return f"""
338
- <div style="background:#0f172a; border-radius:12px; padding:18px; font-family:system-ui,sans-serif;">
339
- <div style="text-align:center; margin-bottom:14px;">
340
- <div style="font-size:2em;">{emoji}</div>
341
- <div style="font-size:1.2em; font-weight:700; color:{color};">{label}</div>
342
- <div style="color:#94a3b8; font-size:0.85em;">Confidence: {pct:.1f}%</div>
343
- </div>
344
- <div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
345
- <div style="display:flex; justify-content:space-between; margin-bottom:4px;">
346
- <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
347
- <span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
348
- </div>
349
- <div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
350
- <div style="background:linear-gradient(90deg, #ef4444, #f59e0b, #22c55e);
351
- height:100%; width:{safety_score}%; border-radius:8px;"></div>
352
  </div>
353
  </div>
354
- <div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
355
- {bars_html}
 
 
 
 
 
 
 
356
  </div>
357
- <div style="background:#1e293b; border-radius:10px; padding:12px;">
358
- <div style="color:#94a3b8; font-size:0.8em; margin-bottom:3px;">Analyzed prompt:</div>
359
- <div style="color:#cbd5e1; font-style:italic; word-break:break-word; font-size:0.85em;">"{preview}"</div>
 
 
 
 
360
  </div>
361
  </div>
362
  """
@@ -366,19 +428,22 @@ def build_risk_assessment(label, confidence, probs):
366
  safety_score = probs["Benign"] * 100
367
  malicious_score = probs["Malicious"] * 100
368
  if label == "Benign" and confidence > 0.85:
369
- level, desc = "Low", "This prompt appears **safe**. No injection or jailbreak patterns detected."
 
370
  elif label == "Benign":
371
- level, desc = "Moderate", "Likely benign, but moderate confidence. Potentially ambiguous wording."
 
372
  elif confidence > 0.85:
373
- level, desc = "Critical", "**Malicious prompt detected** with high confidence. Likely injection or jailbreak attempt."
 
374
  else:
375
- level, desc = "High", "**Malicious prompt detected.** Possible injection or jailbreak. Review recommended."
 
376
  return (
377
- f"### Risk Level: {level}\n\n{desc}\n\n"
378
- f"**Details:**\n"
379
- f"- Safety score: **{safety_score:.0f}/100**\n"
380
- f"- Predicted class: **{label}** ({confidence*100:.1f}%)\n"
381
- f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
382
  )
383
 
384
 
@@ -396,37 +461,37 @@ def build_stats_html():
396
  pct = count / total * 100
397
  label = CATEGORY_LABELS.get(cat, cat)
398
  cats_html += (
399
- f'<div style="display:flex; justify-content:space-between; padding:2px 0;">'
400
- f'<span style="color:{color}; font-weight:500; font-size:0.85em;">{label}</span>'
401
- f'<span style="color:#94a3b8; font-size:0.85em;">{count} ({pct:.1f}%)</span>'
 
402
  f'</div>'
403
  )
404
  return f"""
405
- <div style="background:#0f172a; border-radius:12px; padding:14px; font-family:system-ui,sans-serif;">
406
- <div style="color:#e2e8f0; font-weight:700; margin-bottom:8px;">Dataset Statistics</div>
407
- <div style="display:flex; gap:10px; margin-bottom:10px;">
408
- <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
409
- <div style="color:#94a3b8; font-size:0.75em;">Total</div>
410
- <div style="color:#e2e8f0; font-weight:700; font-size:1.2em;">{total:,}</div>
 
411
  </div>
412
- <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
413
- <div style="color:#22c55e; font-size:0.75em;">Benign</div>
414
- <div style="color:#22c55e; font-weight:700; font-size:1.2em;">{n_benign:,}</div>
415
  </div>
416
- <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
417
- <div style="color:#ef4444; font-size:0.75em;">Malicious</div>
418
- <div style="color:#ef4444; font-weight:700; font-size:1.2em;">{n_malicious:,}</div>
419
  </div>
420
  </div>
421
- <div style="background:#1e293b; border-radius:8px; padding:8px;">
422
- {cats_html}
423
- </div>
424
  </div>
425
  """
426
 
427
 
428
  # ---------------------------------------------------------------------------
429
- # JavaScript to bridge Plotly clicks -> Gradio
430
  # ---------------------------------------------------------------------------
431
  PLOTLY_CLICK_JS = """
432
  () => {
@@ -440,7 +505,8 @@ PLOTLY_CLICK_JS = """
440
  if (data && data.points && data.points.length > 0) {
441
  const idx = data.points[0].customdata;
442
  if (idx !== undefined && idx !== null) {
443
- const inputEl = document.querySelector('#click-index-input textarea') || document.querySelector('#click-index-input input');
 
444
  if (inputEl) {
445
  const proto = inputEl.tagName === 'TEXTAREA'
446
  ? window.HTMLTextAreaElement.prototype
@@ -473,60 +539,560 @@ PLOTLY_CLICK_JS = """
473
 
474
 
475
  # ---------------------------------------------------------------------------
476
- # Gradio Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  # ---------------------------------------------------------------------------
478
- TITLE_HTML = """
479
- <div style="text-align:center; padding:10px 0 4px 0;">
480
- <h1 style="font-size:1.8em; margin:0;">GuardLLM - Prompt Security Visualizer</h1>
481
- <p style="color:#94a3b8; font-size:0.95em; margin-top:4px;">
482
- Interactive t-SNE embedding space &bull;
483
- <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" style="color:#60a5fa;">
484
- Llama Prompt Guard 2</a> &bull;
485
- <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" style="color:#60a5fa;">
486
- neuralchemy dataset</a>
 
 
 
 
 
487
  </p>
488
- </div>
489
  """
490
 
491
  HOW_TO_HTML = """
492
- <div style="background:linear-gradient(135deg, #0f172a 0%, #1e293b 100%); border:1px solid #334155; border-radius:12px; padding:16px 20px; margin:0 0 8px 0; font-family:system-ui,sans-serif;">
493
- <div style="color:#e2e8f0; font-weight:700; font-size:1em; margin-bottom:8px;">How to use this tool</div>
494
- <div style="display:flex; flex-wrap:wrap; gap:12px;">
495
- <div style="flex:1; min-width:180px; background:#1e293b; border-radius:8px; padding:10px 12px;">
496
- <div style="color:#60a5fa; font-weight:600; font-size:0.85em; margin-bottom:4px;">1. Explore the map</div>
497
- <div style="color:#94a3b8; font-size:0.8em; line-height:1.4;">Each dot represents a prompt from the dataset, positioned by semantic similarity. Colors indicate attack categories. Hover to preview, scroll to zoom, drag to pan.</div>
 
 
498
  </div>
499
- <div style="flex:1; min-width:180px; background:#1e293b; border-radius:8px; padding:10px 12px;">
500
- <div style="color:#f59e0b; font-weight:600; font-size:0.85em; margin-bottom:4px;">2. Click to analyze</div>
501
- <div style="color:#94a3b8; font-size:0.8em; line-height:1.4;">Click any point to run it through <strong style="color:#cbd5e1;">Llama Prompt Guard 2</strong>. The right panel will show the risk classification, safety score, and confidence breakdown.</div>
 
 
 
 
 
502
  </div>
503
- <div style="flex:1; min-width:180px; background:#1e293b; border-radius:8px; padding:10px 12px;">
504
- <div style="color:#22c55e; font-weight:600; font-size:0.85em; margin-bottom:4px;">3. Test your own prompts</div>
505
- <div style="color:#94a3b8; font-size:0.8em; line-height:1.4;">Type or paste any prompt in the <strong style="color:#cbd5e1;">Custom prompt</strong> field and hit Analyze to check if it would be flagged as an injection attempt.</div>
 
 
 
 
 
506
  </div>
507
  </div>
508
  </div>
509
  """
510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  with gr.Blocks(
512
- title="GuardLLM - Prompt Security Visualizer",
 
 
513
  ) as demo:
514
 
515
- gr.HTML(TITLE_HTML)
516
  gr.HTML(HOW_TO_HTML)
517
 
518
- click_index = gr.Textbox(
519
- value="",
520
- visible=True,
521
- elem_id="click-index-input",
522
- )
523
 
524
  with gr.Row():
525
- # ---- Left: t-SNE chart + filters ----
526
  with gr.Column(scale=3):
527
  with gr.Row():
528
- select_all_btn = gr.Button("Select All", size="sm", scale=1)
529
- deselect_all_btn = gr.Button("Deselect All", size="sm", scale=1)
530
 
531
  category_filter = gr.CheckboxGroup(
532
  choices=UNIQUE_CATEGORIES,
@@ -536,96 +1102,69 @@ with gr.Blocks(
536
  )
537
  tsne_plot = gr.Plot(
538
  value=build_tsne_figure(),
539
- label="t-SNE Space",
540
  elem_id="tsne-chart",
541
  )
542
  gr.Markdown(
543
- "*Click a point to analyze it. "
544
- "Hover to preview text. Use scroll wheel to zoom.*"
545
  )
546
 
547
- # ---- Right: Analysis first, then stats (swapped) ----
548
  with gr.Column(scale=2):
549
- gr.Markdown("### Analysis Result")
 
550
  result_html = gr.HTML(value=empty_analysis_html())
551
  risk_md = gr.Markdown(value="")
552
- full_prompt = gr.Textbox(label="Full prompt", lines=3, interactive=False, visible=True)
 
 
 
 
 
553
 
554
  gr.Markdown("---")
555
 
556
- gr.Markdown("### Select a prompt")
 
557
  prompt_dropdown = gr.Dropdown(
558
  choices=DROPDOWN_CHOICES,
559
- label="Search dataset",
560
  filterable=True,
561
  interactive=True,
562
  )
563
 
564
- gr.Markdown("### Or analyze a custom prompt")
 
565
  manual_input = gr.Textbox(
566
- label="Custom prompt",
567
- placeholder="Type or paste a prompt...",
568
  lines=2,
569
  )
570
- analyze_btn = gr.Button("Analyze", variant="primary")
571
 
572
  gr.Markdown("---")
573
 
574
  gr.HTML(build_stats_html())
575
 
576
  # ---- Events ----
577
- category_filter.change(
578
- fn=on_filter_change,
579
- inputs=[category_filter],
580
- outputs=[tsne_plot],
581
- )
582
- select_all_btn.click(
583
- fn=select_all_categories,
584
- inputs=[],
585
- outputs=[category_filter, tsne_plot],
586
- )
587
- deselect_all_btn.click(
588
- fn=deselect_all_categories,
589
- inputs=[],
590
- outputs=[category_filter, tsne_plot],
591
- )
592
- click_index.change(
593
- fn=on_index_input,
594
- inputs=[click_index],
595
- outputs=[result_html, risk_md, full_prompt],
596
- )
597
- prompt_dropdown.change(
598
- fn=on_dropdown_select,
599
- inputs=[prompt_dropdown],
600
- outputs=[result_html, risk_md, full_prompt],
601
- )
602
- analyze_btn.click(
603
- fn=on_manual_analyze,
604
- inputs=[manual_input],
605
- outputs=[result_html, risk_md],
606
- )
607
- manual_input.submit(
608
- fn=on_manual_analyze,
609
- inputs=[manual_input],
610
- outputs=[result_html, risk_md],
611
- )
612
  demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)
613
 
614
- gr.Markdown(
615
- """
616
- ---
617
- <div style="text-align:center; color:#64748b; font-size:0.8em;">
618
- <strong>GuardLLM</strong> - Prompt Security Visualizer<br>
619
- Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
620
- Llama Prompt Guard 2 (86M)</a> by Meta |
621
- Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">
622
- neuralchemy/Prompt-injection-dataset</a>
623
- </div>
624
- """
625
- )
626
 
627
 
628
  logger.info("Gradio app built. Ready to launch.")
629
 
630
  if __name__ == "__main__":
631
- demo.launch(css="#click-index-input { position:absolute !important; width:1px !important; height:1px !important; overflow:hidden !important; opacity:0 !important; pointer-events:none !important; }")
 
1
  """
2
+ GuardLLM Prompt Security Visualizer
3
+ Aleph Beth design system applied. Editorial calm, bilingual FR/EN posture.
4
  Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset.
5
  """
6
 
7
  import logging
8
+ import os
9
  import sys
10
  import json
 
11
 
12
  import gradio as gr
13
  import torch
14
  import numpy as np
15
  import plotly.graph_objects as go
 
16
  from pathlib import Path
17
 
18
  # ---------------------------------------------------------------------------
 
26
  logger = logging.getLogger("GuardLLM")
27
 
28
  # ---------------------------------------------------------------------------
29
+ # Aleph Beth — palette tokens (mirrored from colors_and_type.css)
30
  # ---------------------------------------------------------------------------
31
+ AB = {
32
+ "ink_950": "#0B1626",
33
+ "ink_900": "#11203A",
34
+ "ink_800": "#1B2F4E",
35
+ "ink_700": "#2A4566",
36
+ "ink_600": "#44607F",
37
+ "ink_500": "#6B829D",
38
+ "ink_400": "#95A6BB",
39
+ "ink_300": "#BCC8D6",
40
+ "ink_200": "#DAE1EA",
41
+ "ink_100": "#ECF0F5",
42
+ "ink_50": "#F6F8FB",
43
+ "parchment_50": "#FCFAF2",
44
+ "parchment_100": "#F8F3E6",
45
+ "parchment_200": "#ECE5D2",
46
+ "parchment_300": "#DDD3B9",
47
+ "parchment_400": "#C2B695",
48
+ "gilt_50": "#FCEEDA",
49
+ "gilt_100": "#F8D9A4",
50
+ "gilt_200": "#F2BD72",
51
+ "gilt_300": "#EAA046",
52
+ "gilt_400": "#DC8B2A",
53
+ "gilt_500": "#A66718",
54
+ "gilt_600": "#7A4912",
55
+ "signal_100": "#C9DDEB",
56
+ "signal_200": "#9BBFD9",
57
+ "signal_300": "#6FA0C2",
58
+ "signal_400": "#4A82AA",
59
+ "signal_500": "#36678C",
60
+ "signal_600": "#244D6B",
61
+ "threat_400": "#D44A3E",
62
+ "threat_300": "#E07065",
63
+ "threat_100": "#F8DAD5",
64
+ "safe_400": "#3F8F6E",
65
+ "safe_300": "#66AB8C",
66
+ "safe_100": "#D4E8DD",
67
+ }
68
+
69
+ # Category colors stay within the brand families — no neon, no inventions.
70
  CATEGORY_COLORS = {
71
+ "benign": AB["safe_400"],
72
+ "direct_injection": AB["threat_400"],
73
+ "jailbreak": AB["gilt_400"],
74
+ "system_extraction": AB["gilt_600"],
75
+ "encoding_obfuscation": AB["signal_500"],
76
+ "persona_replacement": AB["gilt_300"],
77
+ "indirect_injection": AB["threat_300"],
78
+ "token_smuggling": AB["signal_600"],
79
+ "many_shot": AB["signal_400"],
80
+ "crescendo": AB["signal_200"],
81
+ "context_overflow": AB["ink_600"],
82
+ "prompt_leaking": AB["gilt_500"],
83
+ "unknown": AB["ink_400"],
84
  }
85
 
86
  CATEGORY_LABELS = {
 
104
  # ---------------------------------------------------------------------------
105
  MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"
106
  LABELS = ["Benign", "Malicious"]
107
+ HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
108
  _classifier = {"tokenizer": None, "model": None, "device": None}
109
 
110
 
 
112
  if _classifier["model"] is None:
113
  logger.info("Lazy-loading Llama Prompt Guard 2...")
114
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
115
+ kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
116
+ tok = AutoTokenizer.from_pretrained(MODEL_ID, **kwargs)
117
+ mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, **kwargs)
118
  mdl.eval()
119
  dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
120
  mdl.to(dev)
 
171
  with torch.no_grad():
172
  outputs = model(**inputs)
173
  probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
 
174
  prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
175
  safety = float(probs[0])
176
  return prob_dict, safety
177
 
178
 
179
  # ---------------------------------------------------------------------------
180
+ # Plotly figure parchment surface, ink axes, restrained palette
181
  # ---------------------------------------------------------------------------
182
  def build_tsne_figure(selected_categories=None):
183
  fig = go.Figure()
 
198
  severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
199
  hover_texts = [
200
  f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
201
+ f"Severity {sev}<br>"
202
+ f"Index {idx}<br>"
203
  f"<i>{txt}</i>"
204
  for idx, txt, sev in zip(indices, texts_preview, severities)
205
  ]
 
212
  marker=dict(
213
  size=5 if len(indices) > 500 else 7,
214
  color=color,
215
+ opacity=0.78,
216
+ line=dict(width=0.5, color="rgba(17,32,58,0.20)"),
217
  ),
218
  text=hover_texts,
219
  hoverinfo="text",
220
  customdata=[str(i) for i in indices],
221
  ))
222
  fig.update_layout(
223
+ template="plotly_white",
224
+ paper_bgcolor=AB["parchment_100"],
225
+ plot_bgcolor=AB["parchment_50"],
226
+ font=dict(family="Geist, Inter, system-ui, sans-serif", color=AB["ink_700"]),
227
  title=dict(
228
+ text="<span style='font-family: Instrument Serif, serif; font-size:18px;'>"
229
+ "t-SNE — Prompt Security Landscape</span>",
230
+ font=dict(color=AB["ink_900"]),
231
  x=0.5,
232
+ xanchor="center",
233
  ),
234
  legend=dict(
235
+ title=dict(text="Category", font=dict(color=AB["ink_700"], size=11)),
236
+ bgcolor="rgba(252,250,242,0.88)",
237
+ bordercolor="rgba(17,32,58,0.12)",
238
  borderwidth=1,
239
+ font=dict(color=AB["ink_800"], size=10),
240
  itemsizing="constant",
241
  ),
242
  xaxis=dict(
243
+ title=dict(text="t-SNE 1", font=dict(color=AB["ink_500"], size=11)),
244
+ showgrid=True,
245
+ gridcolor="rgba(17,32,58,0.06)",
246
+ zeroline=False,
247
+ color=AB["ink_500"],
248
  ),
249
  yaxis=dict(
250
+ title=dict(text="t-SNE 2", font=dict(color=AB["ink_500"], size=11)),
251
+ showgrid=True,
252
+ gridcolor="rgba(17,32,58,0.06)",
253
+ zeroline=False,
254
+ color=AB["ink_500"],
255
  ),
256
+ margin=dict(l=44, r=44, t=56, b=44),
257
+ height=620,
258
  dragmode="pan",
259
+ hoverlabel=dict(
260
+ bgcolor=AB["parchment_50"],
261
+ bordercolor="rgba(17,32,58,0.12)",
262
+ font=dict(family="Geist, sans-serif", color=AB["ink_900"], size=12),
263
+ ),
264
  )
265
  return fig
266
 
 
281
  return gr.update(value=[]), build_tsne_figure([])
282
 
283
 
284
def _dataset_meta_block(category, severity, ground_truth):
    """Build the Markdown snippet that lists a dataset row's metadata.

    Args:
        category: Category key; shown through ``CATEGORY_LABELS`` when a
            display label exists, otherwise shown as-is.
        severity: Severity value to display.
        ground_truth: Ground-truth label to display.

    Returns:
        A Markdown string: two leading newlines, an eyebrow heading, then one
        bullet per field, each terminated by a newline.
    """
    category_name = CATEGORY_LABELS.get(category, category)
    rows = [
        "\n\n<span class='ab-eyebrow'>Dataset metadata</span>",
        f"- Category — **{category_name}**",
        f"- Severity — **{severity}**",
        f"- Ground truth — **{ground_truth}**",
    ]
    # Every row, including the last one, ends with a newline.
    return "\n".join(rows) + "\n"
291
+
292
+
293
def on_dropdown_select(choice):
    """Analyze the dataset prompt picked in the dropdown.

    Args:
        choice: Dropdown entry whose text before the first ``" | "`` is the
            dataset index, or a falsy value when nothing is selected.

    Returns:
        A ``(result_html, risk_markdown, prompt_text)`` tuple. When nothing is
        selected, the index is out of range, or the analysis raises, the tuple
        holds the idle card, a short message and an empty prompt.
    """
    if not choice:
        return empty_analysis_html(), "*Select a prompt to begin.*", ""
    try:
        idx = int(choice.split(" | ")[0])
        # Reject out-of-range values explicitly: a negative index would
        # otherwise wrap around and silently analyze a different row.
        if idx < 0 or idx >= len(ALL_TEXTS):
            return empty_analysis_html(), f"Invalid index {idx}", ""
        text = ALL_TEXTS[idx]
        category = ALL_CATEGORIES[idx]
        severity = ALL_SEVERITIES[idx] or "N/A"
        ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
        prob_dict, _ = analyze_prompt(text)
        pred_label = max(prob_dict, key=prob_dict.get)
        confidence = prob_dict[pred_label]
        result_html = build_result_html(pred_label, confidence, prob_dict, text)
        risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
        risk_text += _dataset_meta_block(category, severity, ground_truth)
        return result_html, risk_text, text
    except Exception as e:
        # UI boundary: keep the app responsive, but log the full traceback so
        # the failure can be diagnosed.
        logger.exception("Error: %s", e)
        return empty_analysis_html(), f"Error {e}", ""
312
 
313
 
314
  def on_index_input(idx_str):
 
317
  try:
318
  idx = int(idx_str.strip())
319
  if idx < 0 or idx >= len(ALL_TEXTS):
320
+ return empty_analysis_html(), f"Invalid index {idx}", ""
321
  text = ALL_TEXTS[idx]
322
  category = ALL_CATEGORIES[idx]
323
  severity = ALL_SEVERITIES[idx] or "N/A"
324
  ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
325
+ prob_dict, _ = analyze_prompt(text)
326
  pred_label = max(prob_dict, key=prob_dict.get)
327
  confidence = prob_dict[pred_label]
328
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
329
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
330
+ risk_text += _dataset_meta_block(category, severity, ground_truth)
 
 
 
 
 
331
  return result_html, risk_text, text
332
  except Exception as e:
333
  logger.error("Error: %s", e)
334
+ return empty_analysis_html(), f"Error {e}", ""
335
 
336
 
337
  def on_manual_analyze(text):
338
  if not text or not text.strip():
339
  return empty_analysis_html(), ""
340
+ prob_dict, _ = analyze_prompt(text)
341
  pred_label = max(prob_dict, key=prob_dict.get)
342
  confidence = prob_dict[pred_label]
343
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
 
346
 
347
 
348
  # ---------------------------------------------------------------------------
349
+ # UI builders — editorial, parchment surface, ink type, no emoji
350
  # ---------------------------------------------------------------------------
351
def empty_analysis_html():
    """Return the idle placeholder card for the analysis panel.

    Returns:
        A static HTML string styled with the ``ab-card`` classes.
    """
    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    return """
    <div class="ab-card ab-card--quiet">
        <div class="ab-eyebrow">Idle</div>
        <p class="ab-prose">
            Click a point on the chart, pick a prompt from the list,
            or paste your own below. The classifier runs on demand.
        </p>
    </div>
    """
361
 
362
 
363
def build_result_html(label, confidence, probs, text):
    """Render the verdict card for one analyzed prompt.

    Args:
        label: Predicted class name; ``"Benign"`` selects the safe accent
            color, anything else the threat accent.
        confidence: Score of the predicted class; multiplied by 100 and shown
            as a percentage (presumably in the 0-1 range; confirm with caller).
        probs: Mapping with a ``"Benign"`` entry and one entry per name in
            ``LABELS``; values are scaled by 100 for display.
        text: The analyzed prompt. Only the first 180 characters are shown.

    Returns:
        An HTML string containing the verdict, the safety score, one
        probability bar per label and an escaped preview of the prompt.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    from html import escape

    is_safe = label == "Benign"
    accent = AB["safe_400"] if is_safe else AB["threat_400"]
    marker = "●"  # geometric primitive instead of emoji
    pct = confidence * 100
    safety_score = probs["Benign"] * 100
    safety_color = (
        AB["safe_400"] if safety_score >= 70
        else AB["gilt_400"] if safety_score >= 40
        else AB["threat_400"]
    )

    # Build one bar per label and join once instead of growing a string.
    bar_parts = []
    for lbl in LABELS:
        p = probs[lbl] * 100
        c = AB["safe_400"] if lbl == "Benign" else AB["threat_400"]
        bar_parts.append(f"""
        <div class="ab-bar">
          <div class="ab-bar__row">
            <span class="ab-bar__label">{lbl}</span>
            <span class="ab-bar__value">{p:.1f}%</span>
          </div>
          <div class="ab-bar__track">
            <div class="ab-bar__fill" style="width:{p}%; background:{c};"></div>
          </div>
        </div>
        """)
    bars_html = "".join(bar_parts)

    # The prompt is untrusted input: escape "&" and quotes as well as the
    # angle brackets so it is always rendered as literal text.
    preview = escape(text[:180])
    if len(text) > 180:
        preview += "…"

    return f"""
    <div class="ab-card">
      <div class="ab-result__head">
        <span class="ab-result__marker" style="color:{accent};">{marker}</span>
        <div>
          <div class="ab-eyebrow">Verdict</div>
          <div class="ab-result__label" style="color:{accent};">{label}</div>
          <div class="ab-caption">Confidence {pct:.1f}%</div>
        </div>
      </div>

      <div class="ab-divider"></div>

      <div class="ab-eyebrow">Safety score</div>
      <div class="ab-score">
        <div class="ab-score__value" style="color:{safety_color};">{safety_score:.0f}<span>/100</span></div>
        <div class="ab-score__track">
          <div class="ab-score__fill" style="width:{safety_score}%;"></div>
        </div>
      </div>

      <div class="ab-eyebrow" style="margin-top:18px;">Class probabilities</div>
      <div class="ab-bars">{bars_html}</div>

      <div class="ab-quote">
        <div class="ab-eyebrow">Analyzed prompt</div>
        <blockquote>“{preview}”</blockquote>
      </div>
    </div>
    """
 
428
  safety_score = probs["Benign"] * 100
429
  malicious_score = probs["Malicious"] * 100
430
  if label == "Benign" and confidence > 0.85:
431
+ level = "Low"
432
+ desc = "The request appears **safe**. No injection or jailbreak patterns were detected."
433
  elif label == "Benign":
434
+ level = "Moderate"
435
+ desc = "Likely benign, with moderate confidence. The wording may be ambiguous."
436
  elif confidence > 0.85:
437
+ level = "Critical"
438
+ desc = "**Malicious request detected** with high confidence. Likely injection or jailbreak."
439
  else:
440
+ level = "High"
441
+ desc = "**Malicious request detected.** Possible injection or jailbreak — review recommended."
442
  return (
443
+ f"<span class='ab-eyebrow'>Risk level {level}</span>\n\n{desc}\n\n"
444
+ f"- Safety score — **{safety_score:.0f}/100**\n"
445
+ f"- Predicted class **{label}** ({confidence*100:.1f}%)\n"
446
+ f"- P(Benign) {probs['Benign']*100:.1f}% &nbsp;·&nbsp; P(Malicious) — {malicious_score:.1f}%\n"
 
447
  )
448
 
449
 
 
461
  pct = count / total * 100
462
  label = CATEGORY_LABELS.get(cat, cat)
463
  cats_html += (
464
+ f'<div class="ab-stats__row">'
465
+ f'<span class="ab-stats__dot" style="background:{color};"></span>'
466
+ f'<span class="ab-stats__name">{label}</span>'
467
+ f'<span class="ab-stats__count">{count:,} <em>({pct:.1f}%)</em></span>'
468
  f'</div>'
469
  )
470
  return f"""
471
+ <div class="ab-card">
472
+ <div class="ab-eyebrow">Dataset</div>
473
+ <h3 class="ab-h3">Composition</h3>
474
+ <div class="ab-kpi-row">
475
+ <div class="ab-kpi">
476
+ <div class="ab-kpi__label">Total</div>
477
+ <div class="ab-kpi__value">{total:,}</div>
478
  </div>
479
+ <div class="ab-kpi">
480
+ <div class="ab-kpi__label" style="color:{AB['safe_400']};">Benign</div>
481
+ <div class="ab-kpi__value" style="color:{AB['safe_400']};">{n_benign:,}</div>
482
  </div>
483
+ <div class="ab-kpi">
484
+ <div class="ab-kpi__label" style="color:{AB['threat_400']};">Malicious</div>
485
+ <div class="ab-kpi__value" style="color:{AB['threat_400']};">{n_malicious:,}</div>
486
  </div>
487
  </div>
488
+ <div class="ab-stats">{cats_html}</div>
 
 
489
  </div>
490
  """
491
 
492
 
493
  # ---------------------------------------------------------------------------
494
+ # JavaScript bridge: Plotly clicks -> Gradio hidden input
495
  # ---------------------------------------------------------------------------
496
  PLOTLY_CLICK_JS = """
497
  () => {
 
505
  if (data && data.points && data.points.length > 0) {
506
  const idx = data.points[0].customdata;
507
  if (idx !== undefined && idx !== null) {
508
+ const inputEl = document.querySelector('#click-index-input textarea')
509
+ || document.querySelector('#click-index-input input');
510
  if (inputEl) {
511
  const proto = inputEl.tagName === 'TEXTAREA'
512
  ? window.HTMLTextAreaElement.prototype
 
539
 
540
 
541
  # ---------------------------------------------------------------------------
542
+ # Aleph Beth — global CSS
543
+ # ---------------------------------------------------------------------------
544
+ ALEPH_BETH_CSS = """
545
+ @import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500;600&family=Frank+Ruhl+Libre:wght@400;500&family=Amiri:wght@400;700&display=swap');
546
+
547
+ :root, .gradio-container {
548
+ --ab-ink-950:#0B1626; --ab-ink-900:#11203A; --ab-ink-800:#1B2F4E;
549
+ --ab-ink-700:#2A4566; --ab-ink-600:#44607F; --ab-ink-500:#6B829D;
550
+ --ab-ink-400:#95A6BB; --ab-ink-300:#BCC8D6; --ab-ink-200:#DAE1EA;
551
+ --ab-ink-100:#ECF0F5; --ab-ink-50:#F6F8FB;
552
+ --ab-parchment-50:#FCFAF2; --ab-parchment-100:#F8F3E6;
553
+ --ab-parchment-200:#ECE5D2; --ab-parchment-300:#DDD3B9;
554
+ --ab-gilt-300:#EAA046; --ab-gilt-400:#DC8B2A; --ab-gilt-500:#A66718; --ab-gilt-600:#7A4912;
555
+ --ab-signal-300:#6FA0C2; --ab-signal-400:#4A82AA; --ab-signal-500:#36678C;
556
+ --ab-threat-400:#D44A3E; --ab-safe-400:#3F8F6E;
557
+ --ab-border: rgba(17,32,58,0.12);
558
+ --ab-border-subtle: rgba(17,32,58,0.06);
559
+ --ab-shadow-sm: 0 2px 6px rgba(17,32,58,0.07), 0 1px 2px rgba(17,32,58,0.04);
560
+ --ab-shadow-md: 0 8px 20px rgba(17,32,58,0.08), 0 2px 4px rgba(17,32,58,0.05);
561
+ --ab-ease: cubic-bezier(0.16, 1, 0.3, 1);
562
+ --font-display: 'Instrument Serif', 'Cormorant Garamond', serif;
563
+ --font-body: 'Geist', 'Inter', system-ui, sans-serif;
564
+ --font-mono: 'Geist Mono', 'JetBrains Mono', ui-monospace, monospace;
565
+ }
566
+
567
+ /* ---------- Base canvas ---------- */
568
+ .gradio-container, body, html {
569
+ background: var(--ab-parchment-100) !important;
570
+ color: var(--ab-ink-900) !important;
571
+ font-family: var(--font-body) !important;
572
+ font-feature-settings: 'ss01', 'cv01';
573
+ }
574
+ .gradio-container { max-width: 1440px !important; margin: 0 auto !important; padding: 24px 32px !important; }
575
+
576
+ /* Remove Gradio gradient backgrounds */
577
+ .gradio-container *::before, .gradio-container *::after { background-image: none !important; }
578
+
579
+ /* ---------- Header / brand ---------- */
580
+ .ab-header {
581
+ padding: 18px 4px 22px;
582
+ border-bottom: 1px solid var(--ab-border);
583
+ margin-bottom: 24px;
584
+ display: flex; align-items: baseline; justify-content: space-between; gap: 24px;
585
+ flex-wrap: wrap;
586
+ }
587
+ .ab-header__brand {
588
+ display: flex; align-items: baseline; gap: 14px;
589
+ }
590
+ .ab-header__mark {
591
+ font-family: var(--font-display);
592
+ font-size: 32px; line-height: 1;
593
+ color: var(--ab-gilt-500);
594
+ letter-spacing: -0.01em;
595
+ }
596
+ .ab-header__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
597
+ .ab-header__mark .ar { font-family: 'Amiri', serif; }
598
+ .ab-header__title {
599
+ font-family: var(--font-display);
600
+ font-size: 38px; line-height: 1.05;
601
+ color: var(--ab-ink-900);
602
+ letter-spacing: -0.01em;
603
+ margin: 0;
604
+ }
605
+ .ab-header__title em { font-style: italic; color: var(--ab-gilt-600); }
606
+ .ab-header__sub {
607
+ font-family: var(--font-body);
608
+ color: var(--ab-ink-700);
609
+ font-size: 14px; line-height: 1.5;
610
+ max-width: 460px;
611
+ }
612
+ .ab-header__sub a { color: var(--ab-signal-500); text-decoration: underline; text-underline-offset: 3px; }
613
+
614
+ /* ---------- Eyebrow / labels / type ---------- */
615
+ .ab-eyebrow {
616
+ display: inline-block;
617
+ font-family: var(--font-body);
618
+ font-size: 11px; font-weight: 500;
619
+ text-transform: uppercase;
620
+ letter-spacing: 0.16em;
621
+ color: var(--ab-gilt-600);
622
+ margin-bottom: 6px;
623
+ }
624
+ .ab-h3 {
625
+ font-family: var(--font-display);
626
+ font-size: 22px; line-height: 1.2;
627
+ color: var(--ab-ink-900);
628
+ margin: 0 0 12px 0;
629
+ letter-spacing: -0.005em;
630
+ }
631
+ .ab-prose {
632
+ font-family: var(--font-body);
633
+ font-size: 14px; line-height: 1.55;
634
+ color: var(--ab-ink-700);
635
+ }
636
+ .ab-caption {
637
+ font-family: var(--font-body);
638
+ font-size: 12px;
639
+ color: var(--ab-ink-500);
640
+ letter-spacing: 0.02em;
641
+ }
642
+ .ab-divider {
643
+ height: 1px; background: var(--ab-border);
644
+ margin: 16px 0;
645
+ }
646
+
647
+ /* ---------- Cards ---------- */
648
+ .ab-card {
649
+ background: var(--ab-parchment-50);
650
+ border: 1px solid var(--ab-border);
651
+ border-radius: 12px;
652
+ padding: 20px 22px;
653
+ box-shadow: var(--ab-shadow-sm);
654
+ font-family: var(--font-body);
655
+ }
656
+ .ab-card--quiet {
657
+ background: transparent;
658
+ border-style: dashed;
659
+ box-shadow: none;
660
+ }
661
+
662
+ /* ---------- How-to (3-up) ---------- */
663
+ .ab-howto {
664
+ display: grid;
665
+ grid-template-columns: repeat(3, 1fr);
666
+ gap: 12px;
667
+ margin: 8px 0 20px;
668
+ }
669
+ @media (max-width: 900px) { .ab-howto { grid-template-columns: 1fr; } }
670
+ .ab-howto__step {
671
+ background: var(--ab-parchment-50);
672
+ border: 1px solid var(--ab-border);
673
+ border-radius: 12px;
674
+ padding: 16px 18px;
675
+ transition: transform var(--ab-ease) 220ms, box-shadow var(--ab-ease) 220ms;
676
+ }
677
+ .ab-howto__step:hover { transform: translateY(-1px); box-shadow: var(--ab-shadow-md); }
678
+ .ab-howto__num {
679
+ font-family: var(--font-display);
680
+ font-size: 28px;
681
+ color: var(--ab-gilt-500);
682
+ line-height: 1;
683
+ }
684
+ .ab-howto__title {
685
+ font-family: var(--font-body);
686
+ font-size: 14px; font-weight: 600;
687
+ color: var(--ab-ink-900);
688
+ margin: 8px 0 6px;
689
+ }
690
+ .ab-howto__body {
691
+ font-family: var(--font-body);
692
+ font-size: 13px; line-height: 1.5;
693
+ color: var(--ab-ink-700);
694
+ }
695
+
696
+ /* ---------- Result card ---------- */
697
+ .ab-result__head {
698
+ display: flex; align-items: center; gap: 14px;
699
+ }
700
+ .ab-result__marker {
701
+ font-size: 28px; line-height: 1;
702
+ }
703
+ .ab-result__label {
704
+ font-family: var(--font-display);
705
+ font-size: 28px;
706
+ line-height: 1.1;
707
+ letter-spacing: -0.01em;
708
+ margin-top: 2px;
709
+ }
710
+ .ab-score {
711
+ display: flex; align-items: center; gap: 14px;
712
+ margin: 6px 0 4px;
713
+ }
714
+ .ab-score__value {
715
+ font-family: var(--font-display);
716
+ font-size: 44px; line-height: 1;
717
+ letter-spacing: -0.02em;
718
+ }
719
+ .ab-score__value span { font-size: 16px; color: var(--ab-ink-500); margin-left: 2px; }
720
+ .ab-score__track {
721
+ flex: 1; height: 8px;
722
+ background: var(--ab-parchment-200);
723
+ border-radius: 999px; overflow: hidden;
724
+ }
725
+ .ab-score__fill {
726
+ height: 100%;
727
+ background: linear-gradient(90deg, var(--ab-threat-400), var(--ab-gilt-400) 50%, var(--ab-safe-400));
728
+ border-radius: 999px;
729
+ transition: width 380ms var(--ab-ease);
730
+ }
731
+ .ab-bars { display: flex; flex-direction: column; gap: 10px; margin-top: 4px; }
732
+ .ab-bar__row {
733
+ display: flex; justify-content: space-between;
734
+ font-size: 13px; margin-bottom: 4px;
735
+ }
736
+ .ab-bar__label { color: var(--ab-ink-800); font-weight: 500; }
737
+ .ab-bar__value { color: var(--ab-ink-700); font-family: var(--font-mono); font-size: 12px; }
738
+ .ab-bar__track {
739
+ height: 8px; background: var(--ab-parchment-200);
740
+ border-radius: 999px; overflow: hidden;
741
+ }
742
+ .ab-bar__fill { height: 100%; border-radius: 999px; transition: width 380ms var(--ab-ease); }
743
+ .ab-quote {
744
+ margin-top: 18px;
745
+ padding: 14px 16px;
746
+ background: var(--ab-parchment-100);
747
+ border-left: 2px solid var(--ab-gilt-400);
748
+ border-radius: 4px;
749
+ }
750
+ .ab-quote blockquote {
751
+ font-family: var(--font-display);
752
+ font-style: italic;
753
+ font-size: 16px;
754
+ color: var(--ab-ink-800);
755
+ margin: 6px 0 0; padding: 0;
756
+ line-height: 1.45;
757
+ }
758
+
759
+ /* ---------- Stats ---------- */
760
+ .ab-kpi-row {
761
+ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px;
762
+ margin: 4px 0 16px;
763
+ }
764
+ .ab-kpi {
765
+ background: var(--ab-parchment-100);
766
+ border: 1px solid var(--ab-border-subtle);
767
+ border-radius: 8px;
768
+ padding: 10px 12px;
769
+ text-align: center;
770
+ }
771
+ .ab-kpi__label {
772
+ font-family: var(--font-body);
773
+ font-size: 11px; text-transform: uppercase; letter-spacing: 0.12em;
774
+ color: var(--ab-ink-500);
775
+ margin-bottom: 4px;
776
+ }
777
+ .ab-kpi__value {
778
+ font-family: var(--font-display);
779
+ font-size: 26px; line-height: 1;
780
+ color: var(--ab-ink-900);
781
+ letter-spacing: -0.01em;
782
+ }
783
+ .ab-stats { display: flex; flex-direction: column; }
784
+ .ab-stats__row {
785
+ display: flex; align-items: center; gap: 10px;
786
+ padding: 6px 0;
787
+ border-bottom: 1px solid var(--ab-border-subtle);
788
+ font-size: 13px;
789
+ }
790
+ .ab-stats__row:last-child { border-bottom: 0; }
791
+ .ab-stats__dot { width: 8px; height: 8px; border-radius: 999px; flex-shrink: 0; }
792
+ .ab-stats__name { color: var(--ab-ink-800); flex: 1; }
793
+ .ab-stats__count { color: var(--ab-ink-600); font-family: var(--font-mono); font-size: 12px; }
794
+ .ab-stats__count em { color: var(--ab-ink-500); font-style: normal; }
795
+
796
+ /* ---------- Gradio component overrides ---------- */
797
+ .gradio-container .block, .gradio-container .form, .gradio-container .panel {
798
+ background: transparent !important;
799
+ border: none !important;
800
+ }
801
+ .gradio-container .gr-box, .gradio-container .gr-panel,
802
+ .gradio-container .gr-form, .gradio-container [data-testid="block"] {
803
+ background: transparent !important;
804
+ border: none !important;
805
+ box-shadow: none !important;
806
+ }
807
+
808
+ /* Plot wrapper — paper card */
809
+ #tsne-chart {
810
+ background: var(--ab-parchment-50) !important;
811
+ border: 1px solid var(--ab-border) !important;
812
+ border-radius: 12px !important;
813
+ padding: 8px !important;
814
+ box-shadow: var(--ab-shadow-sm) !important;
815
+ }
816
+
817
+ /* Buttons */
818
+ .gradio-container button {
819
+ font-family: var(--font-body) !important;
820
+ font-weight: 500 !important;
821
+ letter-spacing: 0 !important;
822
+ border-radius: 8px !important;
823
+ transition: transform 80ms var(--ab-ease), background-color 220ms var(--ab-ease) !important;
824
+ }
825
+ .gradio-container button:active { transform: scale(0.98) !important; }
826
+ .gradio-container button.primary, .gradio-container button[variant="primary"] {
827
+ background: var(--ab-ink-900) !important;
828
+ color: var(--ab-parchment-50) !important;
829
+ border: 1px solid var(--ab-ink-900) !important;
830
+ }
831
+ .gradio-container button.primary:hover {
832
+ background: var(--ab-ink-800) !important;
833
+ }
834
+ .gradio-container button.secondary {
835
+ background: var(--ab-parchment-50) !important;
836
+ color: var(--ab-ink-900) !important;
837
+ border: 1px solid var(--ab-border) !important;
838
+ }
839
+ .gradio-container button.secondary:hover {
840
+ background: var(--ab-parchment-200) !important;
841
+ }
842
+
843
+ /* Text inputs / textareas */
844
+ .gradio-container input[type="text"],
845
+ .gradio-container textarea,
846
+ .gradio-container .gr-input,
847
+ .gradio-container .gr-textbox textarea {
848
+ background: var(--ab-parchment-50) !important;
849
+ color: var(--ab-ink-900) !important;
850
+ border: 1px solid var(--ab-border) !important;
851
+ border-radius: 8px !important;
852
+ font-family: var(--font-body) !important;
853
+ font-size: 14px !important;
854
+ box-shadow: inset 0 1px 2px rgba(17,32,58,0.04);
855
+ }
856
+ .gradio-container input[type="text"]:focus,
857
+ .gradio-container textarea:focus,
858
+ .gradio-container .gr-textbox textarea:focus {
859
+ outline: none !important;
860
+ border-color: var(--ab-gilt-400) !important;
861
+ box-shadow: 0 0 0 3px rgba(220,139,42,0.18) !important;
862
+ }
863
+
864
+ /* Labels */
865
+ .gradio-container label, .gradio-container .label-wrap {
866
+ color: var(--ab-ink-700) !important;
867
+ font-family: var(--font-body) !important;
868
+ font-size: 13px !important;
869
+ font-weight: 500 !important;
870
+ letter-spacing: 0.01em !important;
871
+ }
872
+
873
+ /* Dropdowns */
874
+ .gradio-container .gr-dropdown, .gradio-container [data-testid="dropdown"] select,
875
+ .gradio-container .wrap.svelte-1cl284s {
876
+ background: var(--ab-parchment-50) !important;
877
+ border: 1px solid var(--ab-border) !important;
878
+ border-radius: 8px !important;
879
+ color: var(--ab-ink-900) !important;
880
+ }
881
+
882
+ /* Checkbox group filter */
883
+ .gradio-container .gr-check-radio,
884
+ .gradio-container fieldset[data-testid="checkbox-group"] {
885
+ background: var(--ab-parchment-50) !important;
886
+ border: 1px solid var(--ab-border) !important;
887
+ border-radius: 12px !important;
888
+ padding: 12px 14px !important;
889
+ }
890
+ .gradio-container fieldset[data-testid="checkbox-group"] label {
891
+ background: var(--ab-parchment-100) !important;
892
+ border: 1px solid var(--ab-border-subtle) !important;
893
+ border-radius: 999px !important;
894
+ padding: 4px 10px !important;
895
+ margin: 3px !important;
896
+ font-size: 12px !important;
897
+ }
898
+ .gradio-container fieldset[data-testid="checkbox-group"] label:hover {
899
+ background: var(--ab-parchment-200) !important;
900
+ }
901
+ .gradio-container input[type="checkbox"]:checked + * {
902
+ color: var(--ab-ink-900) !important;
903
+ }
904
+ .gradio-container input[type="checkbox"] {
905
+ accent-color: var(--ab-gilt-400) !important;
906
+ }
907
+
908
+ /* Markdown */
909
+ .gradio-container .markdown, .gradio-container .prose {
910
+ color: var(--ab-ink-800) !important;
911
+ font-family: var(--font-body) !important;
912
+ }
913
+ .gradio-container .markdown h1, .gradio-container .markdown h2,
914
+ .gradio-container .prose h1, .gradio-container .prose h2 {
915
+ font-family: var(--font-display) !important;
916
+ color: var(--ab-ink-900) !important;
917
+ font-weight: 400 !important;
918
+ letter-spacing: -0.01em !important;
919
+ }
920
+ .gradio-container .markdown h3, .gradio-container .prose h3 {
921
+ font-family: var(--font-body) !important;
922
+ font-weight: 600 !important;
923
+ color: var(--ab-ink-900) !important;
924
+ font-size: 16px !important;
925
+ margin-bottom: 8px !important;
926
+ }
927
+ .gradio-container .markdown strong { color: var(--ab-ink-900) !important; font-weight: 600 !important; }
928
+ .gradio-container .markdown a { color: var(--ab-signal-500) !important; }
929
+ .gradio-container .markdown hr {
930
+ border: none !important;
931
+ border-top: 1px solid var(--ab-border) !important;
932
+ margin: 18px 0 !important;
933
+ }
934
+
935
+ /* Hidden index input (kept invisible) */
936
+ #click-index-input {
937
+ position: absolute !important;
938
+ width: 1px !important;
939
+ height: 1px !important;
940
+ overflow: hidden !important;
941
+ opacity: 0 !important;
942
+ pointer-events: none !important;
943
+ }
944
+
945
+ /* Footer */
946
+ .ab-footer {
947
+ border-top: 1px solid var(--ab-border);
948
+ margin-top: 36px;
949
+ padding-top: 18px;
950
+ text-align: center;
951
+ }
952
+ .ab-footer__line {
953
+ font-family: var(--font-body);
954
+ color: var(--ab-ink-500);
955
+ font-size: 12px;
956
+ letter-spacing: 0.02em;
957
+ }
958
+ .ab-footer__line a { color: var(--ab-signal-500); }
959
+ .ab-footer__mark {
960
+ font-family: var(--font-display);
961
+ color: var(--ab-gilt-500);
962
+ font-size: 14px;
963
+ letter-spacing: 0.04em;
964
+ margin-bottom: 6px;
965
+ }
966
+ .ab-footer__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
967
+ .ab-footer__mark .ar { font-family: 'Amiri', serif; }
968
+ """
969
+
970
+
971
+ # ---------------------------------------------------------------------------
972
+ # Header / How-to / Footer markup
973
  # ---------------------------------------------------------------------------
974
+ HEADER_HTML = """
975
+ <header class="ab-header">
976
+ <div class="ab-header__brand">
977
+ <div class="ab-header__mark">
978
+ <span class="heb">א-ב</span>&nbsp;·&nbsp;<span class="ar">أب</span>
979
+ </div>
980
+ <div>
981
+ <h1 class="ab-header__title">GuardLLM <em>—</em> Prompt Security Visualizer</h1>
982
+ </div>
983
+ </div>
984
+ <p class="ab-header__sub">
985
+ Editorial inspection of the prompt attack surface. Powered by
986
+ <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank">Llama Prompt Guard 2 (86M)</a>
987
+ on the <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank">neuralchemy</a> corpus.
988
  </p>
989
+ </header>
990
  """
991
 
992
  HOW_TO_HTML = """
993
+ <div class="ab-howto">
994
+ <div class="ab-howto__step">
995
+ <div class="ab-howto__num">01</div>
996
+ <div class="ab-eyebrow">Map</div>
997
+ <div class="ab-howto__title">Explore the landscape</div>
998
+ <div class="ab-howto__body">
999
+ Each point is a prompt placed by semantic similarity. Color encodes the attack class.
1000
+ Hover to preview, scroll to zoom, drag to pan.
1001
  </div>
1002
+ </div>
1003
+ <div class="ab-howto__step">
1004
+ <div class="ab-howto__num">02</div>
1005
+ <div class="ab-eyebrow">Inspect</div>
1006
+ <div class="ab-howto__title">Click to analyze</div>
1007
+ <div class="ab-howto__body">
1008
+ Selecting a point runs the classifier and returns a verdict, a safety score,
1009
+ and the full class probability breakdown.
1010
  </div>
1011
+ </div>
1012
+ <div class="ab-howto__step">
1013
+ <div class="ab-howto__num">03</div>
1014
+ <div class="ab-eyebrow">Probe</div>
1015
+ <div class="ab-howto__title">Try your own prompt</div>
1016
+ <div class="ab-howto__body">
1017
+ Paste any text into the custom field below to see whether the model would flag
1018
+ it as injection or jailbreak.
1019
  </div>
1020
  </div>
1021
  </div>
1022
  """
1023
 
1024
+ FOOTER_HTML = """
1025
+ <footer class="ab-footer">
1026
+ <div class="ab-footer__mark"><span class="heb">א-ב</span> · ALEPH BETH · <span class="ar">أب</span></div>
1027
+ <div class="ab-footer__line">
1028
+ GuardLLM — Prompt Security Visualizer.
1029
+ Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">Llama Prompt Guard 2 (86M)</a>.
1030
+ Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">neuralchemy / Prompt-injection-dataset</a>.
1031
+ </div>
1032
+ </footer>
1033
+ """
1034
+
1035
+
1036
+ # ---------------------------------------------------------------------------
1037
+ # Gradio theme (parchment / ink)
1038
+ # ---------------------------------------------------------------------------
1039
# Border colour reused by the primary border, block borders and input borders.
_AB_HAIRLINE = "rgba(17,32,58,0.12)"

# Primary hue ramp: parchment tints, then gilt accents, then ink for the darkest steps.
_ab_primary_hue = gr.themes.Color(
    c50=AB["parchment_50"],
    c100=AB["parchment_100"],
    c200=AB["parchment_200"],
    c300=AB["parchment_300"],
    c400=AB["gilt_300"],
    c500=AB["gilt_400"],
    c600=AB["gilt_500"],
    c700=AB["gilt_600"],
    c800=AB["ink_800"],
    c900=AB["ink_900"],
    c950=AB["ink_950"],
)

# Neutral hue ramp: parchment tints for the light steps, ink for the rest.
_ab_neutral_hue = gr.themes.Color(
    c50=AB["parchment_50"],
    c100=AB["parchment_100"],
    c200=AB["parchment_200"],
    c300=AB["ink_200"],
    c400=AB["ink_300"],
    c500=AB["ink_500"],
    c600=AB["ink_600"],
    c700=AB["ink_700"],
    c800=AB["ink_800"],
    c900=AB["ink_900"],
    c950=AB["ink_950"],
)

# Theme variable overrides applied on top of gr.themes.Base.
_AB_THEME_OVERRIDES = {
    "body_background_fill": AB["parchment_100"],
    "body_text_color": AB["ink_900"],
    "background_fill_primary": AB["parchment_50"],
    "background_fill_secondary": AB["parchment_100"],
    "border_color_primary": _AB_HAIRLINE,
    "block_background_fill": AB["parchment_50"],
    "block_border_color": _AB_HAIRLINE,
    "block_label_text_color": AB["ink_700"],
    "block_title_text_color": AB["ink_900"],
    "input_background_fill": AB["parchment_50"],
    "input_border_color": _AB_HAIRLINE,
    "input_border_color_focus": AB["gilt_400"],
    "button_primary_background_fill": AB["ink_900"],
    "button_primary_background_fill_hover": AB["ink_800"],
    "button_primary_text_color": AB["parchment_50"],
    "button_secondary_background_fill": AB["parchment_50"],
    "button_secondary_background_fill_hover": AB["parchment_200"],
    "button_secondary_text_color": AB["ink_900"],
}

# Parchment / ink theme passed to gr.Blocks(theme=...).
ab_theme = gr.themes.Base(
    primary_hue=_ab_primary_hue,
    neutral_hue=_ab_neutral_hue,
    font=[gr.themes.GoogleFont("Geist"), "Inter", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("Geist Mono"), "JetBrains Mono", "monospace"],
).set(**_AB_THEME_OVERRIDES)
1074
+
1075
+
1076
+ # ---------------------------------------------------------------------------
1077
+ # Gradio Interface
1078
+ # ---------------------------------------------------------------------------
1079
  with gr.Blocks(
1080
+ title="GuardLLM Prompt Security Visualizer",
1081
+ theme=ab_theme,
1082
+ css=ALEPH_BETH_CSS,
1083
  ) as demo:
1084
 
1085
+ gr.HTML(HEADER_HTML)
1086
  gr.HTML(HOW_TO_HTML)
1087
 
1088
+ click_index = gr.Textbox(value="", visible=True, elem_id="click-index-input")
 
 
 
 
1089
 
1090
  with gr.Row():
1091
+ # ---- Left t-SNE chart + filters ----
1092
  with gr.Column(scale=3):
1093
  with gr.Row():
1094
+ select_all_btn = gr.Button("Select all", size="sm", scale=1)
1095
+ deselect_all_btn = gr.Button("Deselect all", size="sm", scale=1)
1096
 
1097
  category_filter = gr.CheckboxGroup(
1098
  choices=UNIQUE_CATEGORIES,
 
1102
  )
1103
  tsne_plot = gr.Plot(
1104
  value=build_tsne_figure(),
1105
+ label="t-SNE space",
1106
  elem_id="tsne-chart",
1107
  )
1108
  gr.Markdown(
1109
+ "<span class='ab-caption'>Click a point to inspect it. "
1110
+ "Hover to preview. Scroll to zoom, drag to pan.</span>"
1111
  )
1112
 
1113
+ # ---- Right Analysis + controls + stats ----
1114
  with gr.Column(scale=2):
1115
+ gr.HTML("<div class='ab-eyebrow'>Analysis</div>"
1116
+ "<h3 class='ab-h3'>Verdict & confidence</h3>")
1117
  result_html = gr.HTML(value=empty_analysis_html())
1118
  risk_md = gr.Markdown(value="")
1119
+ full_prompt = gr.Textbox(
1120
+ label="Full prompt",
1121
+ lines=3,
1122
+ interactive=False,
1123
+ visible=True,
1124
+ )
1125
 
1126
  gr.Markdown("---")
1127
 
1128
+ gr.HTML("<div class='ab-eyebrow'>Library</div>"
1129
+ "<h3 class='ab-h3'>Pick a prompt</h3>")
1130
  prompt_dropdown = gr.Dropdown(
1131
  choices=DROPDOWN_CHOICES,
1132
+ label="Search the dataset",
1133
  filterable=True,
1134
  interactive=True,
1135
  )
1136
 
1137
+ gr.HTML("<div class='ab-eyebrow' style='margin-top:14px;'>Custom</div>"
1138
+ "<h3 class='ab-h3'>Analyze your own</h3>")
1139
  manual_input = gr.Textbox(
1140
+ label="Prompt",
1141
+ placeholder="Type or paste a request to evaluate…",
1142
  lines=2,
1143
  )
1144
+ analyze_btn = gr.Button("Inspect", variant="primary")
1145
 
1146
  gr.Markdown("---")
1147
 
1148
  gr.HTML(build_stats_html())
1149
 
1150
  # ---- Events ----
1151
+ category_filter.change(fn=on_filter_change, inputs=[category_filter], outputs=[tsne_plot])
1152
+ select_all_btn.click(fn=select_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
1153
+ deselect_all_btn.click(fn=deselect_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
1154
+ click_index.change(fn=on_index_input, inputs=[click_index],
1155
+ outputs=[result_html, risk_md, full_prompt])
1156
+ prompt_dropdown.change(fn=on_dropdown_select, inputs=[prompt_dropdown],
1157
+ outputs=[result_html, risk_md, full_prompt])
1158
+ analyze_btn.click(fn=on_manual_analyze, inputs=[manual_input],
1159
+ outputs=[result_html, risk_md])
1160
+ manual_input.submit(fn=on_manual_analyze, inputs=[manual_input],
1161
+ outputs=[result_html, risk_md])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1162
  demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)
1163
 
1164
+ gr.HTML(FOOTER_HTML)
 
 
 
 
 
 
 
 
 
 
 
1165
 
1166
 
1167
  logger.info("Gradio app built. Ready to launch.")
1168
 
1169
  if __name__ == "__main__":
1170
+ demo.launch()