AlephBeth-AI committed on
Commit
b1516cb
·
verified ·
1 Parent(s): 2718a1f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +105 -92
app.py CHANGED
@@ -45,20 +45,20 @@ CATEGORY_COLORS = {
45
  "unknown": "#64748b",
46
  }
47
 
48
- CATEGORY_LABELS_FR = {
49
- "benign": "Benin",
50
- "direct_injection": "Injection directe",
51
  "jailbreak": "Jailbreak",
52
- "system_extraction": "Extraction systeme",
53
- "encoding_obfuscation": "Obfuscation/Encodage",
54
- "persona_replacement": "Remplacement persona",
55
- "indirect_injection": "Injection indirecte",
56
- "token_smuggling": "Token smuggling",
57
- "many_shot": "Many-shot",
58
  "crescendo": "Crescendo",
59
- "context_overflow": "Overflow contexte",
60
- "prompt_leaking": "Fuite de prompt",
61
- "unknown": "Inconnu",
62
  }
63
 
64
  # ---------------------------------------------------------------------------
@@ -132,8 +132,6 @@ def analyze_prompt(text):
132
  outputs = model(**inputs)
133
  probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
134
  pred_idx = int(np.argmax(probs))
135
- pred_label = LABELS[pred_idx]
136
- confidence = float(probs[pred_idx])
137
  prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
138
  safety = float(probs[0])
139
  return prob_dict, safety
@@ -160,18 +158,18 @@ def build_tsne_figure(selected_categories=None):
160
  ]
161
  severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
162
  hover_texts = [
163
- f"<b>{CATEGORY_LABELS_FR.get(cat, cat)}</b><br>"
164
- f"Severite: {sev}<br>"
165
  f"Index: {idx}<br>"
166
  f"<i>{txt}</i>"
167
  for idx, txt, sev in zip(indices, texts_preview, severities)
168
  ]
169
  color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
170
- label_fr = CATEGORY_LABELS_FR.get(cat, cat)
171
  fig.add_trace(go.Scatter(
172
  x=x, y=y,
173
  mode="markers",
174
- name=label_fr,
175
  marker=dict(
176
  size=5 if len(indices) > 500 else 7,
177
  color=color,
@@ -187,12 +185,12 @@ def build_tsne_figure(selected_categories=None):
187
  paper_bgcolor="#0f172a",
188
  plot_bgcolor="#1e293b",
189
  title=dict(
190
- text="Espace d'Embedding t-SNE - Paysage de Securite des Prompts",
191
  font=dict(size=16, color="#e2e8f0"),
192
  x=0.5,
193
  ),
194
  legend=dict(
195
- title=dict(text="Categorie", font=dict(color="#94a3b8")),
196
  bgcolor="rgba(15,23,42,0.9)",
197
  bordercolor="#334155",
198
  borderwidth=1,
@@ -222,9 +220,17 @@ def on_filter_change(categories):
222
  return build_tsne_figure(sel)
223
 
224
 
 
 
 
 
 
 
 
 
225
  def on_dropdown_select(choice):
226
  if not choice:
227
- return empty_analysis_html(), "*Selectionnez un prompt.*", ""
228
  try:
229
  idx = int(choice.split(" | ")[0])
230
  text = ALL_TEXTS[idx]
@@ -237,24 +243,24 @@ def on_dropdown_select(choice):
237
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
238
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
239
  risk_text += (
240
- f"\n\n---\n**Metadonnees du dataset :**\n"
241
- f"- Categorie : **{CATEGORY_LABELS_FR.get(category, category)}**\n"
242
- f"- Severite : **{severity}**\n"
243
- f"- Verite terrain : **{ground_truth}**\n"
244
  )
245
  return result_html, risk_text, text
246
  except Exception as e:
247
  logger.error("Error: %s", e)
248
- return empty_analysis_html(), f"Erreur : {e}", ""
249
 
250
 
251
  def on_index_input(idx_str):
252
  if not idx_str or not idx_str.strip():
253
- return empty_analysis_html(), "*Cliquez sur un point du graphique.*", ""
254
  try:
255
  idx = int(idx_str.strip())
256
  if idx < 0 or idx >= len(ALL_TEXTS):
257
- return empty_analysis_html(), f"Index invalide : {idx}", ""
258
  text = ALL_TEXTS[idx]
259
  category = ALL_CATEGORIES[idx]
260
  severity = ALL_SEVERITIES[idx] or "N/A"
@@ -265,15 +271,15 @@ def on_index_input(idx_str):
265
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
266
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
267
  risk_text += (
268
- f"\n\n---\n**Metadonnees du dataset :**\n"
269
- f"- Categorie : **{CATEGORY_LABELS_FR.get(category, category)}**\n"
270
- f"- Severite : **{severity}**\n"
271
- f"- Verite terrain : **{ground_truth}**\n"
272
  )
273
  return result_html, risk_text, text
274
  except Exception as e:
275
  logger.error("Error: %s", e)
276
- return empty_analysis_html(), f"Erreur : {e}", ""
277
 
278
 
279
  def on_manual_analyze(text):
@@ -293,9 +299,9 @@ def on_manual_analyze(text):
293
  def empty_analysis_html():
294
  return """
295
  <div style="text-align:center; padding:30px; color:#94a3b8;">
296
- <p style="font-size:1em;">Cliquez sur un point du graphique,<br>
297
- selectionnez un prompt dans la liste,<br>
298
- ou entrez un prompt manuellement.</p>
299
  </div>
300
  """
301
 
@@ -333,11 +339,11 @@ def build_result_html(label, confidence, probs, text):
333
  <div style="text-align:center; margin-bottom:14px;">
334
  <div style="font-size:2em;">{emoji}</div>
335
  <div style="font-size:1.2em; font-weight:700; color:{color};">{label}</div>
336
- <div style="color:#94a3b8; font-size:0.85em;">Confiance : {pct:.1f}%</div>
337
  </div>
338
  <div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
339
  <div style="display:flex; justify-content:space-between; margin-bottom:4px;">
340
- <span style="color:#e2e8f0; font-weight:600;">Score de securite</span>
341
  <span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
342
  </div>
343
  <div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
@@ -349,7 +355,7 @@ def build_result_html(label, confidence, probs, text):
349
  {bars_html}
350
  </div>
351
  <div style="background:#1e293b; border-radius:10px; padding:12px;">
352
- <div style="color:#94a3b8; font-size:0.8em; margin-bottom:3px;">Prompt analyse :</div>
353
  <div style="color:#cbd5e1; font-style:italic; word-break:break-word; font-size:0.85em;">"{preview}"</div>
354
  </div>
355
  </div>
@@ -360,18 +366,18 @@ def build_risk_assessment(label, confidence, probs):
360
  safety_score = probs["Benign"] * 100
361
  malicious_score = probs["Malicious"] * 100
362
  if label == "Benign" and confidence > 0.85:
363
- level, desc = "Faible", "Ce prompt semble **sur**. Aucun pattern d'injection ou de jailbreak detecte."
364
  elif label == "Benign":
365
- level, desc = "Modere", "Probablement benin, mais confiance moderee. Formulation potentiellement ambigue."
366
  elif confidence > 0.85:
367
- level, desc = "Critique", "**Prompt malveillant detecte** avec haute confiance. Probable tentative d'injection ou de jailbreak."
368
  else:
369
- level, desc = "Eleve", "**Prompt malveillant detecte.** Possible injection ou jailbreak. Revue recommandee."
370
  return (
371
- f"### Niveau de risque : {level}\n\n{desc}\n\n"
372
- f"**Details :**\n"
373
- f"- Score de securite : **{safety_score:.0f}/100**\n"
374
- f"- Classe predite : **{label}** ({confidence*100:.1f}%)\n"
375
  f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
376
  )
377
 
@@ -388,27 +394,27 @@ def build_stats_html():
388
  count = cat_counts[cat]
389
  color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
390
  pct = count / total * 100
391
- label_fr = CATEGORY_LABELS_FR.get(cat, cat)
392
  cats_html += (
393
  f'<div style="display:flex; justify-content:space-between; padding:2px 0;">'
394
- f'<span style="color:{color}; font-weight:500; font-size:0.85em;">{label_fr}</span>'
395
  f'<span style="color:#94a3b8; font-size:0.85em;">{count} ({pct:.1f}%)</span>'
396
  f'</div>'
397
  )
398
  return f"""
399
  <div style="background:#0f172a; border-radius:12px; padding:14px; font-family:system-ui,sans-serif;">
400
- <div style="color:#e2e8f0; font-weight:700; margin-bottom:8px;">Statistiques du dataset</div>
401
  <div style="display:flex; gap:10px; margin-bottom:10px;">
402
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
403
  <div style="color:#94a3b8; font-size:0.75em;">Total</div>
404
  <div style="color:#e2e8f0; font-weight:700; font-size:1.2em;">{total:,}</div>
405
  </div>
406
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
407
- <div style="color:#22c55e; font-size:0.75em;">Benin</div>
408
  <div style="color:#22c55e; font-weight:700; font-size:1.2em;">{n_benign:,}</div>
409
  </div>
410
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
411
- <div style="color:#ef4444; font-size:0.75em;">Malveillant</div>
412
  <div style="color:#ef4444; font-weight:700; font-size:1.2em;">{n_malicious:,}</div>
413
  </div>
414
  </div>
@@ -430,7 +436,7 @@ PLOTLY_CLICK_JS = """
430
  setTimeout(setupClickHandler, 500);
431
  return;
432
  }
433
- plotEl.on('plotly_click', function(data) {
434
  if (data && data.points && data.points.length > 0) {
435
  const idx = data.points[0].customdata;
436
  if (idx !== undefined && idx !== null) {
@@ -447,29 +453,13 @@ PLOTLY_CLICK_JS = """
447
  }
448
  }
449
  }
450
- });
 
451
  const observer = new MutationObserver(() => {
452
  const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
453
  if (newPlot && !newPlot._hasClickHandler) {
454
  newPlot._hasClickHandler = true;
455
- newPlot.on('plotly_click', function(data) {
456
- if (data && data.points && data.points.length > 0) {
457
- const idx = data.points[0].customdata;
458
- if (idx !== undefined && idx !== null) {
459
- const inputEl = document.querySelector('#click-index-input textarea');
460
- if (inputEl) {
461
- const nativeSetter = Object.getOwnPropertyDescriptor(
462
- window.HTMLTextAreaElement.prototype, 'value'
463
- ).set;
464
- nativeSetter.call(inputEl, String(idx));
465
- inputEl.dispatchEvent(new Event('input', { bubbles: true }));
466
- setTimeout(() => {
467
- inputEl.dispatchEvent(new Event('change', { bubbles: true }));
468
- }, 50);
469
- }
470
- }
471
- }
472
- });
473
  }
474
  });
475
  observer.observe(document.querySelector('#tsne-chart') || document.body, {
@@ -488,7 +478,7 @@ TITLE_HTML = """
488
  <div style="text-align:center; padding:10px 0;">
489
  <h1 style="font-size:1.8em; margin:0;">GuardLLM - Prompt Security Visualizer</h1>
490
  <p style="color:#94a3b8; font-size:0.95em; margin-top:4px;">
491
- Espace d'embedding t-SNE interactif &bull;
492
  <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" style="color:#60a5fa;">
493
  Llama Prompt Guard 2</a> &bull;
494
  <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" style="color:#60a5fa;">
@@ -510,50 +500,73 @@ with gr.Blocks(
510
  )
511
 
512
  with gr.Row():
 
513
  with gr.Column(scale=3):
 
 
 
 
514
  category_filter = gr.CheckboxGroup(
515
  choices=UNIQUE_CATEGORIES,
516
  value=UNIQUE_CATEGORIES,
517
- label="Filtrer par categorie",
518
  interactive=True,
519
  )
520
  tsne_plot = gr.Plot(
521
  value=build_tsne_figure(),
522
- label="Espace t-SNE",
523
  elem_id="tsne-chart",
524
  )
525
  gr.Markdown(
526
- "*Cliquez sur un point pour l'analyser. "
527
- "Survolez pour voir le texte. Utilisez la molette pour zoomer.*"
528
  )
529
 
 
530
  with gr.Column(scale=2):
531
- gr.HTML(build_stats_html())
532
- gr.Markdown("### Selectionner un prompt")
 
 
 
 
 
 
533
  prompt_dropdown = gr.Dropdown(
534
  choices=DROPDOWN_CHOICES,
535
- label="Rechercher dans le dataset",
536
  filterable=True,
537
  interactive=True,
538
  )
539
- gr.Markdown("### Ou analyser un prompt libre")
 
540
  manual_input = gr.Textbox(
541
- label="Prompt personnalise",
542
- placeholder="Tapez ou collez un prompt...",
543
  lines=2,
544
  )
545
- analyze_btn = gr.Button("Analyser", variant="primary")
 
546
  gr.Markdown("---")
547
- gr.Markdown("### Resultat de l'analyse")
548
- result_html = gr.HTML(value=empty_analysis_html())
549
- risk_md = gr.Markdown(value="")
550
- full_prompt = gr.Textbox(label="Prompt complet", lines=3, interactive=False, visible=True)
551
 
 
 
 
552
  category_filter.change(
553
  fn=on_filter_change,
554
  inputs=[category_filter],
555
  outputs=[tsne_plot],
556
  )
 
 
 
 
 
 
 
 
 
 
557
  click_index.change(
558
  fn=on_index_input,
559
  inputs=[click_index],
@@ -580,10 +593,10 @@ with gr.Blocks(
580
  """
581
  ---
582
  <div style="text-align:center; color:#64748b; font-size:0.8em;">
583
- <strong>GuardLLM</strong> - Visualiseur de securite des prompts<br>
584
- Modele : <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
585
- Llama Prompt Guard 2 (86M)</a> par Meta |
586
- Dataset : <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">
587
  neuralchemy/Prompt-injection-dataset</a>
588
  </div>
589
  """
 
45
  "unknown": "#64748b",
46
  }
47
 
48
+ CATEGORY_LABELS = {
49
+ "benign": "Benign",
50
+ "direct_injection": "Direct Injection",
51
  "jailbreak": "Jailbreak",
52
+ "system_extraction": "System Extraction",
53
+ "encoding_obfuscation": "Encoding / Obfuscation",
54
+ "persona_replacement": "Persona Replacement",
55
+ "indirect_injection": "Indirect Injection",
56
+ "token_smuggling": "Token Smuggling",
57
+ "many_shot": "Many-Shot",
58
  "crescendo": "Crescendo",
59
+ "context_overflow": "Context Overflow",
60
+ "prompt_leaking": "Prompt Leaking",
61
+ "unknown": "Unknown",
62
  }
63
 
64
  # ---------------------------------------------------------------------------
 
132
  outputs = model(**inputs)
133
  probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
134
  pred_idx = int(np.argmax(probs))
 
 
135
  prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
136
  safety = float(probs[0])
137
  return prob_dict, safety
 
158
  ]
159
  severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
160
  hover_texts = [
161
+ f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
162
+ f"Severity: {sev}<br>"
163
  f"Index: {idx}<br>"
164
  f"<i>{txt}</i>"
165
  for idx, txt, sev in zip(indices, texts_preview, severities)
166
  ]
167
  color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
168
+ label = CATEGORY_LABELS.get(cat, cat)
169
  fig.add_trace(go.Scatter(
170
  x=x, y=y,
171
  mode="markers",
172
+ name=label,
173
  marker=dict(
174
  size=5 if len(indices) > 500 else 7,
175
  color=color,
 
185
  paper_bgcolor="#0f172a",
186
  plot_bgcolor="#1e293b",
187
  title=dict(
188
+ text="t-SNE Embedding Space - Prompt Security Landscape",
189
  font=dict(size=16, color="#e2e8f0"),
190
  x=0.5,
191
  ),
192
  legend=dict(
193
+ title=dict(text="Category", font=dict(color="#94a3b8")),
194
  bgcolor="rgba(15,23,42,0.9)",
195
  bordercolor="#334155",
196
  borderwidth=1,
 
220
  return build_tsne_figure(sel)
221
 
222
 
223
+ def select_all_categories():
224
+ return gr.update(value=UNIQUE_CATEGORIES), build_tsne_figure(UNIQUE_CATEGORIES)
225
+
226
+
227
+ def deselect_all_categories():
228
+ return gr.update(value=[]), build_tsne_figure([])
229
+
230
+
231
  def on_dropdown_select(choice):
232
  if not choice:
233
+ return empty_analysis_html(), "*Select a prompt.*", ""
234
  try:
235
  idx = int(choice.split(" | ")[0])
236
  text = ALL_TEXTS[idx]
 
243
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
244
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
245
  risk_text += (
246
+ f"\n\n---\n**Dataset metadata:**\n"
247
+ f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
248
+ f"- Severity: **{severity}**\n"
249
+ f"- Ground truth: **{ground_truth}**\n"
250
  )
251
  return result_html, risk_text, text
252
  except Exception as e:
253
  logger.error("Error: %s", e)
254
+ return empty_analysis_html(), f"Error: {e}", ""
255
 
256
 
257
  def on_index_input(idx_str):
258
  if not idx_str or not idx_str.strip():
259
+ return empty_analysis_html(), "*Click a point on the chart.*", ""
260
  try:
261
  idx = int(idx_str.strip())
262
  if idx < 0 or idx >= len(ALL_TEXTS):
263
+ return empty_analysis_html(), f"Invalid index: {idx}", ""
264
  text = ALL_TEXTS[idx]
265
  category = ALL_CATEGORIES[idx]
266
  severity = ALL_SEVERITIES[idx] or "N/A"
 
271
  result_html = build_result_html(pred_label, confidence, prob_dict, text)
272
  risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
273
  risk_text += (
274
+ f"\n\n---\n**Dataset metadata:**\n"
275
+ f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
276
+ f"- Severity: **{severity}**\n"
277
+ f"- Ground truth: **{ground_truth}**\n"
278
  )
279
  return result_html, risk_text, text
280
  except Exception as e:
281
  logger.error("Error: %s", e)
282
+ return empty_analysis_html(), f"Error: {e}", ""
283
 
284
 
285
  def on_manual_analyze(text):
 
299
  def empty_analysis_html():
300
  return """
301
  <div style="text-align:center; padding:30px; color:#94a3b8;">
302
+ <p style="font-size:1em;">Click a point on the chart,<br>
303
+ select a prompt from the list,<br>
304
+ or enter a custom prompt below.</p>
305
  </div>
306
  """
307
 
 
339
  <div style="text-align:center; margin-bottom:14px;">
340
  <div style="font-size:2em;">{emoji}</div>
341
  <div style="font-size:1.2em; font-weight:700; color:{color};">{label}</div>
342
+ <div style="color:#94a3b8; font-size:0.85em;">Confidence: {pct:.1f}%</div>
343
  </div>
344
  <div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
345
  <div style="display:flex; justify-content:space-between; margin-bottom:4px;">
346
+ <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
347
  <span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
348
  </div>
349
  <div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
 
355
  {bars_html}
356
  </div>
357
  <div style="background:#1e293b; border-radius:10px; padding:12px;">
358
+ <div style="color:#94a3b8; font-size:0.8em; margin-bottom:3px;">Analyzed prompt:</div>
359
  <div style="color:#cbd5e1; font-style:italic; word-break:break-word; font-size:0.85em;">"{preview}"</div>
360
  </div>
361
  </div>
 
366
  safety_score = probs["Benign"] * 100
367
  malicious_score = probs["Malicious"] * 100
368
  if label == "Benign" and confidence > 0.85:
369
+ level, desc = "Low", "This prompt appears **safe**. No injection or jailbreak patterns detected."
370
  elif label == "Benign":
371
+ level, desc = "Moderate", "Likely benign, but moderate confidence. Potentially ambiguous wording."
372
  elif confidence > 0.85:
373
+ level, desc = "Critical", "**Malicious prompt detected** with high confidence. Likely injection or jailbreak attempt."
374
  else:
375
+ level, desc = "High", "**Malicious prompt detected.** Possible injection or jailbreak. Review recommended."
376
  return (
377
+ f"### Risk Level: {level}\n\n{desc}\n\n"
378
+ f"**Details:**\n"
379
+ f"- Safety score: **{safety_score:.0f}/100**\n"
380
+ f"- Predicted class: **{label}** ({confidence*100:.1f}%)\n"
381
  f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
382
  )
383
 
 
394
  count = cat_counts[cat]
395
  color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
396
  pct = count / total * 100
397
+ label = CATEGORY_LABELS.get(cat, cat)
398
  cats_html += (
399
  f'<div style="display:flex; justify-content:space-between; padding:2px 0;">'
400
+ f'<span style="color:{color}; font-weight:500; font-size:0.85em;">{label}</span>'
401
  f'<span style="color:#94a3b8; font-size:0.85em;">{count} ({pct:.1f}%)</span>'
402
  f'</div>'
403
  )
404
  return f"""
405
  <div style="background:#0f172a; border-radius:12px; padding:14px; font-family:system-ui,sans-serif;">
406
+ <div style="color:#e2e8f0; font-weight:700; margin-bottom:8px;">Dataset Statistics</div>
407
  <div style="display:flex; gap:10px; margin-bottom:10px;">
408
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
409
  <div style="color:#94a3b8; font-size:0.75em;">Total</div>
410
  <div style="color:#e2e8f0; font-weight:700; font-size:1.2em;">{total:,}</div>
411
  </div>
412
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
413
+ <div style="color:#22c55e; font-size:0.75em;">Benign</div>
414
  <div style="color:#22c55e; font-weight:700; font-size:1.2em;">{n_benign:,}</div>
415
  </div>
416
  <div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
417
+ <div style="color:#ef4444; font-size:0.75em;">Malicious</div>
418
  <div style="color:#ef4444; font-weight:700; font-size:1.2em;">{n_malicious:,}</div>
419
  </div>
420
  </div>
 
436
  setTimeout(setupClickHandler, 500);
437
  return;
438
  }
439
+ function handleClick(data) {
440
  if (data && data.points && data.points.length > 0) {
441
  const idx = data.points[0].customdata;
442
  if (idx !== undefined && idx !== null) {
 
453
  }
454
  }
455
  }
456
+ }
457
+ plotEl.on('plotly_click', handleClick);
458
  const observer = new MutationObserver(() => {
459
  const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
460
  if (newPlot && !newPlot._hasClickHandler) {
461
  newPlot._hasClickHandler = true;
462
+ newPlot.on('plotly_click', handleClick);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  }
464
  });
465
  observer.observe(document.querySelector('#tsne-chart') || document.body, {
 
478
  <div style="text-align:center; padding:10px 0;">
479
  <h1 style="font-size:1.8em; margin:0;">GuardLLM - Prompt Security Visualizer</h1>
480
  <p style="color:#94a3b8; font-size:0.95em; margin-top:4px;">
481
+ Interactive t-SNE embedding space &bull;
482
  <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" style="color:#60a5fa;">
483
  Llama Prompt Guard 2</a> &bull;
484
  <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" style="color:#60a5fa;">
 
500
  )
501
 
502
  with gr.Row():
503
+ # ---- Left: t-SNE chart + filters ----
504
  with gr.Column(scale=3):
505
+ with gr.Row():
506
+ select_all_btn = gr.Button("Select All", size="sm", scale=1)
507
+ deselect_all_btn = gr.Button("Deselect All", size="sm", scale=1)
508
+
509
  category_filter = gr.CheckboxGroup(
510
  choices=UNIQUE_CATEGORIES,
511
  value=UNIQUE_CATEGORIES,
512
+ label="Filter by category",
513
  interactive=True,
514
  )
515
  tsne_plot = gr.Plot(
516
  value=build_tsne_figure(),
517
+ label="t-SNE Space",
518
  elem_id="tsne-chart",
519
  )
520
  gr.Markdown(
521
+ "*Click a point to analyze it. "
522
+ "Hover to preview text. Use scroll wheel to zoom.*"
523
  )
524
 
525
+ # ---- Right: Analysis first, then stats (swapped) ----
526
  with gr.Column(scale=2):
527
+ gr.Markdown("### Analysis Result")
528
+ result_html = gr.HTML(value=empty_analysis_html())
529
+ risk_md = gr.Markdown(value="")
530
+ full_prompt = gr.Textbox(label="Full prompt", lines=3, interactive=False, visible=True)
531
+
532
+ gr.Markdown("---")
533
+
534
+ gr.Markdown("### Select a prompt")
535
  prompt_dropdown = gr.Dropdown(
536
  choices=DROPDOWN_CHOICES,
537
+ label="Search dataset",
538
  filterable=True,
539
  interactive=True,
540
  )
541
+
542
+ gr.Markdown("### Or analyze a custom prompt")
543
  manual_input = gr.Textbox(
544
+ label="Custom prompt",
545
+ placeholder="Type or paste a prompt...",
546
  lines=2,
547
  )
548
+ analyze_btn = gr.Button("Analyze", variant="primary")
549
+
550
  gr.Markdown("---")
 
 
 
 
551
 
552
+ gr.HTML(build_stats_html())
553
+
554
+ # ---- Events ----
555
  category_filter.change(
556
  fn=on_filter_change,
557
  inputs=[category_filter],
558
  outputs=[tsne_plot],
559
  )
560
+ select_all_btn.click(
561
+ fn=select_all_categories,
562
+ inputs=[],
563
+ outputs=[category_filter, tsne_plot],
564
+ )
565
+ deselect_all_btn.click(
566
+ fn=deselect_all_categories,
567
+ inputs=[],
568
+ outputs=[category_filter, tsne_plot],
569
+ )
570
  click_index.change(
571
  fn=on_index_input,
572
  inputs=[click_index],
 
593
  """
594
  ---
595
  <div style="text-align:center; color:#64748b; font-size:0.8em;">
596
+ <strong>GuardLLM</strong> - Prompt Security Visualizer<br>
597
+ Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
598
+ Llama Prompt Guard 2 (86M)</a> by Meta |
599
+ Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">
600
  neuralchemy/Prompt-injection-dataset</a>
601
  </div>
602
  """