Spaces:
Running
Running
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -45,20 +45,20 @@ CATEGORY_COLORS = {
|
|
| 45 |
"unknown": "#64748b",
|
| 46 |
}
|
| 47 |
|
| 48 |
-
|
| 49 |
-
"benign": "
|
| 50 |
-
"direct_injection": "
|
| 51 |
"jailbreak": "Jailbreak",
|
| 52 |
-
"system_extraction": "
|
| 53 |
-
"encoding_obfuscation": "
|
| 54 |
-
"persona_replacement": "
|
| 55 |
-
"indirect_injection": "
|
| 56 |
-
"token_smuggling": "Token
|
| 57 |
-
"many_shot": "Many-
|
| 58 |
"crescendo": "Crescendo",
|
| 59 |
-
"context_overflow": "
|
| 60 |
-
"prompt_leaking": "
|
| 61 |
-
"unknown": "
|
| 62 |
}
|
| 63 |
|
| 64 |
# ---------------------------------------------------------------------------
|
|
@@ -132,8 +132,6 @@ def analyze_prompt(text):
|
|
| 132 |
outputs = model(**inputs)
|
| 133 |
probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
|
| 134 |
pred_idx = int(np.argmax(probs))
|
| 135 |
-
pred_label = LABELS[pred_idx]
|
| 136 |
-
confidence = float(probs[pred_idx])
|
| 137 |
prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
|
| 138 |
safety = float(probs[0])
|
| 139 |
return prob_dict, safety
|
|
@@ -160,18 +158,18 @@ def build_tsne_figure(selected_categories=None):
|
|
| 160 |
]
|
| 161 |
severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
|
| 162 |
hover_texts = [
|
| 163 |
-
f"<b>{
|
| 164 |
-
f"
|
| 165 |
f"Index: {idx}<br>"
|
| 166 |
f"<i>{txt}</i>"
|
| 167 |
for idx, txt, sev in zip(indices, texts_preview, severities)
|
| 168 |
]
|
| 169 |
color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
|
| 170 |
-
|
| 171 |
fig.add_trace(go.Scatter(
|
| 172 |
x=x, y=y,
|
| 173 |
mode="markers",
|
| 174 |
-
name=
|
| 175 |
marker=dict(
|
| 176 |
size=5 if len(indices) > 500 else 7,
|
| 177 |
color=color,
|
|
@@ -187,12 +185,12 @@ def build_tsne_figure(selected_categories=None):
|
|
| 187 |
paper_bgcolor="#0f172a",
|
| 188 |
plot_bgcolor="#1e293b",
|
| 189 |
title=dict(
|
| 190 |
-
text="
|
| 191 |
font=dict(size=16, color="#e2e8f0"),
|
| 192 |
x=0.5,
|
| 193 |
),
|
| 194 |
legend=dict(
|
| 195 |
-
title=dict(text="
|
| 196 |
bgcolor="rgba(15,23,42,0.9)",
|
| 197 |
bordercolor="#334155",
|
| 198 |
borderwidth=1,
|
|
@@ -222,9 +220,17 @@ def on_filter_change(categories):
|
|
| 222 |
return build_tsne_figure(sel)
|
| 223 |
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
def on_dropdown_select(choice):
|
| 226 |
if not choice:
|
| 227 |
-
return empty_analysis_html(), "*
|
| 228 |
try:
|
| 229 |
idx = int(choice.split(" | ")[0])
|
| 230 |
text = ALL_TEXTS[idx]
|
|
@@ -237,24 +243,24 @@ def on_dropdown_select(choice):
|
|
| 237 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 238 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 239 |
risk_text += (
|
| 240 |
-
f"\n\n---\n**
|
| 241 |
-
f"-
|
| 242 |
-
f"-
|
| 243 |
-
f"-
|
| 244 |
)
|
| 245 |
return result_html, risk_text, text
|
| 246 |
except Exception as e:
|
| 247 |
logger.error("Error: %s", e)
|
| 248 |
-
return empty_analysis_html(), f"
|
| 249 |
|
| 250 |
|
| 251 |
def on_index_input(idx_str):
|
| 252 |
if not idx_str or not idx_str.strip():
|
| 253 |
-
return empty_analysis_html(), "*
|
| 254 |
try:
|
| 255 |
idx = int(idx_str.strip())
|
| 256 |
if idx < 0 or idx >= len(ALL_TEXTS):
|
| 257 |
-
return empty_analysis_html(), f"
|
| 258 |
text = ALL_TEXTS[idx]
|
| 259 |
category = ALL_CATEGORIES[idx]
|
| 260 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
|
@@ -265,15 +271,15 @@ def on_index_input(idx_str):
|
|
| 265 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 266 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 267 |
risk_text += (
|
| 268 |
-
f"\n\n---\n**
|
| 269 |
-
f"-
|
| 270 |
-
f"-
|
| 271 |
-
f"-
|
| 272 |
)
|
| 273 |
return result_html, risk_text, text
|
| 274 |
except Exception as e:
|
| 275 |
logger.error("Error: %s", e)
|
| 276 |
-
return empty_analysis_html(), f"
|
| 277 |
|
| 278 |
|
| 279 |
def on_manual_analyze(text):
|
|
@@ -293,9 +299,9 @@ def on_manual_analyze(text):
|
|
| 293 |
def empty_analysis_html():
|
| 294 |
return """
|
| 295 |
<div style="text-align:center; padding:30px; color:#94a3b8;">
|
| 296 |
-
<p style="font-size:1em;">
|
| 297 |
-
|
| 298 |
-
|
| 299 |
</div>
|
| 300 |
"""
|
| 301 |
|
|
@@ -333,11 +339,11 @@ def build_result_html(label, confidence, probs, text):
|
|
| 333 |
<div style="text-align:center; margin-bottom:14px;">
|
| 334 |
<div style="font-size:2em;">{emoji}</div>
|
| 335 |
<div style="font-size:1.2em; font-weight:700; color:{color};">{label}</div>
|
| 336 |
-
<div style="color:#94a3b8; font-size:0.85em;">
|
| 337 |
</div>
|
| 338 |
<div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
|
| 339 |
<div style="display:flex; justify-content:space-between; margin-bottom:4px;">
|
| 340 |
-
<span style="color:#e2e8f0; font-weight:600;">
|
| 341 |
<span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
|
| 342 |
</div>
|
| 343 |
<div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
|
|
@@ -349,7 +355,7 @@ def build_result_html(label, confidence, probs, text):
|
|
| 349 |
{bars_html}
|
| 350 |
</div>
|
| 351 |
<div style="background:#1e293b; border-radius:10px; padding:12px;">
|
| 352 |
-
<div style="color:#94a3b8; font-size:0.8em; margin-bottom:3px;">
|
| 353 |
<div style="color:#cbd5e1; font-style:italic; word-break:break-word; font-size:0.85em;">"{preview}"</div>
|
| 354 |
</div>
|
| 355 |
</div>
|
|
@@ -360,18 +366,18 @@ def build_risk_assessment(label, confidence, probs):
|
|
| 360 |
safety_score = probs["Benign"] * 100
|
| 361 |
malicious_score = probs["Malicious"] * 100
|
| 362 |
if label == "Benign" and confidence > 0.85:
|
| 363 |
-
level, desc = "
|
| 364 |
elif label == "Benign":
|
| 365 |
-
level, desc = "
|
| 366 |
elif confidence > 0.85:
|
| 367 |
-
level, desc = "
|
| 368 |
else:
|
| 369 |
-
level, desc = "
|
| 370 |
return (
|
| 371 |
-
f"###
|
| 372 |
-
f"**Details
|
| 373 |
-
f"-
|
| 374 |
-
f"-
|
| 375 |
f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
|
| 376 |
)
|
| 377 |
|
|
@@ -388,27 +394,27 @@ def build_stats_html():
|
|
| 388 |
count = cat_counts[cat]
|
| 389 |
color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
|
| 390 |
pct = count / total * 100
|
| 391 |
-
|
| 392 |
cats_html += (
|
| 393 |
f'<div style="display:flex; justify-content:space-between; padding:2px 0;">'
|
| 394 |
-
f'<span style="color:{color}; font-weight:500; font-size:0.85em;">{
|
| 395 |
f'<span style="color:#94a3b8; font-size:0.85em;">{count} ({pct:.1f}%)</span>'
|
| 396 |
f'</div>'
|
| 397 |
)
|
| 398 |
return f"""
|
| 399 |
<div style="background:#0f172a; border-radius:12px; padding:14px; font-family:system-ui,sans-serif;">
|
| 400 |
-
<div style="color:#e2e8f0; font-weight:700; margin-bottom:8px;">
|
| 401 |
<div style="display:flex; gap:10px; margin-bottom:10px;">
|
| 402 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 403 |
<div style="color:#94a3b8; font-size:0.75em;">Total</div>
|
| 404 |
<div style="color:#e2e8f0; font-weight:700; font-size:1.2em;">{total:,}</div>
|
| 405 |
</div>
|
| 406 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 407 |
-
<div style="color:#22c55e; font-size:0.75em;">
|
| 408 |
<div style="color:#22c55e; font-weight:700; font-size:1.2em;">{n_benign:,}</div>
|
| 409 |
</div>
|
| 410 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 411 |
-
<div style="color:#ef4444; font-size:0.75em;">
|
| 412 |
<div style="color:#ef4444; font-weight:700; font-size:1.2em;">{n_malicious:,}</div>
|
| 413 |
</div>
|
| 414 |
</div>
|
|
@@ -430,7 +436,7 @@ PLOTLY_CLICK_JS = """
|
|
| 430 |
setTimeout(setupClickHandler, 500);
|
| 431 |
return;
|
| 432 |
}
|
| 433 |
-
|
| 434 |
if (data && data.points && data.points.length > 0) {
|
| 435 |
const idx = data.points[0].customdata;
|
| 436 |
if (idx !== undefined && idx !== null) {
|
|
@@ -447,29 +453,13 @@ PLOTLY_CLICK_JS = """
|
|
| 447 |
}
|
| 448 |
}
|
| 449 |
}
|
| 450 |
-
}
|
|
|
|
| 451 |
const observer = new MutationObserver(() => {
|
| 452 |
const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
|
| 453 |
if (newPlot && !newPlot._hasClickHandler) {
|
| 454 |
newPlot._hasClickHandler = true;
|
| 455 |
-
newPlot.on('plotly_click',
|
| 456 |
-
if (data && data.points && data.points.length > 0) {
|
| 457 |
-
const idx = data.points[0].customdata;
|
| 458 |
-
if (idx !== undefined && idx !== null) {
|
| 459 |
-
const inputEl = document.querySelector('#click-index-input textarea');
|
| 460 |
-
if (inputEl) {
|
| 461 |
-
const nativeSetter = Object.getOwnPropertyDescriptor(
|
| 462 |
-
window.HTMLTextAreaElement.prototype, 'value'
|
| 463 |
-
).set;
|
| 464 |
-
nativeSetter.call(inputEl, String(idx));
|
| 465 |
-
inputEl.dispatchEvent(new Event('input', { bubbles: true }));
|
| 466 |
-
setTimeout(() => {
|
| 467 |
-
inputEl.dispatchEvent(new Event('change', { bubbles: true }));
|
| 468 |
-
}, 50);
|
| 469 |
-
}
|
| 470 |
-
}
|
| 471 |
-
}
|
| 472 |
-
});
|
| 473 |
}
|
| 474 |
});
|
| 475 |
observer.observe(document.querySelector('#tsne-chart') || document.body, {
|
|
@@ -488,7 +478,7 @@ TITLE_HTML = """
|
|
| 488 |
<div style="text-align:center; padding:10px 0;">
|
| 489 |
<h1 style="font-size:1.8em; margin:0;">GuardLLM - Prompt Security Visualizer</h1>
|
| 490 |
<p style="color:#94a3b8; font-size:0.95em; margin-top:4px;">
|
| 491 |
-
|
| 492 |
<a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" style="color:#60a5fa;">
|
| 493 |
Llama Prompt Guard 2</a> •
|
| 494 |
<a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" style="color:#60a5fa;">
|
|
@@ -510,50 +500,73 @@ with gr.Blocks(
|
|
| 510 |
)
|
| 511 |
|
| 512 |
with gr.Row():
|
|
|
|
| 513 |
with gr.Column(scale=3):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
category_filter = gr.CheckboxGroup(
|
| 515 |
choices=UNIQUE_CATEGORIES,
|
| 516 |
value=UNIQUE_CATEGORIES,
|
| 517 |
-
label="
|
| 518 |
interactive=True,
|
| 519 |
)
|
| 520 |
tsne_plot = gr.Plot(
|
| 521 |
value=build_tsne_figure(),
|
| 522 |
-
label="
|
| 523 |
elem_id="tsne-chart",
|
| 524 |
)
|
| 525 |
gr.Markdown(
|
| 526 |
-
"*
|
| 527 |
-
"
|
| 528 |
)
|
| 529 |
|
|
|
|
| 530 |
with gr.Column(scale=2):
|
| 531 |
-
gr.
|
| 532 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
prompt_dropdown = gr.Dropdown(
|
| 534 |
choices=DROPDOWN_CHOICES,
|
| 535 |
-
label="
|
| 536 |
filterable=True,
|
| 537 |
interactive=True,
|
| 538 |
)
|
| 539 |
-
|
|
|
|
| 540 |
manual_input = gr.Textbox(
|
| 541 |
-
label="
|
| 542 |
-
placeholder="
|
| 543 |
lines=2,
|
| 544 |
)
|
| 545 |
-
analyze_btn = gr.Button("
|
|
|
|
| 546 |
gr.Markdown("---")
|
| 547 |
-
gr.Markdown("### Resultat de l'analyse")
|
| 548 |
-
result_html = gr.HTML(value=empty_analysis_html())
|
| 549 |
-
risk_md = gr.Markdown(value="")
|
| 550 |
-
full_prompt = gr.Textbox(label="Prompt complet", lines=3, interactive=False, visible=True)
|
| 551 |
|
|
|
|
|
|
|
|
|
|
| 552 |
category_filter.change(
|
| 553 |
fn=on_filter_change,
|
| 554 |
inputs=[category_filter],
|
| 555 |
outputs=[tsne_plot],
|
| 556 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
click_index.change(
|
| 558 |
fn=on_index_input,
|
| 559 |
inputs=[click_index],
|
|
@@ -580,10 +593,10 @@ with gr.Blocks(
|
|
| 580 |
"""
|
| 581 |
---
|
| 582 |
<div style="text-align:center; color:#64748b; font-size:0.8em;">
|
| 583 |
-
<strong>GuardLLM</strong> -
|
| 584 |
-
|
| 585 |
-
Llama Prompt Guard 2 (86M)</a>
|
| 586 |
-
Dataset
|
| 587 |
neuralchemy/Prompt-injection-dataset</a>
|
| 588 |
</div>
|
| 589 |
"""
|
|
|
|
| 45 |
"unknown": "#64748b",
|
| 46 |
}
|
| 47 |
|
| 48 |
+
CATEGORY_LABELS = {
|
| 49 |
+
"benign": "Benign",
|
| 50 |
+
"direct_injection": "Direct Injection",
|
| 51 |
"jailbreak": "Jailbreak",
|
| 52 |
+
"system_extraction": "System Extraction",
|
| 53 |
+
"encoding_obfuscation": "Encoding / Obfuscation",
|
| 54 |
+
"persona_replacement": "Persona Replacement",
|
| 55 |
+
"indirect_injection": "Indirect Injection",
|
| 56 |
+
"token_smuggling": "Token Smuggling",
|
| 57 |
+
"many_shot": "Many-Shot",
|
| 58 |
"crescendo": "Crescendo",
|
| 59 |
+
"context_overflow": "Context Overflow",
|
| 60 |
+
"prompt_leaking": "Prompt Leaking",
|
| 61 |
+
"unknown": "Unknown",
|
| 62 |
}
|
| 63 |
|
| 64 |
# ---------------------------------------------------------------------------
|
|
|
|
| 132 |
outputs = model(**inputs)
|
| 133 |
probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
|
| 134 |
pred_idx = int(np.argmax(probs))
|
|
|
|
|
|
|
| 135 |
prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
|
| 136 |
safety = float(probs[0])
|
| 137 |
return prob_dict, safety
|
|
|
|
| 158 |
]
|
| 159 |
severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
|
| 160 |
hover_texts = [
|
| 161 |
+
f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
|
| 162 |
+
f"Severity: {sev}<br>"
|
| 163 |
f"Index: {idx}<br>"
|
| 164 |
f"<i>{txt}</i>"
|
| 165 |
for idx, txt, sev in zip(indices, texts_preview, severities)
|
| 166 |
]
|
| 167 |
color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
|
| 168 |
+
label = CATEGORY_LABELS.get(cat, cat)
|
| 169 |
fig.add_trace(go.Scatter(
|
| 170 |
x=x, y=y,
|
| 171 |
mode="markers",
|
| 172 |
+
name=label,
|
| 173 |
marker=dict(
|
| 174 |
size=5 if len(indices) > 500 else 7,
|
| 175 |
color=color,
|
|
|
|
| 185 |
paper_bgcolor="#0f172a",
|
| 186 |
plot_bgcolor="#1e293b",
|
| 187 |
title=dict(
|
| 188 |
+
text="t-SNE Embedding Space - Prompt Security Landscape",
|
| 189 |
font=dict(size=16, color="#e2e8f0"),
|
| 190 |
x=0.5,
|
| 191 |
),
|
| 192 |
legend=dict(
|
| 193 |
+
title=dict(text="Category", font=dict(color="#94a3b8")),
|
| 194 |
bgcolor="rgba(15,23,42,0.9)",
|
| 195 |
bordercolor="#334155",
|
| 196 |
borderwidth=1,
|
|
|
|
| 220 |
return build_tsne_figure(sel)
|
| 221 |
|
| 222 |
|
| 223 |
+
def select_all_categories():
|
| 224 |
+
return gr.update(value=UNIQUE_CATEGORIES), build_tsne_figure(UNIQUE_CATEGORIES)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def deselect_all_categories():
|
| 228 |
+
return gr.update(value=[]), build_tsne_figure([])
|
| 229 |
+
|
| 230 |
+
|
| 231 |
def on_dropdown_select(choice):
|
| 232 |
if not choice:
|
| 233 |
+
return empty_analysis_html(), "*Select a prompt.*", ""
|
| 234 |
try:
|
| 235 |
idx = int(choice.split(" | ")[0])
|
| 236 |
text = ALL_TEXTS[idx]
|
|
|
|
| 243 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 244 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 245 |
risk_text += (
|
| 246 |
+
f"\n\n---\n**Dataset metadata:**\n"
|
| 247 |
+
f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
|
| 248 |
+
f"- Severity: **{severity}**\n"
|
| 249 |
+
f"- Ground truth: **{ground_truth}**\n"
|
| 250 |
)
|
| 251 |
return result_html, risk_text, text
|
| 252 |
except Exception as e:
|
| 253 |
logger.error("Error: %s", e)
|
| 254 |
+
return empty_analysis_html(), f"Error: {e}", ""
|
| 255 |
|
| 256 |
|
| 257 |
def on_index_input(idx_str):
|
| 258 |
if not idx_str or not idx_str.strip():
|
| 259 |
+
return empty_analysis_html(), "*Click a point on the chart.*", ""
|
| 260 |
try:
|
| 261 |
idx = int(idx_str.strip())
|
| 262 |
if idx < 0 or idx >= len(ALL_TEXTS):
|
| 263 |
+
return empty_analysis_html(), f"Invalid index: {idx}", ""
|
| 264 |
text = ALL_TEXTS[idx]
|
| 265 |
category = ALL_CATEGORIES[idx]
|
| 266 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
|
|
|
| 271 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 272 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 273 |
risk_text += (
|
| 274 |
+
f"\n\n---\n**Dataset metadata:**\n"
|
| 275 |
+
f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
|
| 276 |
+
f"- Severity: **{severity}**\n"
|
| 277 |
+
f"- Ground truth: **{ground_truth}**\n"
|
| 278 |
)
|
| 279 |
return result_html, risk_text, text
|
| 280 |
except Exception as e:
|
| 281 |
logger.error("Error: %s", e)
|
| 282 |
+
return empty_analysis_html(), f"Error: {e}", ""
|
| 283 |
|
| 284 |
|
| 285 |
def on_manual_analyze(text):
|
|
|
|
| 299 |
def empty_analysis_html():
|
| 300 |
return """
|
| 301 |
<div style="text-align:center; padding:30px; color:#94a3b8;">
|
| 302 |
+
<p style="font-size:1em;">Click a point on the chart,<br>
|
| 303 |
+
select a prompt from the list,<br>
|
| 304 |
+
or enter a custom prompt below.</p>
|
| 305 |
</div>
|
| 306 |
"""
|
| 307 |
|
|
|
|
| 339 |
<div style="text-align:center; margin-bottom:14px;">
|
| 340 |
<div style="font-size:2em;">{emoji}</div>
|
| 341 |
<div style="font-size:1.2em; font-weight:700; color:{color};">{label}</div>
|
| 342 |
+
<div style="color:#94a3b8; font-size:0.85em;">Confidence: {pct:.1f}%</div>
|
| 343 |
</div>
|
| 344 |
<div style="background:#1e293b; border-radius:10px; padding:12px; margin-bottom:10px;">
|
| 345 |
<div style="display:flex; justify-content:space-between; margin-bottom:4px;">
|
| 346 |
+
<span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
|
| 347 |
<span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
|
| 348 |
</div>
|
| 349 |
<div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
|
|
|
|
| 355 |
{bars_html}
|
| 356 |
</div>
|
| 357 |
<div style="background:#1e293b; border-radius:10px; padding:12px;">
|
| 358 |
+
<div style="color:#94a3b8; font-size:0.8em; margin-bottom:3px;">Analyzed prompt:</div>
|
| 359 |
<div style="color:#cbd5e1; font-style:italic; word-break:break-word; font-size:0.85em;">"{preview}"</div>
|
| 360 |
</div>
|
| 361 |
</div>
|
|
|
|
| 366 |
safety_score = probs["Benign"] * 100
|
| 367 |
malicious_score = probs["Malicious"] * 100
|
| 368 |
if label == "Benign" and confidence > 0.85:
|
| 369 |
+
level, desc = "Low", "This prompt appears **safe**. No injection or jailbreak patterns detected."
|
| 370 |
elif label == "Benign":
|
| 371 |
+
level, desc = "Moderate", "Likely benign, but moderate confidence. Potentially ambiguous wording."
|
| 372 |
elif confidence > 0.85:
|
| 373 |
+
level, desc = "Critical", "**Malicious prompt detected** with high confidence. Likely injection or jailbreak attempt."
|
| 374 |
else:
|
| 375 |
+
level, desc = "High", "**Malicious prompt detected.** Possible injection or jailbreak. Review recommended."
|
| 376 |
return (
|
| 377 |
+
f"### Risk Level: {level}\n\n{desc}\n\n"
|
| 378 |
+
f"**Details:**\n"
|
| 379 |
+
f"- Safety score: **{safety_score:.0f}/100**\n"
|
| 380 |
+
f"- Predicted class: **{label}** ({confidence*100:.1f}%)\n"
|
| 381 |
f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
|
| 382 |
)
|
| 383 |
|
|
|
|
| 394 |
count = cat_counts[cat]
|
| 395 |
color = CATEGORY_COLORS.get(cat, CATEGORY_COLORS["unknown"])
|
| 396 |
pct = count / total * 100
|
| 397 |
+
label = CATEGORY_LABELS.get(cat, cat)
|
| 398 |
cats_html += (
|
| 399 |
f'<div style="display:flex; justify-content:space-between; padding:2px 0;">'
|
| 400 |
+
f'<span style="color:{color}; font-weight:500; font-size:0.85em;">{label}</span>'
|
| 401 |
f'<span style="color:#94a3b8; font-size:0.85em;">{count} ({pct:.1f}%)</span>'
|
| 402 |
f'</div>'
|
| 403 |
)
|
| 404 |
return f"""
|
| 405 |
<div style="background:#0f172a; border-radius:12px; padding:14px; font-family:system-ui,sans-serif;">
|
| 406 |
+
<div style="color:#e2e8f0; font-weight:700; margin-bottom:8px;">Dataset Statistics</div>
|
| 407 |
<div style="display:flex; gap:10px; margin-bottom:10px;">
|
| 408 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 409 |
<div style="color:#94a3b8; font-size:0.75em;">Total</div>
|
| 410 |
<div style="color:#e2e8f0; font-weight:700; font-size:1.2em;">{total:,}</div>
|
| 411 |
</div>
|
| 412 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 413 |
+
<div style="color:#22c55e; font-size:0.75em;">Benign</div>
|
| 414 |
<div style="color:#22c55e; font-weight:700; font-size:1.2em;">{n_benign:,}</div>
|
| 415 |
</div>
|
| 416 |
<div style="flex:1; background:#1e293b; border-radius:8px; padding:8px; text-align:center;">
|
| 417 |
+
<div style="color:#ef4444; font-size:0.75em;">Malicious</div>
|
| 418 |
<div style="color:#ef4444; font-weight:700; font-size:1.2em;">{n_malicious:,}</div>
|
| 419 |
</div>
|
| 420 |
</div>
|
|
|
|
| 436 |
setTimeout(setupClickHandler, 500);
|
| 437 |
return;
|
| 438 |
}
|
| 439 |
+
function handleClick(data) {
|
| 440 |
if (data && data.points && data.points.length > 0) {
|
| 441 |
const idx = data.points[0].customdata;
|
| 442 |
if (idx !== undefined && idx !== null) {
|
|
|
|
| 453 |
}
|
| 454 |
}
|
| 455 |
}
|
| 456 |
+
}
|
| 457 |
+
plotEl.on('plotly_click', handleClick);
|
| 458 |
const observer = new MutationObserver(() => {
|
| 459 |
const newPlot = document.querySelector('#tsne-chart .js-plotly-plot');
|
| 460 |
if (newPlot && !newPlot._hasClickHandler) {
|
| 461 |
newPlot._hasClickHandler = true;
|
| 462 |
+
newPlot.on('plotly_click', handleClick);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
}
|
| 464 |
});
|
| 465 |
observer.observe(document.querySelector('#tsne-chart') || document.body, {
|
|
|
|
| 478 |
<div style="text-align:center; padding:10px 0;">
|
| 479 |
<h1 style="font-size:1.8em; margin:0;">GuardLLM - Prompt Security Visualizer</h1>
|
| 480 |
<p style="color:#94a3b8; font-size:0.95em; margin-top:4px;">
|
| 481 |
+
Interactive t-SNE embedding space •
|
| 482 |
<a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank" style="color:#60a5fa;">
|
| 483 |
Llama Prompt Guard 2</a> •
|
| 484 |
<a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank" style="color:#60a5fa;">
|
|
|
|
| 500 |
)
|
| 501 |
|
| 502 |
with gr.Row():
|
| 503 |
+
# ---- Left: t-SNE chart + filters ----
|
| 504 |
with gr.Column(scale=3):
|
| 505 |
+
with gr.Row():
|
| 506 |
+
select_all_btn = gr.Button("Select All", size="sm", scale=1)
|
| 507 |
+
deselect_all_btn = gr.Button("Deselect All", size="sm", scale=1)
|
| 508 |
+
|
| 509 |
category_filter = gr.CheckboxGroup(
|
| 510 |
choices=UNIQUE_CATEGORIES,
|
| 511 |
value=UNIQUE_CATEGORIES,
|
| 512 |
+
label="Filter by category",
|
| 513 |
interactive=True,
|
| 514 |
)
|
| 515 |
tsne_plot = gr.Plot(
|
| 516 |
value=build_tsne_figure(),
|
| 517 |
+
label="t-SNE Space",
|
| 518 |
elem_id="tsne-chart",
|
| 519 |
)
|
| 520 |
gr.Markdown(
|
| 521 |
+
"*Click a point to analyze it. "
|
| 522 |
+
"Hover to preview text. Use scroll wheel to zoom.*"
|
| 523 |
)
|
| 524 |
|
| 525 |
+
# ---- Right: Analysis first, then stats (swapped) ----
|
| 526 |
with gr.Column(scale=2):
|
| 527 |
+
gr.Markdown("### Analysis Result")
|
| 528 |
+
result_html = gr.HTML(value=empty_analysis_html())
|
| 529 |
+
risk_md = gr.Markdown(value="")
|
| 530 |
+
full_prompt = gr.Textbox(label="Full prompt", lines=3, interactive=False, visible=True)
|
| 531 |
+
|
| 532 |
+
gr.Markdown("---")
|
| 533 |
+
|
| 534 |
+
gr.Markdown("### Select a prompt")
|
| 535 |
prompt_dropdown = gr.Dropdown(
|
| 536 |
choices=DROPDOWN_CHOICES,
|
| 537 |
+
label="Search dataset",
|
| 538 |
filterable=True,
|
| 539 |
interactive=True,
|
| 540 |
)
|
| 541 |
+
|
| 542 |
+
gr.Markdown("### Or analyze a custom prompt")
|
| 543 |
manual_input = gr.Textbox(
|
| 544 |
+
label="Custom prompt",
|
| 545 |
+
placeholder="Type or paste a prompt...",
|
| 546 |
lines=2,
|
| 547 |
)
|
| 548 |
+
analyze_btn = gr.Button("Analyze", variant="primary")
|
| 549 |
+
|
| 550 |
gr.Markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
+
gr.HTML(build_stats_html())
|
| 553 |
+
|
| 554 |
+
# ---- Events ----
|
| 555 |
category_filter.change(
|
| 556 |
fn=on_filter_change,
|
| 557 |
inputs=[category_filter],
|
| 558 |
outputs=[tsne_plot],
|
| 559 |
)
|
| 560 |
+
select_all_btn.click(
|
| 561 |
+
fn=select_all_categories,
|
| 562 |
+
inputs=[],
|
| 563 |
+
outputs=[category_filter, tsne_plot],
|
| 564 |
+
)
|
| 565 |
+
deselect_all_btn.click(
|
| 566 |
+
fn=deselect_all_categories,
|
| 567 |
+
inputs=[],
|
| 568 |
+
outputs=[category_filter, tsne_plot],
|
| 569 |
+
)
|
| 570 |
click_index.change(
|
| 571 |
fn=on_index_input,
|
| 572 |
inputs=[click_index],
|
|
|
|
| 593 |
"""
|
| 594 |
---
|
| 595 |
<div style="text-align:center; color:#64748b; font-size:0.8em;">
|
| 596 |
+
<strong>GuardLLM</strong> - Prompt Security Visualizer<br>
|
| 597 |
+
Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
|
| 598 |
+
Llama Prompt Guard 2 (86M)</a> by Meta |
|
| 599 |
+
Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">
|
| 600 |
neuralchemy/Prompt-injection-dataset</a>
|
| 601 |
</div>
|
| 602 |
"""
|