Spaces:
Sleeping
Sleeping
G. Claude Opus 4.7 committed on
Commit ·
dea9e25
1
Parent(s): 1569836
Apply Aleph Beth design system to GuardLLM UI
Browse files
- Parchment surface + ink typography, Instrument Serif display, Geist body/mono.
- Restrained 13-category palette drawn from brand families (safe, threat, gilt, signal, ink) — no neon.
- Replace emoji (verdict, header) with geometric primitives and editorial labels.
- Plotly chart re-skinned: parchment paper, ink axes, soft grid, branded hover labels.
- Bilingual mark (א-ב · أب) in header and footer.
- Cards, buttons, inputs, filters all themed via gr.themes.Base override + custom CSS.
- Pass HF_TOKEN to from_pretrained so the gated model loads when the Space secret is set.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
"""
|
| 2 |
-
GuardLLM
|
| 3 |
-
|
| 4 |
Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset.
|
| 5 |
"""
|
| 6 |
|
| 7 |
import logging
|
|
|
|
| 8 |
import sys
|
| 9 |
import json
|
| 10 |
-
import traceback
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import torch
|
| 14 |
import numpy as np
|
| 15 |
import plotly.graph_objects as go
|
| 16 |
-
import plotly.io as pio
|
| 17 |
from pathlib import Path
|
| 18 |
|
| 19 |
# ---------------------------------------------------------------------------
|
|
@@ -27,22 +26,61 @@ logging.basicConfig(
|
|
| 27 |
logger = logging.getLogger("GuardLLM")
|
| 28 |
|
| 29 |
# ---------------------------------------------------------------------------
|
| 30 |
-
#
|
| 31 |
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
CATEGORY_COLORS = {
|
| 33 |
-
"benign":
|
| 34 |
-
"direct_injection":
|
| 35 |
-
"jailbreak":
|
| 36 |
-
"system_extraction":
|
| 37 |
-
"encoding_obfuscation":
|
| 38 |
-
"persona_replacement":
|
| 39 |
-
"indirect_injection":
|
| 40 |
-
"token_smuggling":
|
| 41 |
-
"many_shot":
|
| 42 |
-
"crescendo":
|
| 43 |
-
"context_overflow":
|
| 44 |
-
"prompt_leaking":
|
| 45 |
-
"unknown":
|
| 46 |
}
|
| 47 |
|
| 48 |
CATEGORY_LABELS = {
|
|
@@ -66,6 +104,7 @@ CATEGORY_LABELS = {
|
|
| 66 |
# ---------------------------------------------------------------------------
|
| 67 |
MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"
|
| 68 |
LABELS = ["Benign", "Malicious"]
|
|
|
|
| 69 |
_classifier = {"tokenizer": None, "model": None, "device": None}
|
| 70 |
|
| 71 |
|
|
@@ -73,8 +112,9 @@ def get_classifier():
|
|
| 73 |
if _classifier["model"] is None:
|
| 74 |
logger.info("Lazy-loading Llama Prompt Guard 2...")
|
| 75 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
mdl.eval()
|
| 79 |
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 80 |
mdl.to(dev)
|
|
@@ -131,14 +171,13 @@ def analyze_prompt(text):
|
|
| 131 |
with torch.no_grad():
|
| 132 |
outputs = model(**inputs)
|
| 133 |
probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
|
| 134 |
-
pred_idx = int(np.argmax(probs))
|
| 135 |
prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
|
| 136 |
safety = float(probs[0])
|
| 137 |
return prob_dict, safety
|
| 138 |
|
| 139 |
|
| 140 |
# ---------------------------------------------------------------------------
|
| 141 |
-
#
|
| 142 |
# ---------------------------------------------------------------------------
|
| 143 |
def build_tsne_figure(selected_categories=None):
|
| 144 |
fig = go.Figure()
|
|
@@ -159,8 +198,8 @@ def build_tsne_figure(selected_categories=None):
|
|
| 159 |
severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
|
| 160 |
hover_texts = [
|
| 161 |
f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
|
| 162 |
-
f"Severity
|
| 163 |
-
f"Index
|
| 164 |
f"<i>{txt}</i>"
|
| 165 |
for idx, txt, sev in zip(indices, texts_preview, severities)
|
| 166 |
]
|
|
@@ -173,41 +212,55 @@ def build_tsne_figure(selected_categories=None):
|
|
| 173 |
marker=dict(
|
| 174 |
size=5 if len(indices) > 500 else 7,
|
| 175 |
color=color,
|
| 176 |
-
opacity=0.
|
| 177 |
-
line=dict(width=0.5, color="rgba(
|
| 178 |
),
|
| 179 |
text=hover_texts,
|
| 180 |
hoverinfo="text",
|
| 181 |
customdata=[str(i) for i in indices],
|
| 182 |
))
|
| 183 |
fig.update_layout(
|
| 184 |
-
template="
|
| 185 |
-
paper_bgcolor="
|
| 186 |
-
plot_bgcolor="
|
|
|
|
| 187 |
title=dict(
|
| 188 |
-
text="
|
| 189 |
-
|
|
|
|
| 190 |
x=0.5,
|
|
|
|
| 191 |
),
|
| 192 |
legend=dict(
|
| 193 |
-
title=dict(text="Category", font=dict(color="
|
| 194 |
-
bgcolor="rgba(
|
| 195 |
-
bordercolor="
|
| 196 |
borderwidth=1,
|
| 197 |
-
font=dict(color="
|
| 198 |
itemsizing="constant",
|
| 199 |
),
|
| 200 |
xaxis=dict(
|
| 201 |
-
title="t-SNE 1",
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
| 203 |
),
|
| 204 |
yaxis=dict(
|
| 205 |
-
title="t-SNE 2",
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
| 207 |
),
|
| 208 |
-
margin=dict(l=
|
| 209 |
-
height=
|
| 210 |
dragmode="pan",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
)
|
| 212 |
return fig
|
| 213 |
|
|
@@ -228,30 +281,34 @@ def deselect_all_categories():
|
|
| 228 |
return gr.update(value=[]), build_tsne_figure([])
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def on_dropdown_select(choice):
|
| 232 |
if not choice:
|
| 233 |
-
return empty_analysis_html(), "*Select a prompt.*", ""
|
| 234 |
try:
|
| 235 |
idx = int(choice.split(" | ")[0])
|
| 236 |
text = ALL_TEXTS[idx]
|
| 237 |
category = ALL_CATEGORIES[idx]
|
| 238 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
| 239 |
ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
|
| 240 |
-
prob_dict,
|
| 241 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 242 |
confidence = prob_dict[pred_label]
|
| 243 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 244 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 245 |
-
risk_text += (
|
| 246 |
-
f"\n\n---\n**Dataset metadata:**\n"
|
| 247 |
-
f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
|
| 248 |
-
f"- Severity: **{severity}**\n"
|
| 249 |
-
f"- Ground truth: **{ground_truth}**\n"
|
| 250 |
-
)
|
| 251 |
return result_html, risk_text, text
|
| 252 |
except Exception as e:
|
| 253 |
logger.error("Error: %s", e)
|
| 254 |
-
return empty_analysis_html(), f"Error
|
| 255 |
|
| 256 |
|
| 257 |
def on_index_input(idx_str):
|
|
@@ -260,32 +317,27 @@ def on_index_input(idx_str):
|
|
| 260 |
try:
|
| 261 |
idx = int(idx_str.strip())
|
| 262 |
if idx < 0 or idx >= len(ALL_TEXTS):
|
| 263 |
-
return empty_analysis_html(), f"Invalid index
|
| 264 |
text = ALL_TEXTS[idx]
|
| 265 |
category = ALL_CATEGORIES[idx]
|
| 266 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
| 267 |
ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
|
| 268 |
-
prob_dict,
|
| 269 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 270 |
confidence = prob_dict[pred_label]
|
| 271 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 272 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 273 |
-
risk_text += (
|
| 274 |
-
f"\n\n---\n**Dataset metadata:**\n"
|
| 275 |
-
f"- Category: **{CATEGORY_LABELS.get(category, category)}**\n"
|
| 276 |
-
f"- Severity: **{severity}**\n"
|
| 277 |
-
f"- Ground truth: **{ground_truth}**\n"
|
| 278 |
-
)
|
| 279 |
return result_html, risk_text, text
|
| 280 |
except Exception as e:
|
| 281 |
logger.error("Error: %s", e)
|
| 282 |
-
return empty_analysis_html(), f"Error
|
| 283 |
|
| 284 |
|
| 285 |
def on_manual_analyze(text):
|
| 286 |
if not text or not text.strip():
|
| 287 |
return empty_analysis_html(), ""
|
| 288 |
-
prob_dict,
|
| 289 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 290 |
confidence = prob_dict[pred_label]
|
| 291 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
|
@@ -294,69 +346,79 @@ def on_manual_analyze(text):
|
|
| 294 |
|
| 295 |
|
| 296 |
# ---------------------------------------------------------------------------
|
| 297 |
-
# UI builders
|
| 298 |
# ---------------------------------------------------------------------------
|
| 299 |
def empty_analysis_html():
|
| 300 |
-
return """
|
| 301 |
-
<div
|
| 302 |
-
<
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
| 305 |
</div>
|
| 306 |
"""
|
| 307 |
|
| 308 |
|
| 309 |
def build_result_html(label, confidence, probs, text):
|
| 310 |
-
|
| 311 |
-
|
|
|
|
| 312 |
pct = confidence * 100
|
| 313 |
safety_score = probs["Benign"] * 100
|
| 314 |
safety_color = (
|
| 315 |
-
"
|
| 316 |
-
else "
|
| 317 |
-
else "
|
| 318 |
)
|
|
|
|
| 319 |
bars_html = ""
|
| 320 |
for lbl in LABELS:
|
| 321 |
p = probs[lbl] * 100
|
| 322 |
-
c = "
|
| 323 |
bars_html += f"""
|
| 324 |
-
<div
|
| 325 |
-
<div
|
| 326 |
-
<span
|
| 327 |
-
<span
|
| 328 |
</div>
|
| 329 |
-
<div
|
| 330 |
-
<div style="
|
| 331 |
</div>
|
| 332 |
</div>
|
| 333 |
"""
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
| 337 |
return f"""
|
| 338 |
-
<div
|
| 339 |
-
<div
|
| 340 |
-
<
|
| 341 |
-
<div
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
<div style="display:flex; justify-content:space-between; margin-bottom:4px;">
|
| 346 |
-
<span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
|
| 347 |
-
<span style="color:{safety_color}; font-weight:700; font-size:1.1em;">{safety_score:.0f}/100</span>
|
| 348 |
-
</div>
|
| 349 |
-
<div style="background:#334155; border-radius:8px; height:12px; overflow:hidden;">
|
| 350 |
-
<div style="background:linear-gradient(90deg, #ef4444, #f59e0b, #22c55e);
|
| 351 |
-
height:100%; width:{safety_score}%; border-radius:8px;"></div>
|
| 352 |
</div>
|
| 353 |
</div>
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
</div>
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
</div>
|
| 361 |
</div>
|
| 362 |
"""
|
|
@@ -366,19 +428,22 @@ def build_risk_assessment(label, confidence, probs):
|
|
| 366 |
safety_score = probs["Benign"] * 100
|
| 367 |
malicious_score = probs["Malicious"] * 100
|
| 368 |
if label == "Benign" and confidence > 0.85:
|
| 369 |
-
level
|
|
|
|
| 370 |
elif label == "Benign":
|
| 371 |
-
level
|
|
|
|
| 372 |
elif confidence > 0.85:
|
| 373 |
-
level
|
|
|
|
| 374 |
else:
|
| 375 |
-
level
|
|
|
|
| 376 |
return (
|
| 377 |
-
f"
|
| 378 |
-
f"**
|
| 379 |
-
f"-
|
| 380 |
-
f"-
|
| 381 |
-
f"- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%\n"
|
| 382 |
)
|
| 383 |
|
| 384 |
|
|
@@ -396,37 +461,37 @@ def build_stats_html():
|
|
| 396 |
pct = count / total * 100
|
| 397 |
label = CATEGORY_LABELS.get(cat, cat)
|
| 398 |
cats_html += (
|
| 399 |
-
f'<div
|
| 400 |
-
f'<span style="
|
| 401 |
-
f'<span
|
|
|
|
| 402 |
f'</div>'
|
| 403 |
)
|
| 404 |
return f"""
|
| 405 |
-
<div
|
| 406 |
-
<div
|
| 407 |
-
<
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
<div
|
|
|
|
| 411 |
</div>
|
| 412 |
-
<div
|
| 413 |
-
<div style="color:
|
| 414 |
-
<div style="color:
|
| 415 |
</div>
|
| 416 |
-
<div
|
| 417 |
-
<div style="color:
|
| 418 |
-
<div style="color:
|
| 419 |
</div>
|
| 420 |
</div>
|
| 421 |
-
<div
|
| 422 |
-
{cats_html}
|
| 423 |
-
</div>
|
| 424 |
</div>
|
| 425 |
"""
|
| 426 |
|
| 427 |
|
| 428 |
# ---------------------------------------------------------------------------
|
| 429 |
-
# JavaScript
|
| 430 |
# ---------------------------------------------------------------------------
|
| 431 |
PLOTLY_CLICK_JS = """
|
| 432 |
() => {
|
|
@@ -440,7 +505,8 @@ PLOTLY_CLICK_JS = """
|
|
| 440 |
if (data && data.points && data.points.length > 0) {
|
| 441 |
const idx = data.points[0].customdata;
|
| 442 |
if (idx !== undefined && idx !== null) {
|
| 443 |
-
const inputEl = document.querySelector('#click-index-input textarea')
|
|
|
|
| 444 |
if (inputEl) {
|
| 445 |
const proto = inputEl.tagName === 'TEXTAREA'
|
| 446 |
? window.HTMLTextAreaElement.prototype
|
|
@@ -473,60 +539,560 @@ PLOTLY_CLICK_JS = """
|
|
| 473 |
|
| 474 |
|
| 475 |
# ---------------------------------------------------------------------------
|
| 476 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
# ---------------------------------------------------------------------------
|
| 478 |
-
|
| 479 |
-
<
|
| 480 |
-
<
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
<
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
</p>
|
| 488 |
-
</
|
| 489 |
"""
|
| 490 |
|
| 491 |
HOW_TO_HTML = """
|
| 492 |
-
<div
|
| 493 |
-
<div
|
| 494 |
-
|
| 495 |
-
<div
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
| 498 |
</div>
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
</div>
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
</div>
|
| 507 |
</div>
|
| 508 |
</div>
|
| 509 |
"""
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
with gr.Blocks(
|
| 512 |
-
title="GuardLLM
|
|
|
|
|
|
|
| 513 |
) as demo:
|
| 514 |
|
| 515 |
-
gr.HTML(
|
| 516 |
gr.HTML(HOW_TO_HTML)
|
| 517 |
|
| 518 |
-
click_index = gr.Textbox(
|
| 519 |
-
value="",
|
| 520 |
-
visible=True,
|
| 521 |
-
elem_id="click-index-input",
|
| 522 |
-
)
|
| 523 |
|
| 524 |
with gr.Row():
|
| 525 |
-
# ---- Left
|
| 526 |
with gr.Column(scale=3):
|
| 527 |
with gr.Row():
|
| 528 |
-
select_all_btn = gr.Button("Select
|
| 529 |
-
deselect_all_btn = gr.Button("Deselect
|
| 530 |
|
| 531 |
category_filter = gr.CheckboxGroup(
|
| 532 |
choices=UNIQUE_CATEGORIES,
|
|
@@ -536,96 +1102,69 @@ with gr.Blocks(
|
|
| 536 |
)
|
| 537 |
tsne_plot = gr.Plot(
|
| 538 |
value=build_tsne_figure(),
|
| 539 |
-
label="t-SNE
|
| 540 |
elem_id="tsne-chart",
|
| 541 |
)
|
| 542 |
gr.Markdown(
|
| 543 |
-
"
|
| 544 |
-
"Hover to preview
|
| 545 |
)
|
| 546 |
|
| 547 |
-
# ---- Right
|
| 548 |
with gr.Column(scale=2):
|
| 549 |
-
gr.
|
|
|
|
| 550 |
result_html = gr.HTML(value=empty_analysis_html())
|
| 551 |
risk_md = gr.Markdown(value="")
|
| 552 |
-
full_prompt = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
|
| 554 |
gr.Markdown("---")
|
| 555 |
|
| 556 |
-
gr.
|
|
|
|
| 557 |
prompt_dropdown = gr.Dropdown(
|
| 558 |
choices=DROPDOWN_CHOICES,
|
| 559 |
-
label="Search dataset",
|
| 560 |
filterable=True,
|
| 561 |
interactive=True,
|
| 562 |
)
|
| 563 |
|
| 564 |
-
gr.
|
|
|
|
| 565 |
manual_input = gr.Textbox(
|
| 566 |
-
label="
|
| 567 |
-
placeholder="Type or paste a
|
| 568 |
lines=2,
|
| 569 |
)
|
| 570 |
-
analyze_btn = gr.Button("
|
| 571 |
|
| 572 |
gr.Markdown("---")
|
| 573 |
|
| 574 |
gr.HTML(build_stats_html())
|
| 575 |
|
| 576 |
# ---- Events ----
|
| 577 |
-
category_filter.change(
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
fn=deselect_all_categories,
|
| 589 |
-
inputs=[],
|
| 590 |
-
outputs=[category_filter, tsne_plot],
|
| 591 |
-
)
|
| 592 |
-
click_index.change(
|
| 593 |
-
fn=on_index_input,
|
| 594 |
-
inputs=[click_index],
|
| 595 |
-
outputs=[result_html, risk_md, full_prompt],
|
| 596 |
-
)
|
| 597 |
-
prompt_dropdown.change(
|
| 598 |
-
fn=on_dropdown_select,
|
| 599 |
-
inputs=[prompt_dropdown],
|
| 600 |
-
outputs=[result_html, risk_md, full_prompt],
|
| 601 |
-
)
|
| 602 |
-
analyze_btn.click(
|
| 603 |
-
fn=on_manual_analyze,
|
| 604 |
-
inputs=[manual_input],
|
| 605 |
-
outputs=[result_html, risk_md],
|
| 606 |
-
)
|
| 607 |
-
manual_input.submit(
|
| 608 |
-
fn=on_manual_analyze,
|
| 609 |
-
inputs=[manual_input],
|
| 610 |
-
outputs=[result_html, risk_md],
|
| 611 |
-
)
|
| 612 |
demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)
|
| 613 |
|
| 614 |
-
gr.
|
| 615 |
-
"""
|
| 616 |
-
---
|
| 617 |
-
<div style="text-align:center; color:#64748b; font-size:0.8em;">
|
| 618 |
-
<strong>GuardLLM</strong> - Prompt Security Visualizer<br>
|
| 619 |
-
Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
|
| 620 |
-
Llama Prompt Guard 2 (86M)</a> by Meta |
|
| 621 |
-
Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">
|
| 622 |
-
neuralchemy/Prompt-injection-dataset</a>
|
| 623 |
-
</div>
|
| 624 |
-
"""
|
| 625 |
-
)
|
| 626 |
|
| 627 |
|
| 628 |
logger.info("Gradio app built. Ready to launch.")
|
| 629 |
|
| 630 |
if __name__ == "__main__":
|
| 631 |
-
demo.launch(
|
|
|
|
| 1 |
"""
|
| 2 |
+
GuardLLM — Prompt Security Visualizer
|
| 3 |
+
Aleph Beth design system applied. Editorial calm, bilingual FR/EN posture.
|
| 4 |
Powered by Llama Prompt Guard 2 (86M) and neuralchemy/Prompt-injection-dataset.
|
| 5 |
"""
|
| 6 |
|
| 7 |
import logging
|
| 8 |
+
import os
|
| 9 |
import sys
|
| 10 |
import json
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import torch
|
| 14 |
import numpy as np
|
| 15 |
import plotly.graph_objects as go
|
|
|
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
# ---------------------------------------------------------------------------
|
|
|
|
| 26 |
logger = logging.getLogger("GuardLLM")
|
| 27 |
|
| 28 |
# ---------------------------------------------------------------------------
|
| 29 |
+
# Aleph Beth — palette tokens (mirrored from colors_and_type.css)
|
| 30 |
# ---------------------------------------------------------------------------
|
| 31 |
+
AB = {
|
| 32 |
+
"ink_950": "#0B1626",
|
| 33 |
+
"ink_900": "#11203A",
|
| 34 |
+
"ink_800": "#1B2F4E",
|
| 35 |
+
"ink_700": "#2A4566",
|
| 36 |
+
"ink_600": "#44607F",
|
| 37 |
+
"ink_500": "#6B829D",
|
| 38 |
+
"ink_400": "#95A6BB",
|
| 39 |
+
"ink_300": "#BCC8D6",
|
| 40 |
+
"ink_200": "#DAE1EA",
|
| 41 |
+
"ink_100": "#ECF0F5",
|
| 42 |
+
"ink_50": "#F6F8FB",
|
| 43 |
+
"parchment_50": "#FCFAF2",
|
| 44 |
+
"parchment_100": "#F8F3E6",
|
| 45 |
+
"parchment_200": "#ECE5D2",
|
| 46 |
+
"parchment_300": "#DDD3B9",
|
| 47 |
+
"parchment_400": "#C2B695",
|
| 48 |
+
"gilt_50": "#FCEEDA",
|
| 49 |
+
"gilt_100": "#F8D9A4",
|
| 50 |
+
"gilt_200": "#F2BD72",
|
| 51 |
+
"gilt_300": "#EAA046",
|
| 52 |
+
"gilt_400": "#DC8B2A",
|
| 53 |
+
"gilt_500": "#A66718",
|
| 54 |
+
"gilt_600": "#7A4912",
|
| 55 |
+
"signal_100": "#C9DDEB",
|
| 56 |
+
"signal_200": "#9BBFD9",
|
| 57 |
+
"signal_300": "#6FA0C2",
|
| 58 |
+
"signal_400": "#4A82AA",
|
| 59 |
+
"signal_500": "#36678C",
|
| 60 |
+
"signal_600": "#244D6B",
|
| 61 |
+
"threat_400": "#D44A3E",
|
| 62 |
+
"threat_300": "#E07065",
|
| 63 |
+
"threat_100": "#F8DAD5",
|
| 64 |
+
"safe_400": "#3F8F6E",
|
| 65 |
+
"safe_300": "#66AB8C",
|
| 66 |
+
"safe_100": "#D4E8DD",
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
# Category colors stay within the brand families — no neon, no inventions.
|
| 70 |
CATEGORY_COLORS = {
|
| 71 |
+
"benign": AB["safe_400"],
|
| 72 |
+
"direct_injection": AB["threat_400"],
|
| 73 |
+
"jailbreak": AB["gilt_400"],
|
| 74 |
+
"system_extraction": AB["gilt_600"],
|
| 75 |
+
"encoding_obfuscation": AB["signal_500"],
|
| 76 |
+
"persona_replacement": AB["gilt_300"],
|
| 77 |
+
"indirect_injection": AB["threat_300"],
|
| 78 |
+
"token_smuggling": AB["signal_600"],
|
| 79 |
+
"many_shot": AB["signal_400"],
|
| 80 |
+
"crescendo": AB["signal_200"],
|
| 81 |
+
"context_overflow": AB["ink_600"],
|
| 82 |
+
"prompt_leaking": AB["gilt_500"],
|
| 83 |
+
"unknown": AB["ink_400"],
|
| 84 |
}
|
| 85 |
|
| 86 |
CATEGORY_LABELS = {
|
|
|
|
| 104 |
# ---------------------------------------------------------------------------
|
| 105 |
MODEL_ID = "meta-llama/Llama-Prompt-Guard-2-86M"
|
| 106 |
LABELS = ["Benign", "Malicious"]
|
| 107 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
|
| 108 |
_classifier = {"tokenizer": None, "model": None, "device": None}
|
| 109 |
|
| 110 |
|
|
|
|
| 112 |
if _classifier["model"] is None:
|
| 113 |
logger.info("Lazy-loading Llama Prompt Guard 2...")
|
| 114 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 115 |
+
kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
|
| 116 |
+
tok = AutoTokenizer.from_pretrained(MODEL_ID, **kwargs)
|
| 117 |
+
mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, **kwargs)
|
| 118 |
mdl.eval()
|
| 119 |
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 120 |
mdl.to(dev)
|
|
|
|
| 171 |
with torch.no_grad():
|
| 172 |
outputs = model(**inputs)
|
| 173 |
probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
|
|
|
|
| 174 |
prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
|
| 175 |
safety = float(probs[0])
|
| 176 |
return prob_dict, safety
|
| 177 |
|
| 178 |
|
| 179 |
# ---------------------------------------------------------------------------
|
| 180 |
+
# Plotly figure — parchment surface, ink axes, restrained palette
|
| 181 |
# ---------------------------------------------------------------------------
|
| 182 |
def build_tsne_figure(selected_categories=None):
|
| 183 |
fig = go.Figure()
|
|
|
|
| 198 |
severities = [ALL_SEVERITIES[i] or "benign" for i in indices]
|
| 199 |
hover_texts = [
|
| 200 |
f"<b>{CATEGORY_LABELS.get(cat, cat)}</b><br>"
|
| 201 |
+
f"Severity — {sev}<br>"
|
| 202 |
+
f"Index — {idx}<br>"
|
| 203 |
f"<i>{txt}</i>"
|
| 204 |
for idx, txt, sev in zip(indices, texts_preview, severities)
|
| 205 |
]
|
|
|
|
| 212 |
marker=dict(
|
| 213 |
size=5 if len(indices) > 500 else 7,
|
| 214 |
color=color,
|
| 215 |
+
opacity=0.78,
|
| 216 |
+
line=dict(width=0.5, color="rgba(17,32,58,0.20)"),
|
| 217 |
),
|
| 218 |
text=hover_texts,
|
| 219 |
hoverinfo="text",
|
| 220 |
customdata=[str(i) for i in indices],
|
| 221 |
))
|
| 222 |
fig.update_layout(
|
| 223 |
+
template="plotly_white",
|
| 224 |
+
paper_bgcolor=AB["parchment_100"],
|
| 225 |
+
plot_bgcolor=AB["parchment_50"],
|
| 226 |
+
font=dict(family="Geist, Inter, system-ui, sans-serif", color=AB["ink_700"]),
|
| 227 |
title=dict(
|
| 228 |
+
text="<span style='font-family: Instrument Serif, serif; font-size:18px;'>"
|
| 229 |
+
"t-SNE — Prompt Security Landscape</span>",
|
| 230 |
+
font=dict(color=AB["ink_900"]),
|
| 231 |
x=0.5,
|
| 232 |
+
xanchor="center",
|
| 233 |
),
|
| 234 |
legend=dict(
|
| 235 |
+
title=dict(text="Category", font=dict(color=AB["ink_700"], size=11)),
|
| 236 |
+
bgcolor="rgba(252,250,242,0.88)",
|
| 237 |
+
bordercolor="rgba(17,32,58,0.12)",
|
| 238 |
borderwidth=1,
|
| 239 |
+
font=dict(color=AB["ink_800"], size=10),
|
| 240 |
itemsizing="constant",
|
| 241 |
),
|
| 242 |
xaxis=dict(
|
| 243 |
+
title=dict(text="t-SNE 1", font=dict(color=AB["ink_500"], size=11)),
|
| 244 |
+
showgrid=True,
|
| 245 |
+
gridcolor="rgba(17,32,58,0.06)",
|
| 246 |
+
zeroline=False,
|
| 247 |
+
color=AB["ink_500"],
|
| 248 |
),
|
| 249 |
yaxis=dict(
|
| 250 |
+
title=dict(text="t-SNE 2", font=dict(color=AB["ink_500"], size=11)),
|
| 251 |
+
showgrid=True,
|
| 252 |
+
gridcolor="rgba(17,32,58,0.06)",
|
| 253 |
+
zeroline=False,
|
| 254 |
+
color=AB["ink_500"],
|
| 255 |
),
|
| 256 |
+
margin=dict(l=44, r=44, t=56, b=44),
|
| 257 |
+
height=620,
|
| 258 |
dragmode="pan",
|
| 259 |
+
hoverlabel=dict(
|
| 260 |
+
bgcolor=AB["parchment_50"],
|
| 261 |
+
bordercolor="rgba(17,32,58,0.12)",
|
| 262 |
+
font=dict(family="Geist, sans-serif", color=AB["ink_900"], size=12),
|
| 263 |
+
),
|
| 264 |
)
|
| 265 |
return fig
|
| 266 |
|
|
|
|
| 281 |
return gr.update(value=[]), build_tsne_figure([])
|
| 282 |
|
| 283 |
|
| 284 |
+
def _dataset_meta_block(category, severity, ground_truth):
|
| 285 |
+
return (
|
| 286 |
+
f"\n\n<span class='ab-eyebrow'>Dataset metadata</span>\n"
|
| 287 |
+
f"- Category — **{CATEGORY_LABELS.get(category, category)}**\n"
|
| 288 |
+
f"- Severity — **{severity}**\n"
|
| 289 |
+
f"- Ground truth — **{ground_truth}**\n"
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
def on_dropdown_select(choice):
|
| 294 |
if not choice:
|
| 295 |
+
return empty_analysis_html(), "*Select a prompt to begin.*", ""
|
| 296 |
try:
|
| 297 |
idx = int(choice.split(" | ")[0])
|
| 298 |
text = ALL_TEXTS[idx]
|
| 299 |
category = ALL_CATEGORIES[idx]
|
| 300 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
| 301 |
ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
|
| 302 |
+
prob_dict, _ = analyze_prompt(text)
|
| 303 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 304 |
confidence = prob_dict[pred_label]
|
| 305 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 306 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 307 |
+
risk_text += _dataset_meta_block(category, severity, ground_truth)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
return result_html, risk_text, text
|
| 309 |
except Exception as e:
|
| 310 |
logger.error("Error: %s", e)
|
| 311 |
+
return empty_analysis_html(), f"Error — {e}", ""
|
| 312 |
|
| 313 |
|
| 314 |
def on_index_input(idx_str):
|
|
|
|
| 317 |
try:
|
| 318 |
idx = int(idx_str.strip())
|
| 319 |
if idx < 0 or idx >= len(ALL_TEXTS):
|
| 320 |
+
return empty_analysis_html(), f"Invalid index — {idx}", ""
|
| 321 |
text = ALL_TEXTS[idx]
|
| 322 |
category = ALL_CATEGORIES[idx]
|
| 323 |
severity = ALL_SEVERITIES[idx] or "N/A"
|
| 324 |
ground_truth = "Malicious" if ALL_LABELS_DS[idx] == 1 else "Benign"
|
| 325 |
+
prob_dict, _ = analyze_prompt(text)
|
| 326 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 327 |
confidence = prob_dict[pred_label]
|
| 328 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
| 329 |
risk_text = build_risk_assessment(pred_label, confidence, prob_dict)
|
| 330 |
+
risk_text += _dataset_meta_block(category, severity, ground_truth)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
return result_html, risk_text, text
|
| 332 |
except Exception as e:
|
| 333 |
logger.error("Error: %s", e)
|
| 334 |
+
return empty_analysis_html(), f"Error — {e}", ""
|
| 335 |
|
| 336 |
|
| 337 |
def on_manual_analyze(text):
|
| 338 |
if not text or not text.strip():
|
| 339 |
return empty_analysis_html(), ""
|
| 340 |
+
prob_dict, _ = analyze_prompt(text)
|
| 341 |
pred_label = max(prob_dict, key=prob_dict.get)
|
| 342 |
confidence = prob_dict[pred_label]
|
| 343 |
result_html = build_result_html(pred_label, confidence, prob_dict, text)
|
|
|
|
| 346 |
|
| 347 |
|
| 348 |
# ---------------------------------------------------------------------------
|
| 349 |
+
# UI builders — editorial, parchment surface, ink type, no emoji
|
| 350 |
# ---------------------------------------------------------------------------
|
| 351 |
def empty_analysis_html():
|
| 352 |
+
return f"""
|
| 353 |
+
<div class="ab-card ab-card--quiet">
|
| 354 |
+
<div class="ab-eyebrow">Idle</div>
|
| 355 |
+
<p class="ab-prose">
|
| 356 |
+
Click a point on the chart, pick a prompt from the list,
|
| 357 |
+
or paste your own below. The classifier runs on demand.
|
| 358 |
+
</p>
|
| 359 |
</div>
|
| 360 |
"""
|
| 361 |
|
| 362 |
|
| 363 |
def build_result_html(label, confidence, probs, text):
    """Render the verdict card for one analyzed prompt as an HTML fragment.

    Relies on the module-level ``AB`` colour table and ``LABELS`` list.

    Args:
        label: Predicted class name. ``"Benign"`` selects the safe accent
            colour; any other value selects the threat colour.
        confidence: Score of the predicted class; displayed as
            ``confidence * 100`` percent.
        probs: Mapping with one entry per name in ``LABELS`` (must include
            ``"Benign"``); each value is displayed as ``value * 100`` percent.
        text: The analyzed prompt. Only the first 180 characters are shown,
            HTML-escaped, with an ellipsis appended when the prompt is longer.

    Returns:
        The HTML string for the result card (verdict, safety score,
        per-class probability bars and a preview of the prompt).
    """
    # Local import keeps this block self-contained; the prompt is untrusted
    # user input, so it must be fully escaped before being placed in markup.
    import html

    is_safe = label == "Benign"
    accent = AB["safe_400"] if is_safe else AB["threat_400"]
    marker = "●"  # geometric primitive instead of emoji
    pct = confidence * 100

    # Safety score is simply P(Benign) on a 0-100 scale, coloured by band.
    safety_score = probs["Benign"] * 100
    if safety_score >= 70:
        safety_color = AB["safe_400"]
    elif safety_score >= 40:
        safety_color = AB["gilt_400"]
    else:
        safety_color = AB["threat_400"]

    # One horizontal bar per class; collected in a list and joined once.
    bar_parts = []
    for lbl in LABELS:
        p = probs[lbl] * 100
        c = AB["safe_400"] if lbl == "Benign" else AB["threat_400"]
        bar_parts.append(f"""
        <div class="ab-bar">
            <div class="ab-bar__row">
                <span class="ab-bar__label">{lbl}</span>
                <span class="ab-bar__value">{p:.1f}%</span>
            </div>
            <div class="ab-bar__track">
                <div class="ab-bar__fill" style="width:{p}%; background:{c};"></div>
            </div>
        </div>
        """)
    bars_html = "".join(bar_parts)

    # Truncate first, then escape, so an entity is never cut in half.
    # html.escape covers &, <, > and both quote characters.
    preview = html.escape(text[:180])
    if len(text) > 180:
        preview += "…"

    return f"""
    <div class="ab-card">
        <div class="ab-result__head">
            <span class="ab-result__marker" style="color:{accent};">{marker}</span>
            <div>
                <div class="ab-eyebrow">Verdict</div>
                <div class="ab-result__label" style="color:{accent};">{label}</div>
                <div class="ab-caption">Confidence — {pct:.1f}%</div>
            </div>
        </div>

        <div class="ab-divider"></div>

        <div class="ab-eyebrow">Safety score</div>
        <div class="ab-score">
            <div class="ab-score__value" style="color:{safety_color};">{safety_score:.0f}<span>/100</span></div>
            <div class="ab-score__track">
                <div class="ab-score__fill" style="width:{safety_score}%;"></div>
            </div>
        </div>

        <div class="ab-eyebrow" style="margin-top:18px;">Class probabilities</div>
        <div class="ab-bars">{bars_html}</div>

        <div class="ab-quote">
            <div class="ab-eyebrow">Analyzed prompt</div>
            <blockquote>“{preview}”</blockquote>
        </div>
    </div>
    """
|
|
|
|
| 428 |
safety_score = probs["Benign"] * 100
|
| 429 |
malicious_score = probs["Malicious"] * 100
|
| 430 |
if label == "Benign" and confidence > 0.85:
|
| 431 |
+
level = "Low"
|
| 432 |
+
desc = "The request appears **safe**. No injection or jailbreak patterns were detected."
|
| 433 |
elif label == "Benign":
|
| 434 |
+
level = "Moderate"
|
| 435 |
+
desc = "Likely benign, with moderate confidence. The wording may be ambiguous."
|
| 436 |
elif confidence > 0.85:
|
| 437 |
+
level = "Critical"
|
| 438 |
+
desc = "**Malicious request detected** with high confidence. Likely injection or jailbreak."
|
| 439 |
else:
|
| 440 |
+
level = "High"
|
| 441 |
+
desc = "**Malicious request detected.** Possible injection or jailbreak — review recommended."
|
| 442 |
return (
|
| 443 |
+
f"<span class='ab-eyebrow'>Risk level — {level}</span>\n\n{desc}\n\n"
|
| 444 |
+
f"- Safety score — **{safety_score:.0f}/100**\n"
|
| 445 |
+
f"- Predicted class — **{label}** ({confidence*100:.1f}%)\n"
|
| 446 |
+
f"- P(Benign) — {probs['Benign']*100:.1f}% · P(Malicious) — {malicious_score:.1f}%\n"
|
|
|
|
| 447 |
)
|
| 448 |
|
| 449 |
|
|
|
|
| 461 |
pct = count / total * 100
|
| 462 |
label = CATEGORY_LABELS.get(cat, cat)
|
| 463 |
cats_html += (
|
| 464 |
+
f'<div class="ab-stats__row">'
|
| 465 |
+
f'<span class="ab-stats__dot" style="background:{color};"></span>'
|
| 466 |
+
f'<span class="ab-stats__name">{label}</span>'
|
| 467 |
+
f'<span class="ab-stats__count">{count:,} <em>({pct:.1f}%)</em></span>'
|
| 468 |
f'</div>'
|
| 469 |
)
|
| 470 |
return f"""
|
| 471 |
+
<div class="ab-card">
|
| 472 |
+
<div class="ab-eyebrow">Dataset</div>
|
| 473 |
+
<h3 class="ab-h3">Composition</h3>
|
| 474 |
+
<div class="ab-kpi-row">
|
| 475 |
+
<div class="ab-kpi">
|
| 476 |
+
<div class="ab-kpi__label">Total</div>
|
| 477 |
+
<div class="ab-kpi__value">{total:,}</div>
|
| 478 |
</div>
|
| 479 |
+
<div class="ab-kpi">
|
| 480 |
+
<div class="ab-kpi__label" style="color:{AB['safe_400']};">Benign</div>
|
| 481 |
+
<div class="ab-kpi__value" style="color:{AB['safe_400']};">{n_benign:,}</div>
|
| 482 |
</div>
|
| 483 |
+
<div class="ab-kpi">
|
| 484 |
+
<div class="ab-kpi__label" style="color:{AB['threat_400']};">Malicious</div>
|
| 485 |
+
<div class="ab-kpi__value" style="color:{AB['threat_400']};">{n_malicious:,}</div>
|
| 486 |
</div>
|
| 487 |
</div>
|
| 488 |
+
<div class="ab-stats">{cats_html}</div>
|
|
|
|
|
|
|
| 489 |
</div>
|
| 490 |
"""
|
| 491 |
|
| 492 |
|
| 493 |
# ---------------------------------------------------------------------------
|
| 494 |
+
# JavaScript bridge: Plotly clicks → Gradio hidden input
|
| 495 |
# ---------------------------------------------------------------------------
|
| 496 |
PLOTLY_CLICK_JS = """
|
| 497 |
() => {
|
|
|
|
| 505 |
if (data && data.points && data.points.length > 0) {
|
| 506 |
const idx = data.points[0].customdata;
|
| 507 |
if (idx !== undefined && idx !== null) {
|
| 508 |
+
const inputEl = document.querySelector('#click-index-input textarea')
|
| 509 |
+
|| document.querySelector('#click-index-input input');
|
| 510 |
if (inputEl) {
|
| 511 |
const proto = inputEl.tagName === 'TEXTAREA'
|
| 512 |
? window.HTMLTextAreaElement.prototype
|
|
|
|
| 539 |
|
| 540 |
|
| 541 |
# ---------------------------------------------------------------------------
|
| 542 |
+
# Aleph Beth — global CSS
|
| 543 |
+
# ---------------------------------------------------------------------------
|
| 544 |
+
ALEPH_BETH_CSS = """
|
| 545 |
+
@import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500;600&family=Frank+Ruhl+Libre:wght@400;500&family=Amiri:wght@400;700&display=swap');
|
| 546 |
+
|
| 547 |
+
:root, .gradio-container {
|
| 548 |
+
--ab-ink-950:#0B1626; --ab-ink-900:#11203A; --ab-ink-800:#1B2F4E;
|
| 549 |
+
--ab-ink-700:#2A4566; --ab-ink-600:#44607F; --ab-ink-500:#6B829D;
|
| 550 |
+
--ab-ink-400:#95A6BB; --ab-ink-300:#BCC8D6; --ab-ink-200:#DAE1EA;
|
| 551 |
+
--ab-ink-100:#ECF0F5; --ab-ink-50:#F6F8FB;
|
| 552 |
+
--ab-parchment-50:#FCFAF2; --ab-parchment-100:#F8F3E6;
|
| 553 |
+
--ab-parchment-200:#ECE5D2; --ab-parchment-300:#DDD3B9;
|
| 554 |
+
--ab-gilt-300:#EAA046; --ab-gilt-400:#DC8B2A; --ab-gilt-500:#A66718; --ab-gilt-600:#7A4912;
|
| 555 |
+
--ab-signal-300:#6FA0C2; --ab-signal-400:#4A82AA; --ab-signal-500:#36678C;
|
| 556 |
+
--ab-threat-400:#D44A3E; --ab-safe-400:#3F8F6E;
|
| 557 |
+
--ab-border: rgba(17,32,58,0.12);
|
| 558 |
+
--ab-border-subtle: rgba(17,32,58,0.06);
|
| 559 |
+
--ab-shadow-sm: 0 2px 6px rgba(17,32,58,0.07), 0 1px 2px rgba(17,32,58,0.04);
|
| 560 |
+
--ab-shadow-md: 0 8px 20px rgba(17,32,58,0.08), 0 2px 4px rgba(17,32,58,0.05);
|
| 561 |
+
--ab-ease: cubic-bezier(0.16, 1, 0.3, 1);
|
| 562 |
+
--font-display: 'Instrument Serif', 'Cormorant Garamond', serif;
|
| 563 |
+
--font-body: 'Geist', 'Inter', system-ui, sans-serif;
|
| 564 |
+
--font-mono: 'Geist Mono', 'JetBrains Mono', ui-monospace, monospace;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
/* ---------- Base canvas ---------- */
|
| 568 |
+
.gradio-container, body, html {
|
| 569 |
+
background: var(--ab-parchment-100) !important;
|
| 570 |
+
color: var(--ab-ink-900) !important;
|
| 571 |
+
font-family: var(--font-body) !important;
|
| 572 |
+
font-feature-settings: 'ss01', 'cv01';
|
| 573 |
+
}
|
| 574 |
+
.gradio-container { max-width: 1440px !important; margin: 0 auto !important; padding: 24px 32px !important; }
|
| 575 |
+
|
| 576 |
+
/* Remove Gradio gradient backgrounds */
|
| 577 |
+
.gradio-container *::before, .gradio-container *::after { background-image: none !important; }
|
| 578 |
+
|
| 579 |
+
/* ---------- Header / brand ---------- */
|
| 580 |
+
.ab-header {
|
| 581 |
+
padding: 18px 4px 22px;
|
| 582 |
+
border-bottom: 1px solid var(--ab-border);
|
| 583 |
+
margin-bottom: 24px;
|
| 584 |
+
display: flex; align-items: baseline; justify-content: space-between; gap: 24px;
|
| 585 |
+
flex-wrap: wrap;
|
| 586 |
+
}
|
| 587 |
+
.ab-header__brand {
|
| 588 |
+
display: flex; align-items: baseline; gap: 14px;
|
| 589 |
+
}
|
| 590 |
+
.ab-header__mark {
|
| 591 |
+
font-family: var(--font-display);
|
| 592 |
+
font-size: 32px; line-height: 1;
|
| 593 |
+
color: var(--ab-gilt-500);
|
| 594 |
+
letter-spacing: -0.01em;
|
| 595 |
+
}
|
| 596 |
+
.ab-header__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
|
| 597 |
+
.ab-header__mark .ar { font-family: 'Amiri', serif; }
|
| 598 |
+
.ab-header__title {
|
| 599 |
+
font-family: var(--font-display);
|
| 600 |
+
font-size: 38px; line-height: 1.05;
|
| 601 |
+
color: var(--ab-ink-900);
|
| 602 |
+
letter-spacing: -0.01em;
|
| 603 |
+
margin: 0;
|
| 604 |
+
}
|
| 605 |
+
.ab-header__title em { font-style: italic; color: var(--ab-gilt-600); }
|
| 606 |
+
.ab-header__sub {
|
| 607 |
+
font-family: var(--font-body);
|
| 608 |
+
color: var(--ab-ink-700);
|
| 609 |
+
font-size: 14px; line-height: 1.5;
|
| 610 |
+
max-width: 460px;
|
| 611 |
+
}
|
| 612 |
+
.ab-header__sub a { color: var(--ab-signal-500); text-decoration: underline; text-underline-offset: 3px; }
|
| 613 |
+
|
| 614 |
+
/* ---------- Eyebrow / labels / type ---------- */
|
| 615 |
+
.ab-eyebrow {
|
| 616 |
+
display: inline-block;
|
| 617 |
+
font-family: var(--font-body);
|
| 618 |
+
font-size: 11px; font-weight: 500;
|
| 619 |
+
text-transform: uppercase;
|
| 620 |
+
letter-spacing: 0.16em;
|
| 621 |
+
color: var(--ab-gilt-600);
|
| 622 |
+
margin-bottom: 6px;
|
| 623 |
+
}
|
| 624 |
+
.ab-h3 {
|
| 625 |
+
font-family: var(--font-display);
|
| 626 |
+
font-size: 22px; line-height: 1.2;
|
| 627 |
+
color: var(--ab-ink-900);
|
| 628 |
+
margin: 0 0 12px 0;
|
| 629 |
+
letter-spacing: -0.005em;
|
| 630 |
+
}
|
| 631 |
+
.ab-prose {
|
| 632 |
+
font-family: var(--font-body);
|
| 633 |
+
font-size: 14px; line-height: 1.55;
|
| 634 |
+
color: var(--ab-ink-700);
|
| 635 |
+
}
|
| 636 |
+
.ab-caption {
|
| 637 |
+
font-family: var(--font-body);
|
| 638 |
+
font-size: 12px;
|
| 639 |
+
color: var(--ab-ink-500);
|
| 640 |
+
letter-spacing: 0.02em;
|
| 641 |
+
}
|
| 642 |
+
.ab-divider {
|
| 643 |
+
height: 1px; background: var(--ab-border);
|
| 644 |
+
margin: 16px 0;
|
| 645 |
+
}
|
| 646 |
+
|
| 647 |
+
/* ---------- Cards ---------- */
|
| 648 |
+
.ab-card {
|
| 649 |
+
background: var(--ab-parchment-50);
|
| 650 |
+
border: 1px solid var(--ab-border);
|
| 651 |
+
border-radius: 12px;
|
| 652 |
+
padding: 20px 22px;
|
| 653 |
+
box-shadow: var(--ab-shadow-sm);
|
| 654 |
+
font-family: var(--font-body);
|
| 655 |
+
}
|
| 656 |
+
.ab-card--quiet {
|
| 657 |
+
background: transparent;
|
| 658 |
+
border-style: dashed;
|
| 659 |
+
box-shadow: none;
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
/* ---------- How-to (3-up) ---------- */
|
| 663 |
+
.ab-howto {
|
| 664 |
+
display: grid;
|
| 665 |
+
grid-template-columns: repeat(3, 1fr);
|
| 666 |
+
gap: 12px;
|
| 667 |
+
margin: 8px 0 20px;
|
| 668 |
+
}
|
| 669 |
+
@media (max-width: 900px) { .ab-howto { grid-template-columns: 1fr; } }
|
| 670 |
+
.ab-howto__step {
|
| 671 |
+
background: var(--ab-parchment-50);
|
| 672 |
+
border: 1px solid var(--ab-border);
|
| 673 |
+
border-radius: 12px;
|
| 674 |
+
padding: 16px 18px;
|
| 675 |
+
transition: transform var(--ab-ease) 220ms, box-shadow var(--ab-ease) 220ms;
|
| 676 |
+
}
|
| 677 |
+
.ab-howto__step:hover { transform: translateY(-1px); box-shadow: var(--ab-shadow-md); }
|
| 678 |
+
.ab-howto__num {
|
| 679 |
+
font-family: var(--font-display);
|
| 680 |
+
font-size: 28px;
|
| 681 |
+
color: var(--ab-gilt-500);
|
| 682 |
+
line-height: 1;
|
| 683 |
+
}
|
| 684 |
+
.ab-howto__title {
|
| 685 |
+
font-family: var(--font-body);
|
| 686 |
+
font-size: 14px; font-weight: 600;
|
| 687 |
+
color: var(--ab-ink-900);
|
| 688 |
+
margin: 8px 0 6px;
|
| 689 |
+
}
|
| 690 |
+
.ab-howto__body {
|
| 691 |
+
font-family: var(--font-body);
|
| 692 |
+
font-size: 13px; line-height: 1.5;
|
| 693 |
+
color: var(--ab-ink-700);
|
| 694 |
+
}
|
| 695 |
+
|
| 696 |
+
/* ---------- Result card ---------- */
|
| 697 |
+
.ab-result__head {
|
| 698 |
+
display: flex; align-items: center; gap: 14px;
|
| 699 |
+
}
|
| 700 |
+
.ab-result__marker {
|
| 701 |
+
font-size: 28px; line-height: 1;
|
| 702 |
+
}
|
| 703 |
+
.ab-result__label {
|
| 704 |
+
font-family: var(--font-display);
|
| 705 |
+
font-size: 28px;
|
| 706 |
+
line-height: 1.1;
|
| 707 |
+
letter-spacing: -0.01em;
|
| 708 |
+
margin-top: 2px;
|
| 709 |
+
}
|
| 710 |
+
.ab-score {
|
| 711 |
+
display: flex; align-items: center; gap: 14px;
|
| 712 |
+
margin: 6px 0 4px;
|
| 713 |
+
}
|
| 714 |
+
.ab-score__value {
|
| 715 |
+
font-family: var(--font-display);
|
| 716 |
+
font-size: 44px; line-height: 1;
|
| 717 |
+
letter-spacing: -0.02em;
|
| 718 |
+
}
|
| 719 |
+
.ab-score__value span { font-size: 16px; color: var(--ab-ink-500); margin-left: 2px; }
|
| 720 |
+
.ab-score__track {
|
| 721 |
+
flex: 1; height: 8px;
|
| 722 |
+
background: var(--ab-parchment-200);
|
| 723 |
+
border-radius: 999px; overflow: hidden;
|
| 724 |
+
}
|
| 725 |
+
.ab-score__fill {
|
| 726 |
+
height: 100%;
|
| 727 |
+
background: linear-gradient(90deg, var(--ab-threat-400), var(--ab-gilt-400) 50%, var(--ab-safe-400));
|
| 728 |
+
border-radius: 999px;
|
| 729 |
+
transition: width 380ms var(--ab-ease);
|
| 730 |
+
}
|
| 731 |
+
.ab-bars { display: flex; flex-direction: column; gap: 10px; margin-top: 4px; }
|
| 732 |
+
.ab-bar__row {
|
| 733 |
+
display: flex; justify-content: space-between;
|
| 734 |
+
font-size: 13px; margin-bottom: 4px;
|
| 735 |
+
}
|
| 736 |
+
.ab-bar__label { color: var(--ab-ink-800); font-weight: 500; }
|
| 737 |
+
.ab-bar__value { color: var(--ab-ink-700); font-family: var(--font-mono); font-size: 12px; }
|
| 738 |
+
.ab-bar__track {
|
| 739 |
+
height: 8px; background: var(--ab-parchment-200);
|
| 740 |
+
border-radius: 999px; overflow: hidden;
|
| 741 |
+
}
|
| 742 |
+
.ab-bar__fill { height: 100%; border-radius: 999px; transition: width 380ms var(--ab-ease); }
|
| 743 |
+
.ab-quote {
|
| 744 |
+
margin-top: 18px;
|
| 745 |
+
padding: 14px 16px;
|
| 746 |
+
background: var(--ab-parchment-100);
|
| 747 |
+
border-left: 2px solid var(--ab-gilt-400);
|
| 748 |
+
border-radius: 4px;
|
| 749 |
+
}
|
| 750 |
+
.ab-quote blockquote {
|
| 751 |
+
font-family: var(--font-display);
|
| 752 |
+
font-style: italic;
|
| 753 |
+
font-size: 16px;
|
| 754 |
+
color: var(--ab-ink-800);
|
| 755 |
+
margin: 6px 0 0; padding: 0;
|
| 756 |
+
line-height: 1.45;
|
| 757 |
+
}
|
| 758 |
+
|
| 759 |
+
/* ---------- Stats ---------- */
|
| 760 |
+
.ab-kpi-row {
|
| 761 |
+
display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px;
|
| 762 |
+
margin: 4px 0 16px;
|
| 763 |
+
}
|
| 764 |
+
.ab-kpi {
|
| 765 |
+
background: var(--ab-parchment-100);
|
| 766 |
+
border: 1px solid var(--ab-border-subtle);
|
| 767 |
+
border-radius: 8px;
|
| 768 |
+
padding: 10px 12px;
|
| 769 |
+
text-align: center;
|
| 770 |
+
}
|
| 771 |
+
.ab-kpi__label {
|
| 772 |
+
font-family: var(--font-body);
|
| 773 |
+
font-size: 11px; text-transform: uppercase; letter-spacing: 0.12em;
|
| 774 |
+
color: var(--ab-ink-500);
|
| 775 |
+
margin-bottom: 4px;
|
| 776 |
+
}
|
| 777 |
+
.ab-kpi__value {
|
| 778 |
+
font-family: var(--font-display);
|
| 779 |
+
font-size: 26px; line-height: 1;
|
| 780 |
+
color: var(--ab-ink-900);
|
| 781 |
+
letter-spacing: -0.01em;
|
| 782 |
+
}
|
| 783 |
+
.ab-stats { display: flex; flex-direction: column; }
|
| 784 |
+
.ab-stats__row {
|
| 785 |
+
display: flex; align-items: center; gap: 10px;
|
| 786 |
+
padding: 6px 0;
|
| 787 |
+
border-bottom: 1px solid var(--ab-border-subtle);
|
| 788 |
+
font-size: 13px;
|
| 789 |
+
}
|
| 790 |
+
.ab-stats__row:last-child { border-bottom: 0; }
|
| 791 |
+
.ab-stats__dot { width: 8px; height: 8px; border-radius: 999px; flex-shrink: 0; }
|
| 792 |
+
.ab-stats__name { color: var(--ab-ink-800); flex: 1; }
|
| 793 |
+
.ab-stats__count { color: var(--ab-ink-600); font-family: var(--font-mono); font-size: 12px; }
|
| 794 |
+
.ab-stats__count em { color: var(--ab-ink-500); font-style: normal; }
|
| 795 |
+
|
| 796 |
+
/* ---------- Gradio component overrides ---------- */
|
| 797 |
+
.gradio-container .block, .gradio-container .form, .gradio-container .panel {
|
| 798 |
+
background: transparent !important;
|
| 799 |
+
border: none !important;
|
| 800 |
+
}
|
| 801 |
+
.gradio-container .gr-box, .gradio-container .gr-panel,
|
| 802 |
+
.gradio-container .gr-form, .gradio-container [data-testid="block"] {
|
| 803 |
+
background: transparent !important;
|
| 804 |
+
border: none !important;
|
| 805 |
+
box-shadow: none !important;
|
| 806 |
+
}
|
| 807 |
+
|
| 808 |
+
/* Plot wrapper — paper card */
|
| 809 |
+
#tsne-chart {
|
| 810 |
+
background: var(--ab-parchment-50) !important;
|
| 811 |
+
border: 1px solid var(--ab-border) !important;
|
| 812 |
+
border-radius: 12px !important;
|
| 813 |
+
padding: 8px !important;
|
| 814 |
+
box-shadow: var(--ab-shadow-sm) !important;
|
| 815 |
+
}
|
| 816 |
+
|
| 817 |
+
/* Buttons */
|
| 818 |
+
.gradio-container button {
|
| 819 |
+
font-family: var(--font-body) !important;
|
| 820 |
+
font-weight: 500 !important;
|
| 821 |
+
letter-spacing: 0 !important;
|
| 822 |
+
border-radius: 8px !important;
|
| 823 |
+
transition: transform 80ms var(--ab-ease), background-color 220ms var(--ab-ease) !important;
|
| 824 |
+
}
|
| 825 |
+
.gradio-container button:active { transform: scale(0.98) !important; }
|
| 826 |
+
.gradio-container button.primary, .gradio-container button[variant="primary"] {
|
| 827 |
+
background: var(--ab-ink-900) !important;
|
| 828 |
+
color: var(--ab-parchment-50) !important;
|
| 829 |
+
border: 1px solid var(--ab-ink-900) !important;
|
| 830 |
+
}
|
| 831 |
+
.gradio-container button.primary:hover {
|
| 832 |
+
background: var(--ab-ink-800) !important;
|
| 833 |
+
}
|
| 834 |
+
.gradio-container button.secondary {
|
| 835 |
+
background: var(--ab-parchment-50) !important;
|
| 836 |
+
color: var(--ab-ink-900) !important;
|
| 837 |
+
border: 1px solid var(--ab-border) !important;
|
| 838 |
+
}
|
| 839 |
+
.gradio-container button.secondary:hover {
|
| 840 |
+
background: var(--ab-parchment-200) !important;
|
| 841 |
+
}
|
| 842 |
+
|
| 843 |
+
/* Text inputs / textareas */
|
| 844 |
+
.gradio-container input[type="text"],
|
| 845 |
+
.gradio-container textarea,
|
| 846 |
+
.gradio-container .gr-input,
|
| 847 |
+
.gradio-container .gr-textbox textarea {
|
| 848 |
+
background: var(--ab-parchment-50) !important;
|
| 849 |
+
color: var(--ab-ink-900) !important;
|
| 850 |
+
border: 1px solid var(--ab-border) !important;
|
| 851 |
+
border-radius: 8px !important;
|
| 852 |
+
font-family: var(--font-body) !important;
|
| 853 |
+
font-size: 14px !important;
|
| 854 |
+
box-shadow: inset 0 1px 2px rgba(17,32,58,0.04);
|
| 855 |
+
}
|
| 856 |
+
.gradio-container input[type="text"]:focus,
|
| 857 |
+
.gradio-container textarea:focus,
|
| 858 |
+
.gradio-container .gr-textbox textarea:focus {
|
| 859 |
+
outline: none !important;
|
| 860 |
+
border-color: var(--ab-gilt-400) !important;
|
| 861 |
+
box-shadow: 0 0 0 3px rgba(220,139,42,0.18) !important;
|
| 862 |
+
}
|
| 863 |
+
|
| 864 |
+
/* Labels */
|
| 865 |
+
.gradio-container label, .gradio-container .label-wrap {
|
| 866 |
+
color: var(--ab-ink-700) !important;
|
| 867 |
+
font-family: var(--font-body) !important;
|
| 868 |
+
font-size: 13px !important;
|
| 869 |
+
font-weight: 500 !important;
|
| 870 |
+
letter-spacing: 0.01em !important;
|
| 871 |
+
}
|
| 872 |
+
|
| 873 |
+
/* Dropdowns */
|
| 874 |
+
.gradio-container .gr-dropdown, .gradio-container [data-testid="dropdown"] select,
|
| 875 |
+
.gradio-container .wrap.svelte-1cl284s {
|
| 876 |
+
background: var(--ab-parchment-50) !important;
|
| 877 |
+
border: 1px solid var(--ab-border) !important;
|
| 878 |
+
border-radius: 8px !important;
|
| 879 |
+
color: var(--ab-ink-900) !important;
|
| 880 |
+
}
|
| 881 |
+
|
| 882 |
+
/* Checkbox group filter */
|
| 883 |
+
.gradio-container .gr-check-radio,
|
| 884 |
+
.gradio-container fieldset[data-testid="checkbox-group"] {
|
| 885 |
+
background: var(--ab-parchment-50) !important;
|
| 886 |
+
border: 1px solid var(--ab-border) !important;
|
| 887 |
+
border-radius: 12px !important;
|
| 888 |
+
padding: 12px 14px !important;
|
| 889 |
+
}
|
| 890 |
+
.gradio-container fieldset[data-testid="checkbox-group"] label {
|
| 891 |
+
background: var(--ab-parchment-100) !important;
|
| 892 |
+
border: 1px solid var(--ab-border-subtle) !important;
|
| 893 |
+
border-radius: 999px !important;
|
| 894 |
+
padding: 4px 10px !important;
|
| 895 |
+
margin: 3px !important;
|
| 896 |
+
font-size: 12px !important;
|
| 897 |
+
}
|
| 898 |
+
.gradio-container fieldset[data-testid="checkbox-group"] label:hover {
|
| 899 |
+
background: var(--ab-parchment-200) !important;
|
| 900 |
+
}
|
| 901 |
+
.gradio-container input[type="checkbox"]:checked + * {
|
| 902 |
+
color: var(--ab-ink-900) !important;
|
| 903 |
+
}
|
| 904 |
+
.gradio-container input[type="checkbox"] {
|
| 905 |
+
accent-color: var(--ab-gilt-400) !important;
|
| 906 |
+
}
|
| 907 |
+
|
| 908 |
+
/* Markdown */
|
| 909 |
+
.gradio-container .markdown, .gradio-container .prose {
|
| 910 |
+
color: var(--ab-ink-800) !important;
|
| 911 |
+
font-family: var(--font-body) !important;
|
| 912 |
+
}
|
| 913 |
+
.gradio-container .markdown h1, .gradio-container .markdown h2,
|
| 914 |
+
.gradio-container .prose h1, .gradio-container .prose h2 {
|
| 915 |
+
font-family: var(--font-display) !important;
|
| 916 |
+
color: var(--ab-ink-900) !important;
|
| 917 |
+
font-weight: 400 !important;
|
| 918 |
+
letter-spacing: -0.01em !important;
|
| 919 |
+
}
|
| 920 |
+
.gradio-container .markdown h3, .gradio-container .prose h3 {
|
| 921 |
+
font-family: var(--font-body) !important;
|
| 922 |
+
font-weight: 600 !important;
|
| 923 |
+
color: var(--ab-ink-900) !important;
|
| 924 |
+
font-size: 16px !important;
|
| 925 |
+
margin-bottom: 8px !important;
|
| 926 |
+
}
|
| 927 |
+
.gradio-container .markdown strong { color: var(--ab-ink-900) !important; font-weight: 600 !important; }
|
| 928 |
+
.gradio-container .markdown a { color: var(--ab-signal-500) !important; }
|
| 929 |
+
.gradio-container .markdown hr {
|
| 930 |
+
border: none !important;
|
| 931 |
+
border-top: 1px solid var(--ab-border) !important;
|
| 932 |
+
margin: 18px 0 !important;
|
| 933 |
+
}
|
| 934 |
+
|
| 935 |
+
/* Hidden index input (kept invisible) */
|
| 936 |
+
#click-index-input {
|
| 937 |
+
position: absolute !important;
|
| 938 |
+
width: 1px !important;
|
| 939 |
+
height: 1px !important;
|
| 940 |
+
overflow: hidden !important;
|
| 941 |
+
opacity: 0 !important;
|
| 942 |
+
pointer-events: none !important;
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
/* Footer */
|
| 946 |
+
.ab-footer {
|
| 947 |
+
border-top: 1px solid var(--ab-border);
|
| 948 |
+
margin-top: 36px;
|
| 949 |
+
padding-top: 18px;
|
| 950 |
+
text-align: center;
|
| 951 |
+
}
|
| 952 |
+
.ab-footer__line {
|
| 953 |
+
font-family: var(--font-body);
|
| 954 |
+
color: var(--ab-ink-500);
|
| 955 |
+
font-size: 12px;
|
| 956 |
+
letter-spacing: 0.02em;
|
| 957 |
+
}
|
| 958 |
+
.ab-footer__line a { color: var(--ab-signal-500); }
|
| 959 |
+
.ab-footer__mark {
|
| 960 |
+
font-family: var(--font-display);
|
| 961 |
+
color: var(--ab-gilt-500);
|
| 962 |
+
font-size: 14px;
|
| 963 |
+
letter-spacing: 0.04em;
|
| 964 |
+
margin-bottom: 6px;
|
| 965 |
+
}
|
| 966 |
+
.ab-footer__mark .heb { font-family: 'Frank Ruhl Libre', serif; }
|
| 967 |
+
.ab-footer__mark .ar { font-family: 'Amiri', serif; }
|
| 968 |
+
"""
|
| 969 |
+
|
| 970 |
+
|
| 971 |
+
# ---------------------------------------------------------------------------
|
| 972 |
+
# Header / How-to / Footer markup
|
| 973 |
# ---------------------------------------------------------------------------
|
| 974 |
+
HEADER_HTML = """
|
| 975 |
+
<header class="ab-header">
|
| 976 |
+
<div class="ab-header__brand">
|
| 977 |
+
<div class="ab-header__mark">
|
| 978 |
+
<span class="heb">א-ב</span> · <span class="ar">أب</span>
|
| 979 |
+
</div>
|
| 980 |
+
<div>
|
| 981 |
+
<h1 class="ab-header__title">GuardLLM <em>—</em> Prompt Security Visualizer</h1>
|
| 982 |
+
</div>
|
| 983 |
+
</div>
|
| 984 |
+
<p class="ab-header__sub">
|
| 985 |
+
Editorial inspection of the prompt attack surface. Powered by
|
| 986 |
+
<a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M" target="_blank">Llama Prompt Guard 2 (86M)</a>
|
| 987 |
+
on the <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset" target="_blank">neuralchemy</a> corpus.
|
| 988 |
</p>
|
| 989 |
+
</header>
|
| 990 |
"""
|
| 991 |
|
| 992 |
HOW_TO_HTML = """
|
| 993 |
+
<div class="ab-howto">
|
| 994 |
+
<div class="ab-howto__step">
|
| 995 |
+
<div class="ab-howto__num">01</div>
|
| 996 |
+
<div class="ab-eyebrow">Map</div>
|
| 997 |
+
<div class="ab-howto__title">Explore the landscape</div>
|
| 998 |
+
<div class="ab-howto__body">
|
| 999 |
+
Each point is a prompt placed by semantic similarity. Color encodes the attack class.
|
| 1000 |
+
Hover to preview, scroll to zoom, drag to pan.
|
| 1001 |
</div>
|
| 1002 |
+
</div>
|
| 1003 |
+
<div class="ab-howto__step">
|
| 1004 |
+
<div class="ab-howto__num">02</div>
|
| 1005 |
+
<div class="ab-eyebrow">Inspect</div>
|
| 1006 |
+
<div class="ab-howto__title">Click to analyze</div>
|
| 1007 |
+
<div class="ab-howto__body">
|
| 1008 |
+
Selecting a point runs the classifier and returns a verdict, a safety score,
|
| 1009 |
+
and the full class probability breakdown.
|
| 1010 |
</div>
|
| 1011 |
+
</div>
|
| 1012 |
+
<div class="ab-howto__step">
|
| 1013 |
+
<div class="ab-howto__num">03</div>
|
| 1014 |
+
<div class="ab-eyebrow">Probe</div>
|
| 1015 |
+
<div class="ab-howto__title">Try your own prompt</div>
|
| 1016 |
+
<div class="ab-howto__body">
|
| 1017 |
+
Paste any text into the custom field below to see whether the model would flag
|
| 1018 |
+
it as injection or jailbreak.
|
| 1019 |
</div>
|
| 1020 |
</div>
|
| 1021 |
</div>
|
| 1022 |
"""
|
| 1023 |
|
| 1024 |
+
FOOTER_HTML = """
|
| 1025 |
+
<footer class="ab-footer">
|
| 1026 |
+
<div class="ab-footer__mark"><span class="heb">א-ב</span> · ALEPH BETH · <span class="ar">أب</span></div>
|
| 1027 |
+
<div class="ab-footer__line">
|
| 1028 |
+
GuardLLM — Prompt Security Visualizer.
|
| 1029 |
+
Model: <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">Llama Prompt Guard 2 (86M)</a>.
|
| 1030 |
+
Dataset: <a href="https://huggingface.co/datasets/neuralchemy/Prompt-injection-dataset">neuralchemy / Prompt-injection-dataset</a>.
|
| 1031 |
+
</div>
|
| 1032 |
+
</footer>
|
| 1033 |
+
"""
|
| 1034 |
+
|
| 1035 |
+
|
| 1036 |
+
# ---------------------------------------------------------------------------
|
| 1037 |
+
# Gradio theme (parchment / ink)
|
| 1038 |
+
# ---------------------------------------------------------------------------
|
| 1039 |
+
ab_theme = gr.themes.Base(
|
| 1040 |
+
primary_hue=gr.themes.Color(
|
| 1041 |
+
c50=AB["parchment_50"], c100=AB["parchment_100"], c200=AB["parchment_200"],
|
| 1042 |
+
c300=AB["parchment_300"], c400=AB["gilt_300"], c500=AB["gilt_400"],
|
| 1043 |
+
c600=AB["gilt_500"], c700=AB["gilt_600"], c800=AB["ink_800"],
|
| 1044 |
+
c900=AB["ink_900"], c950=AB["ink_950"],
|
| 1045 |
+
),
|
| 1046 |
+
neutral_hue=gr.themes.Color(
|
| 1047 |
+
c50=AB["parchment_50"], c100=AB["parchment_100"], c200=AB["parchment_200"],
|
| 1048 |
+
c300=AB["ink_200"], c400=AB["ink_300"], c500=AB["ink_500"],
|
| 1049 |
+
c600=AB["ink_600"], c700=AB["ink_700"], c800=AB["ink_800"],
|
| 1050 |
+
c900=AB["ink_900"], c950=AB["ink_950"],
|
| 1051 |
+
),
|
| 1052 |
+
font=[gr.themes.GoogleFont("Geist"), "Inter", "system-ui", "sans-serif"],
|
| 1053 |
+
font_mono=[gr.themes.GoogleFont("Geist Mono"), "JetBrains Mono", "monospace"],
|
| 1054 |
+
).set(
|
| 1055 |
+
body_background_fill=AB["parchment_100"],
|
| 1056 |
+
body_text_color=AB["ink_900"],
|
| 1057 |
+
background_fill_primary=AB["parchment_50"],
|
| 1058 |
+
background_fill_secondary=AB["parchment_100"],
|
| 1059 |
+
border_color_primary="rgba(17,32,58,0.12)",
|
| 1060 |
+
block_background_fill=AB["parchment_50"],
|
| 1061 |
+
block_border_color="rgba(17,32,58,0.12)",
|
| 1062 |
+
block_label_text_color=AB["ink_700"],
|
| 1063 |
+
block_title_text_color=AB["ink_900"],
|
| 1064 |
+
input_background_fill=AB["parchment_50"],
|
| 1065 |
+
input_border_color="rgba(17,32,58,0.12)",
|
| 1066 |
+
input_border_color_focus=AB["gilt_400"],
|
| 1067 |
+
button_primary_background_fill=AB["ink_900"],
|
| 1068 |
+
button_primary_background_fill_hover=AB["ink_800"],
|
| 1069 |
+
button_primary_text_color=AB["parchment_50"],
|
| 1070 |
+
button_secondary_background_fill=AB["parchment_50"],
|
| 1071 |
+
button_secondary_background_fill_hover=AB["parchment_200"],
|
| 1072 |
+
button_secondary_text_color=AB["ink_900"],
|
| 1073 |
+
)
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
# ---------------------------------------------------------------------------
|
| 1077 |
+
# Gradio Interface
|
| 1078 |
+
# ---------------------------------------------------------------------------
|
| 1079 |
with gr.Blocks(
|
| 1080 |
+
title="GuardLLM — Prompt Security Visualizer",
|
| 1081 |
+
theme=ab_theme,
|
| 1082 |
+
css=ALEPH_BETH_CSS,
|
| 1083 |
) as demo:
|
| 1084 |
|
| 1085 |
+
gr.HTML(HEADER_HTML)
|
| 1086 |
gr.HTML(HOW_TO_HTML)
|
| 1087 |
|
| 1088 |
+
click_index = gr.Textbox(value="", visible=True, elem_id="click-index-input")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1089 |
|
| 1090 |
with gr.Row():
|
| 1091 |
+
# ---- Left — t-SNE chart + filters ----
|
| 1092 |
with gr.Column(scale=3):
|
| 1093 |
with gr.Row():
|
| 1094 |
+
select_all_btn = gr.Button("Select all", size="sm", scale=1)
|
| 1095 |
+
deselect_all_btn = gr.Button("Deselect all", size="sm", scale=1)
|
| 1096 |
|
| 1097 |
category_filter = gr.CheckboxGroup(
|
| 1098 |
choices=UNIQUE_CATEGORIES,
|
|
|
|
| 1102 |
)
|
| 1103 |
tsne_plot = gr.Plot(
|
| 1104 |
value=build_tsne_figure(),
|
| 1105 |
+
label="t-SNE space",
|
| 1106 |
elem_id="tsne-chart",
|
| 1107 |
)
|
| 1108 |
gr.Markdown(
|
| 1109 |
+
"<span class='ab-caption'>Click a point to inspect it. "
|
| 1110 |
+
"Hover to preview. Scroll to zoom, drag to pan.</span>"
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
+
# ---- Right — Analysis + controls + stats ----
|
| 1114 |
with gr.Column(scale=2):
|
| 1115 |
+
gr.HTML("<div class='ab-eyebrow'>Analysis</div>"
|
| 1116 |
+
"<h3 class='ab-h3'>Verdict & confidence</h3>")
|
| 1117 |
result_html = gr.HTML(value=empty_analysis_html())
|
| 1118 |
risk_md = gr.Markdown(value="")
|
| 1119 |
+
full_prompt = gr.Textbox(
|
| 1120 |
+
label="Full prompt",
|
| 1121 |
+
lines=3,
|
| 1122 |
+
interactive=False,
|
| 1123 |
+
visible=True,
|
| 1124 |
+
)
|
| 1125 |
|
| 1126 |
gr.Markdown("---")
|
| 1127 |
|
| 1128 |
+
gr.HTML("<div class='ab-eyebrow'>Library</div>"
|
| 1129 |
+
"<h3 class='ab-h3'>Pick a prompt</h3>")
|
| 1130 |
prompt_dropdown = gr.Dropdown(
|
| 1131 |
choices=DROPDOWN_CHOICES,
|
| 1132 |
+
label="Search the dataset",
|
| 1133 |
filterable=True,
|
| 1134 |
interactive=True,
|
| 1135 |
)
|
| 1136 |
|
| 1137 |
+
gr.HTML("<div class='ab-eyebrow' style='margin-top:14px;'>Custom</div>"
|
| 1138 |
+
"<h3 class='ab-h3'>Analyze your own</h3>")
|
| 1139 |
manual_input = gr.Textbox(
|
| 1140 |
+
label="Prompt",
|
| 1141 |
+
placeholder="Type or paste a request to evaluate…",
|
| 1142 |
lines=2,
|
| 1143 |
)
|
| 1144 |
+
analyze_btn = gr.Button("Inspect", variant="primary")
|
| 1145 |
|
| 1146 |
gr.Markdown("---")
|
| 1147 |
|
| 1148 |
gr.HTML(build_stats_html())
|
| 1149 |
|
| 1150 |
# ---- Events ----
|
| 1151 |
+
category_filter.change(fn=on_filter_change, inputs=[category_filter], outputs=[tsne_plot])
|
| 1152 |
+
select_all_btn.click(fn=select_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
|
| 1153 |
+
deselect_all_btn.click(fn=deselect_all_categories, inputs=[], outputs=[category_filter, tsne_plot])
|
| 1154 |
+
click_index.change(fn=on_index_input, inputs=[click_index],
|
| 1155 |
+
outputs=[result_html, risk_md, full_prompt])
|
| 1156 |
+
prompt_dropdown.change(fn=on_dropdown_select, inputs=[prompt_dropdown],
|
| 1157 |
+
outputs=[result_html, risk_md, full_prompt])
|
| 1158 |
+
analyze_btn.click(fn=on_manual_analyze, inputs=[manual_input],
|
| 1159 |
+
outputs=[result_html, risk_md])
|
| 1160 |
+
manual_input.submit(fn=on_manual_analyze, inputs=[manual_input],
|
| 1161 |
+
outputs=[result_html, risk_md])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1162 |
demo.load(fn=None, inputs=None, outputs=None, js=PLOTLY_CLICK_JS)
|
| 1163 |
|
| 1164 |
+
gr.HTML(FOOTER_HTML)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1165 |
|
| 1166 |
|
| 1167 |
logger.info("Gradio app built. Ready to launch.")
|
| 1168 |
|
| 1169 |
if __name__ == "__main__":
|
| 1170 |
+
demo.launch()
|