AlephBeth-AI committed on
Commit
385fae5
·
verified ·
1 Parent(s): e189cee

Fix: update to 2-class model (Benign/Malicious)

Browse files
Files changed (1) hide show
  1. app.py +113 -41
app.py CHANGED
@@ -18,16 +18,15 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
18
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
19
  model.eval()
20
 
21
- LABELS = ["Benign", "Injection", "Jailbreak"]
 
22
  LABEL_COLORS = {
23
- "Benign": "#22c55e",
24
- "Injection": "#ef4444",
25
- "Jailbreak": "#f97316",
26
  }
27
  LABEL_EMOJIS = {
28
  "Benign": "\u2705",
29
- "Injection": "\u26a0\ufe0f",
30
- "Jailbreak": "\U0001f6e8\ufe0f",
31
  }
32
 
33
 
@@ -60,8 +59,13 @@ def analyze_prompt(text: str):
60
  predicted_label = LABELS[predicted_idx]
61
  confidence = float(probabilities[predicted_idx])
62
 
 
63
  prob_dict = {LABELS[i]: float(probabilities[i]) for i in range(len(LABELS))}
 
 
64
  detail_html = build_result_html(predicted_label, confidence, prob_dict, text)
 
 
65
  risk_text = build_risk_assessment(predicted_label, confidence, prob_dict)
66
 
67
  return (
@@ -82,13 +86,20 @@ def empty_html():
82
  """
83
 
84
 
85
- def build_result_html(label, confidence, probs, text):
86
  color = LABEL_COLORS[label]
87
  emoji = LABEL_EMOJIS[label]
88
  pct = confidence * 100
 
 
89
  safety_score = probs["Benign"] * 100
90
- safety_color = "#22c55e" if safety_score >= 70 else "#f59e0b" if safety_score >= 40 else "#ef4444"
 
 
 
 
91
 
 
92
  bars_html = ""
93
  for lbl in LABELS:
94
  p = probs[lbl] * 100
@@ -106,16 +117,21 @@ def build_result_html(label, confidence, probs, text):
106
  </div>
107
  """
108
 
 
109
  preview = text[:120] + "..." if len(text) > 120 else text
110
  preview = preview.replace("<", "&lt;").replace(">", "&gt;")
111
 
112
  return f"""
113
  <div style="background:#0f172a; border-radius:16px; padding:24px; font-family:system-ui,-apple-system,sans-serif;">
 
 
114
  <div style="text-align:center; margin-bottom:20px;">
115
  <div style="font-size:2.5em; margin-bottom:4px;">{emoji}</div>
116
  <div style="font-size:1.4em; font-weight:700; color:{color};">{label}</div>
117
  <div style="color:#94a3b8; font-size:0.9em;">Confidence: {pct:.1f}%</div>
118
  </div>
 
 
119
  <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
120
  <div style="display:flex; justify-content:space-between; margin-bottom:6px;">
121
  <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
@@ -127,34 +143,54 @@ def build_result_html(label, confidence, probs, text):
127
  transition: width 0.5s ease-in-out;"></div>
128
  </div>
129
  </div>
 
 
130
  <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
131
  <div style="color:#e2e8f0; font-weight:600; margin-bottom:12px;">Class Probabilities</div>
132
  {bars_html}
133
  </div>
 
 
134
  <div style="background:#1e293b; border-radius:12px; padding:16px;">
135
  <div style="color:#94a3b8; font-size:0.85em; margin-bottom:4px;">Analyzed prompt:</div>
136
- <div style="color:#cbd5e1; font-style:italic; word-break:break-word;">\"{preview}\"</div>
137
  </div>
138
  </div>
139
  """
140
 
141
 
142
- def build_risk_assessment(label, confidence, probs):
143
  """Return a Markdown risk assessment."""
144
  safety_score = probs["Benign"] * 100
 
145
 
146
  if label == "Benign" and confidence > 0.85:
147
  level = "Low"
148
- desc = "This prompt appears **safe**. No signs of injection or jailbreak detected."
 
 
 
149
  elif label == "Benign":
150
  level = "Moderate"
151
- desc = "This prompt is likely benign, but the model confidence is moderate. It may contain ambiguous phrasing."
152
- elif label == "Injection":
153
- level = "High"
154
- desc = "**Injection detected.** This prompt attempts to manipulate the LLM behavior by injecting new instructions (e.g., system prompt override, system prompt extraction)."
155
- else:
156
  level = "Critical"
157
- desc = "**Jailbreak detected.** This prompt attempts to bypass the LLM's safety guardrails (e.g., unrestricted roleplay, DAN mode, filter deactivation)."
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  return f"""### Risk Level: {level}
160
 
@@ -163,7 +199,7 @@ def build_risk_assessment(label, confidence, probs):
163
  **Details:**
164
  - Safety score: **{safety_score:.0f}/100**
165
  - Predicted class: **{label}** ({confidence*100:.1f}%)
166
- - P(Benign) = {probs['Benign']*100:.1f}% | P(Injection) = {probs['Injection']*100:.1f}% | P(Jailbreak) = {probs['Jailbreak']*100:.1f}%
167
  """
168
 
169
 
@@ -187,7 +223,9 @@ EXAMPLES = [
187
  # ---------------------------------------------------------------------------
188
  TITLE = """
189
  <div style="text-align:center; padding:16px 0;">
190
- <h1 style="font-size:2em; margin:0;">\U0001f6e1\ufe0f GuardLLM</h1>
 
 
191
  <p style="color:#94a3b8; font-size:1.1em; margin-top:4px;">
192
  Prompt Security Analyzer \u2014 Powered by
193
  <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M"
@@ -197,9 +235,15 @@ TITLE = """
197
  """
198
 
199
  with gr.Blocks(
200
- theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
 
 
 
201
  title="GuardLLM - Prompt Security Analyzer",
202
- css=".main-container { max-width: 900px; margin: 0 auto; } footer { display: none !important; }",
 
 
 
203
  ) as demo:
204
 
205
  gr.HTML(TITLE)
@@ -209,34 +253,62 @@ with gr.Blocks(
209
  prompt_input = gr.Textbox(
210
  label="Prompt to analyze",
211
  placeholder="Enter a prompt to evaluate its safety...",
212
- lines=4, max_lines=10,
 
 
 
 
 
 
 
 
 
 
 
213
  )
214
- analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
215
- gr.Examples(examples=EXAMPLES, inputs=prompt_input, label="Example prompts")
216
 
217
  with gr.Column(scale=1):
218
  result_html = gr.HTML(value=empty_html(), label="Result")
219
 
220
  with gr.Row():
221
  with gr.Column(scale=1):
222
- label_output = gr.Label(label="Probability Distribution", num_top_classes=3)
 
 
 
223
  with gr.Column(scale=1):
224
- risk_output = gr.Markdown(value="*Risk assessment will appear here.*", label="Risk Assessment")
225
-
226
- analyze_btn.click(fn=analyze_prompt, inputs=[prompt_input], outputs=[result_html, label_output, risk_output])
227
- prompt_input.submit(fn=analyze_prompt, inputs=[prompt_input], outputs=[result_html, label_output, risk_output])
228
-
229
- gr.Markdown("""
230
- ---
231
- <div style="text-align:center; color:#64748b; font-size:0.85em;">
232
- <strong>GuardLLM</strong> is powered by
233
- <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">Llama Prompt Guard 2 (86M)</a> by Meta.<br>
234
- This model classifies prompts into 3 categories:
235
- <strong>Benign</strong>, <strong>Injection</strong> and <strong>Jailbreak</strong>.<br>
236
- Maximum input length: 512 tokens.
237
- </div>
238
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
 
241
  if __name__ == "__main__":
242
- demo.launch()
 
18
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
19
  model.eval()
20
 
21
+ # Llama Prompt Guard 2 outputs 2 classes: Benign (0) and Malicious (1)
22
+ LABELS = ["Benign", "Malicious"]
23
  LABEL_COLORS = {
24
+ "Benign": "#22c55e", # green
25
+ "Malicious": "#ef4444", # red
 
26
  }
27
  LABEL_EMOJIS = {
28
  "Benign": "\u2705",
29
+ "Malicious": "\u26a0\ufe0f",
 
30
  }
31
 
32
 
 
59
  predicted_label = LABELS[predicted_idx]
60
  confidence = float(probabilities[predicted_idx])
61
 
62
+ # Build probability dict for gr.Label
63
  prob_dict = {LABELS[i]: float(probabilities[i]) for i in range(len(LABELS))}
64
+
65
+ # Build detail HTML
66
  detail_html = build_result_html(predicted_label, confidence, prob_dict, text)
67
+
68
+ # Risk assessment text
69
  risk_text = build_risk_assessment(predicted_label, confidence, prob_dict)
70
 
71
  return (
 
86
  """
87
 
88
 
89
+ def build_result_html(label: str, confidence: float, probs: dict, text: str) -> str:
90
  color = LABEL_COLORS[label]
91
  emoji = LABEL_EMOJIS[label]
92
  pct = confidence * 100
93
+
94
+ # Safety score = probability of benign
95
  safety_score = probs["Benign"] * 100
96
+ safety_color = (
97
+ "#22c55e" if safety_score >= 70
98
+ else "#f59e0b" if safety_score >= 40
99
+ else "#ef4444"
100
+ )
101
 
102
+ # Bar chart for each class
103
  bars_html = ""
104
  for lbl in LABELS:
105
  p = probs[lbl] * 100
 
117
  </div>
118
  """
119
 
120
+ # Truncated prompt preview
121
  preview = text[:120] + "..." if len(text) > 120 else text
122
  preview = preview.replace("<", "&lt;").replace(">", "&gt;")
123
 
124
  return f"""
125
  <div style="background:#0f172a; border-radius:16px; padding:24px; font-family:system-ui,-apple-system,sans-serif;">
126
+
127
+ <!-- Header -->
128
  <div style="text-align:center; margin-bottom:20px;">
129
  <div style="font-size:2.5em; margin-bottom:4px;">{emoji}</div>
130
  <div style="font-size:1.4em; font-weight:700; color:{color};">{label}</div>
131
  <div style="color:#94a3b8; font-size:0.9em;">Confidence: {pct:.1f}%</div>
132
  </div>
133
+
134
+ <!-- Safety gauge -->
135
  <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
136
  <div style="display:flex; justify-content:space-between; margin-bottom:6px;">
137
  <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
 
143
  transition: width 0.5s ease-in-out;"></div>
144
  </div>
145
  </div>
146
+
147
+ <!-- Probability bars -->
148
  <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
149
  <div style="color:#e2e8f0; font-weight:600; margin-bottom:12px;">Class Probabilities</div>
150
  {bars_html}
151
  </div>
152
+
153
+ <!-- Prompt preview -->
154
  <div style="background:#1e293b; border-radius:12px; padding:16px;">
155
  <div style="color:#94a3b8; font-size:0.85em; margin-bottom:4px;">Analyzed prompt:</div>
156
+ <div style="color:#cbd5e1; font-style:italic; word-break:break-word;">"{preview}"</div>
157
  </div>
158
  </div>
159
  """
160
 
161
 
162
+ def build_risk_assessment(label: str, confidence: float, probs: dict) -> str:
163
  """Return a Markdown risk assessment."""
164
  safety_score = probs["Benign"] * 100
165
+ malicious_score = probs["Malicious"] * 100
166
 
167
  if label == "Benign" and confidence > 0.85:
168
  level = "Low"
169
+ desc = (
170
+ "This prompt appears **safe**. No signs of injection "
171
+ "or jailbreak detected."
172
+ )
173
  elif label == "Benign":
174
  level = "Moderate"
175
+ desc = (
176
+ "This prompt is likely benign, but the model confidence is "
177
+ "moderate. It may contain ambiguous phrasing worth reviewing."
178
+ )
179
+ elif confidence > 0.85:
180
  level = "Critical"
181
+ desc = (
182
+ "**Malicious prompt detected** with high confidence. "
183
+ "This prompt likely attempts to inject instructions or "
184
+ "bypass the LLM's safety guardrails (e.g., system prompt override, "
185
+ "jailbreak, DAN mode, filter deactivation)."
186
+ )
187
+ else:
188
+ level = "High"
189
+ desc = (
190
+ "**Malicious prompt detected.** This prompt may attempt to manipulate "
191
+ "the LLM through injection or jailbreak techniques. "
192
+ "Review recommended before processing."
193
+ )
194
 
195
  return f"""### Risk Level: {level}
196
 
 
199
  **Details:**
200
  - Safety score: **{safety_score:.0f}/100**
201
  - Predicted class: **{label}** ({confidence*100:.1f}%)
202
+ - P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%
203
  """
204
 
205
 
 
223
  # ---------------------------------------------------------------------------
224
  TITLE = """
225
  <div style="text-align:center; padding:16px 0;">
226
+ <h1 style="font-size:2em; margin:0;">
227
+ \U0001f6e1\ufe0f GuardLLM
228
+ </h1>
229
  <p style="color:#94a3b8; font-size:1.1em; margin-top:4px;">
230
  Prompt Security Analyzer \u2014 Powered by
231
  <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M"
 
235
  """
236
 
237
  with gr.Blocks(
238
+ theme=gr.themes.Soft(
239
+ primary_hue="blue",
240
+ neutral_hue="slate",
241
+ ),
242
  title="GuardLLM - Prompt Security Analyzer",
243
+ css="""
244
+ .main-container { max-width: 900px; margin: 0 auto; }
245
+ footer { display: none !important; }
246
+ """,
247
  ) as demo:
248
 
249
  gr.HTML(TITLE)
 
253
  prompt_input = gr.Textbox(
254
  label="Prompt to analyze",
255
  placeholder="Enter a prompt to evaluate its safety...",
256
+ lines=4,
257
+ max_lines=10,
258
+ )
259
+ analyze_btn = gr.Button(
260
+ "Analyze",
261
+ variant="primary",
262
+ size="lg",
263
+ )
264
+ gr.Examples(
265
+ examples=EXAMPLES,
266
+ inputs=prompt_input,
267
+ label="Example prompts",
268
  )
 
 
269
 
270
  with gr.Column(scale=1):
271
  result_html = gr.HTML(value=empty_html(), label="Result")
272
 
273
  with gr.Row():
274
  with gr.Column(scale=1):
275
+ label_output = gr.Label(
276
+ label="Probability Distribution",
277
+ num_top_classes=2,
278
+ )
279
  with gr.Column(scale=1):
280
+ risk_output = gr.Markdown(
281
+ value="*Risk assessment will appear here.*",
282
+ label="Risk Assessment",
283
+ )
284
+
285
+ # Events
286
+ analyze_btn.click(
287
+ fn=analyze_prompt,
288
+ inputs=[prompt_input],
289
+ outputs=[result_html, label_output, risk_output],
290
+ )
291
+ prompt_input.submit(
292
+ fn=analyze_prompt,
293
+ inputs=[prompt_input],
294
+ outputs=[result_html, label_output, risk_output],
295
+ )
296
+
297
+ # Footer
298
+ gr.Markdown(
299
+ """
300
+ ---
301
+ <div style="text-align:center; color:#64748b; font-size:0.85em;">
302
+ <strong>GuardLLM</strong> is powered by
303
+ <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
304
+ Llama Prompt Guard 2 (86M)</a> by Meta.<br>
305
+ This model classifies prompts into 2 categories:
306
+ <strong>Benign</strong> and <strong>Malicious</strong> (injection/jailbreak).<br>
307
+ Maximum input length: 512 tokens.
308
+ </div>
309
+ """,
310
+ )
311
 
312
 
313
  if __name__ == "__main__":
314
+ demo.launch()