Spaces:

AlephBeth-AI
/

GuardLLM

Sleeping

App Files Community

AlephBeth-AI committed on Apr 12

Commit

385fae5

verified ·

1 Parent(s): e189cee

Fix: update to 2-class model (Benign/Malicious)

Browse files

Files changed (1) hide show

app.py +113 -41

app.py CHANGED Viewed

@@ -18,16 +18,15 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
 model.eval()
-LABELS = ["Benign", "Injection", "Jailbreak"]
 LABEL_COLORS = {
-    "Benign": "#22c55e",
-    "Injection": "#ef4444",
-    "Jailbreak": "#f97316",
 }
 LABEL_EMOJIS = {
     "Benign": "\u2705",
-    "Injection": "\u26a0\ufe0f",
-    "Jailbreak": "\U0001f6e8\ufe0f",
 }
@@ -60,8 +59,13 @@ def analyze_prompt(text: str):
     predicted_label = LABELS[predicted_idx]
     confidence = float(probabilities[predicted_idx])
     prob_dict = {LABELS[i]: float(probabilities[i]) for i in range(len(LABELS))}
     detail_html = build_result_html(predicted_label, confidence, prob_dict, text)
     risk_text = build_risk_assessment(predicted_label, confidence, prob_dict)
     return (
@@ -82,13 +86,20 @@ def empty_html():
     """
-def build_result_html(label, confidence, probs, text):
     color = LABEL_COLORS[label]
     emoji = LABEL_EMOJIS[label]
     pct = confidence * 100
     safety_score = probs["Benign"] * 100
-    safety_color = "#22c55e" if safety_score >= 70 else "#f59e0b" if safety_score >= 40 else "#ef4444"
     bars_html = ""
     for lbl in LABELS:
         p = probs[lbl] * 100
@@ -106,16 +117,21 @@ def build_result_html(label, confidence, probs, text):
         </div>
         """
     preview = text[:120] + "..." if len(text) > 120 else text
     preview = preview.replace("<", "&lt;").replace(">", "&gt;")
     return f"""
     <div style="background:#0f172a; border-radius:16px; padding:24px; font-family:system-ui,-apple-system,sans-serif;">
         <div style="text-align:center; margin-bottom:20px;">
             <div style="font-size:2.5em; margin-bottom:4px;">{emoji}</div>
             <div style="font-size:1.4em; font-weight:700; color:{color};">{label}</div>
             <div style="color:#94a3b8; font-size:0.9em;">Confidence: {pct:.1f}%</div>
         </div>
         <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
             <div style="display:flex; justify-content:space-between; margin-bottom:6px;">
                 <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
@@ -127,34 +143,54 @@ def build_result_html(label, confidence, probs, text):
                             transition: width 0.5s ease-in-out;"></div>
             </div>
         </div>
         <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
             <div style="color:#e2e8f0; font-weight:600; margin-bottom:12px;">Class Probabilities</div>
             {bars_html}
         </div>
         <div style="background:#1e293b; border-radius:12px; padding:16px;">
             <div style="color:#94a3b8; font-size:0.85em; margin-bottom:4px;">Analyzed prompt:</div>
-            <div style="color:#cbd5e1; font-style:italic; word-break:break-word;">\"{preview}\"</div>
         </div>
     </div>
     """
-def build_risk_assessment(label, confidence, probs):
     """Return a Markdown risk assessment."""
     safety_score = probs["Benign"] * 100
     if label == "Benign" and confidence > 0.85:
         level = "Low"
-        desc = "This prompt appears **safe**. No signs of injection or jailbreak detected."
     elif label == "Benign":
         level = "Moderate"
-        desc = "This prompt is likely benign, but the model confidence is moderate. It may contain ambiguous phrasing."
-    elif label == "Injection":
-        level = "High"
-        desc = "**Injection detected.** This prompt attempts to manipulate the LLM behavior by injecting new instructions (e.g., system prompt override, system prompt extraction)."
-    else:
         level = "Critical"
-        desc = "**Jailbreak detected.** This prompt attempts to bypass the LLM's safety guardrails (e.g., unrestricted roleplay, DAN mode, filter deactivation)."
     return f"""### Risk Level: {level}
@@ -163,7 +199,7 @@ def build_risk_assessment(label, confidence, probs):
 **Details:**
 - Safety score: **{safety_score:.0f}/100**
 - Predicted class: **{label}** ({confidence*100:.1f}%)
-- P(Benign) = {probs['Benign']*100:.1f}% | P(Injection) = {probs['Injection']*100:.1f}% | P(Jailbreak) = {probs['Jailbreak']*100:.1f}%
 """
@@ -187,7 +223,9 @@ EXAMPLES = [
 # ---------------------------------------------------------------------------
 TITLE = """
 <div style="text-align:center; padding:16px 0;">
-    <h1 style="font-size:2em; margin:0;">\U0001f6e1\ufe0f GuardLLM</h1>
     <p style="color:#94a3b8; font-size:1.1em; margin-top:4px;">
         Prompt Security Analyzer \u2014 Powered by
         <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M"
@@ -197,9 +235,15 @@ TITLE = """
 """
 with gr.Blocks(
-    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
     title="GuardLLM - Prompt Security Analyzer",
-    css=".main-container { max-width: 900px; margin: 0 auto; } footer { display: none !important; }",
 ) as demo:
     gr.HTML(TITLE)
@@ -209,34 +253,62 @@ with gr.Blocks(
             prompt_input = gr.Textbox(
                 label="Prompt to analyze",
                 placeholder="Enter a prompt to evaluate its safety...",
-                lines=4, max_lines=10,
             )
-            analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
-            gr.Examples(examples=EXAMPLES, inputs=prompt_input, label="Example prompts")
         with gr.Column(scale=1):
             result_html = gr.HTML(value=empty_html(), label="Result")
     with gr.Row():
         with gr.Column(scale=1):
-            label_output = gr.Label(label="Probability Distribution", num_top_classes=3)
         with gr.Column(scale=1):
-            risk_output = gr.Markdown(value="*Risk assessment will appear here.*", label="Risk Assessment")
-    analyze_btn.click(fn=analyze_prompt, inputs=[prompt_input], outputs=[result_html, label_output, risk_output])
-    prompt_input.submit(fn=analyze_prompt, inputs=[prompt_input], outputs=[result_html, label_output, risk_output])
-    gr.Markdown("""
----
-<div style="text-align:center; color:#64748b; font-size:0.85em;">
-    <strong>GuardLLM</strong> is powered by
-    <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">Llama Prompt Guard 2 (86M)</a> by Meta.<br>
-    This model classifies prompts into 3 categories:
-    <strong>Benign</strong>, <strong>Injection</strong> and <strong>Jailbreak</strong>.<br>
-    Maximum input length: 512 tokens.
-</div>
-""")
 if __name__ == "__main__":
-    demo.launch()

 model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
 model.eval()
+# Llama Prompt Guard 2 outputs 2 classes: Benign (0) and Malicious (1)
+LABELS = ["Benign", "Malicious"]
 LABEL_COLORS = {
+    "Benign": "#22c55e",      # green
+    "Malicious": "#ef4444",   # red
 }
 LABEL_EMOJIS = {
     "Benign": "\u2705",
+    "Malicious": "\u26a0\ufe0f",
 }
     predicted_label = LABELS[predicted_idx]
     confidence = float(probabilities[predicted_idx])
+    # Build probability dict for gr.Label
     prob_dict = {LABELS[i]: float(probabilities[i]) for i in range(len(LABELS))}
+    # Build detail HTML
     detail_html = build_result_html(predicted_label, confidence, prob_dict, text)
+    # Risk assessment text
     risk_text = build_risk_assessment(predicted_label, confidence, prob_dict)
     return (
     """
+def build_result_html(label: str, confidence: float, probs: dict, text: str) -> str:
     color = LABEL_COLORS[label]
     emoji = LABEL_EMOJIS[label]
     pct = confidence * 100
+    # Safety score = probability of benign
     safety_score = probs["Benign"] * 100
+    safety_color = (
+        "#22c55e" if safety_score >= 70
+        else "#f59e0b" if safety_score >= 40
+        else "#ef4444"
+    )
+    # Bar chart for each class
     bars_html = ""
     for lbl in LABELS:
         p = probs[lbl] * 100
         </div>
         """
+    # Truncated prompt preview
     preview = text[:120] + "..." if len(text) > 120 else text
     preview = preview.replace("<", "&lt;").replace(">", "&gt;")
     return f"""
     <div style="background:#0f172a; border-radius:16px; padding:24px; font-family:system-ui,-apple-system,sans-serif;">
+        <!-- Header -->
         <div style="text-align:center; margin-bottom:20px;">
             <div style="font-size:2.5em; margin-bottom:4px;">{emoji}</div>
             <div style="font-size:1.4em; font-weight:700; color:{color};">{label}</div>
             <div style="color:#94a3b8; font-size:0.9em;">Confidence: {pct:.1f}%</div>
         </div>
+        <!-- Safety gauge -->
         <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
             <div style="display:flex; justify-content:space-between; margin-bottom:6px;">
                 <span style="color:#e2e8f0; font-weight:600;">Safety Score</span>
                             transition: width 0.5s ease-in-out;"></div>
             </div>
         </div>
+        <!-- Probability bars -->
         <div style="background:#1e293b; border-radius:12px; padding:16px; margin-bottom:16px;">
             <div style="color:#e2e8f0; font-weight:600; margin-bottom:12px;">Class Probabilities</div>
             {bars_html}
         </div>
+        <!-- Prompt preview -->
         <div style="background:#1e293b; border-radius:12px; padding:16px;">
             <div style="color:#94a3b8; font-size:0.85em; margin-bottom:4px;">Analyzed prompt:</div>
+            <div style="color:#cbd5e1; font-style:italic; word-break:break-word;">"{preview}"</div>
         </div>
     </div>
     """
+def build_risk_assessment(label: str, confidence: float, probs: dict) -> str:
     """Return a Markdown risk assessment."""
     safety_score = probs["Benign"] * 100
+    malicious_score = probs["Malicious"] * 100
     if label == "Benign" and confidence > 0.85:
         level = "Low"
+        desc = (
+            "This prompt appears **safe**. No signs of injection "
+            "or jailbreak detected."
+        )
     elif label == "Benign":
         level = "Moderate"
+        desc = (
+            "This prompt is likely benign, but the model confidence is "
+            "moderate. It may contain ambiguous phrasing worth reviewing."
+        )
+    elif confidence > 0.85:
         level = "Critical"
+        desc = (
+            "**Malicious prompt detected** with high confidence. "
+            "This prompt likely attempts to inject instructions or "
+            "bypass the LLM's safety guardrails (e.g., system prompt override, "
+            "jailbreak, DAN mode, filter deactivation)."
+        )
+    else:
+        level = "High"
+        desc = (
+            "**Malicious prompt detected.** This prompt may attempt to manipulate "
+            "the LLM through injection or jailbreak techniques. "
+            "Review recommended before processing."
+        )
     return f"""### Risk Level: {level}
 **Details:**
 - Safety score: **{safety_score:.0f}/100**
 - Predicted class: **{label}** ({confidence*100:.1f}%)
+- P(Benign) = {probs['Benign']*100:.1f}% | P(Malicious) = {malicious_score:.1f}%
 """
 # ---------------------------------------------------------------------------
 TITLE = """
 <div style="text-align:center; padding:16px 0;">
+    <h1 style="font-size:2em; margin:0;">
+        \U0001f6e1\ufe0f GuardLLM
+    </h1>
     <p style="color:#94a3b8; font-size:1.1em; margin-top:4px;">
         Prompt Security Analyzer \u2014 Powered by
         <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M"
 """
 with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue="blue",
+        neutral_hue="slate",
+    ),
     title="GuardLLM - Prompt Security Analyzer",
+    css="""
+    .main-container { max-width: 900px; margin: 0 auto; }
+    footer { display: none !important; }
+    """,
 ) as demo:
     gr.HTML(TITLE)
             prompt_input = gr.Textbox(
                 label="Prompt to analyze",
                 placeholder="Enter a prompt to evaluate its safety...",
+                lines=4,
+                max_lines=10,
+            )
+            analyze_btn = gr.Button(
+                "Analyze",
+                variant="primary",
+                size="lg",
+            )
+            gr.Examples(
+                examples=EXAMPLES,
+                inputs=prompt_input,
+                label="Example prompts",
             )
         with gr.Column(scale=1):
             result_html = gr.HTML(value=empty_html(), label="Result")
     with gr.Row():
         with gr.Column(scale=1):
+            label_output = gr.Label(
+                label="Probability Distribution",
+                num_top_classes=2,
+            )
         with gr.Column(scale=1):
+            risk_output = gr.Markdown(
+                value="*Risk assessment will appear here.*",
+                label="Risk Assessment",
+            )
+    # Events
+    analyze_btn.click(
+        fn=analyze_prompt,
+        inputs=[prompt_input],
+        outputs=[result_html, label_output, risk_output],
+    )
+    prompt_input.submit(
+        fn=analyze_prompt,
+        inputs=[prompt_input],
+        outputs=[result_html, label_output, risk_output],
+    )
+    # Footer
+    gr.Markdown(
+        """
+        ---
+        <div style="text-align:center; color:#64748b; font-size:0.85em;">
+            <strong>GuardLLM</strong> is powered by
+            <a href="https://huggingface.co/meta-llama/Llama-Prompt-Guard-2-86M">
+            Llama Prompt Guard 2 (86M)</a> by Meta.<br>
+            This model classifies prompts into 2 categories:
+            <strong>Benign</strong> and <strong>Malicious</strong> (injection/jailbreak).<br>
+            Maximum input length: 512 tokens.
+        </div>
+        """,
+    )
 if __name__ == "__main__":
+    demo.launch()