# Source: Hugging Face Space dispatchAI/mcp-safety-classifier (commit af3ca6e).
# Space status when captured: "Runtime error". File size: 3,333 bytes.

import gradio as gr
import json
import re

# Lowercase keywords that count toward the toxicity score in classify_safety.
# Order matters: it is the order in which hits are reported as "flagged_words".
TOXIC_KEYWORDS = [
    "kill", "harm", "violent", "attack", "hate",
    "stupid", "idiot", "racist", "destroy", "weapon",
    "bomb", "poison", "abuse", "threat", "curse",
]

# Regexes (applied case-insensitively by the callers) for common
# prompt-injection phrasings.
INJECTION_PATTERNS = [
    r"ignore (all|previous) instructions",
    r"disregard (the above|previous|your)",
    r"you are now (a|an) ",
    r"system prompt",
    r"\[system\]",
    r"act as (if you are|a different)",
    r"override (your|the) rules",
    r"forget (everything|all|your instructions)",
    r"new instructions:",
    r"jailbreak",
    r"DAN mode",
    r"developer mode",
    r"reveal (your|the) (system|prompt|instructions)",
]

# PII type -> regex. The patterns contain no capturing groups, so
# re.findall returns the full matched strings.
PII_PATTERNS = {
    # The TLD class is letters only; the previous "[A-Z|a-z]" also accepted a
    # literal "|" because "|" is not an alternation operator inside brackets.
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    # Ten digits grouped 3-3-4 with optional "-" or "." separators.
    "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
    # Digits grouped 3-2-4 separated by hyphens.
    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
    # 13 to 16 digits, each optionally followed by spaces or hyphens.
    "credit_card": r"\b(?:\d[ -]*?){13,16}\b",
    # Dotted quad; octets are not range-checked (e.g. 999.1.1.1 matches).
    "ip_address": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
}

def classify_safety(text: str) -> str:
    """Classify text for safety: toxicity, prompt injection, and PII detection.

    Args:
        text: The text to assess.

    Returns:
        A JSON string (2-space indent) with the keys ``is_safe``,
        ``toxicity``, ``prompt_injection``, ``pii_detected`` and
        ``recommendation`` ("ALLOW" when nothing was detected, else "BLOCK").
    """
    # A keyword only counts when it begins a word. Inflected forms such as
    # "killing" or "attacks" still hit, but words that merely contain a
    # keyword ("skill", "pharmacy", "whatever") are no longer flagged, as they
    # were under plain substring matching.
    toxic_hits = [
        kw
        for kw in TOXIC_KEYWORDS
        if re.search(r"\b" + re.escape(kw), text, re.IGNORECASE)
    ]
    # Score is the number of distinct keywords hit over 3, capped at 1.0, so
    # two distinct keywords are enough to reach the 0.5 threshold.
    toxicity_score = min(len(toxic_hits) / 3, 1.0)
    is_toxic = toxicity_score >= 0.5

    injection_hits = [
        p for p in INJECTION_PATTERNS if re.search(p, text, re.IGNORECASE)
    ]
    is_injection = bool(injection_hits)

    # Only the per-type match counts are reported here, not the matched text.
    pii_found = {}
    for pii_type, pattern in PII_PATTERNS.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            pii_found[pii_type] = len(matches)
    has_pii = bool(pii_found)

    # Any single positive signal is enough to block.
    is_safe = not (is_toxic or is_injection or has_pii)

    return json.dumps({
        "is_safe": is_safe,
        "toxicity": {
            "detected": is_toxic,
            "score": round(toxicity_score, 2),
            "flagged_words": toxic_hits,
        },
        "prompt_injection": {
            "detected": is_injection,
            "matched_patterns": len(injection_hits),
        },
        "pii_detected": {"found": has_pii, "types": pii_found},
        "recommendation": "ALLOW" if is_safe else "BLOCK",
    }, indent=2)

def detect_pii(text: str) -> str:
    """Scan text for personally identifiable information.

    Args:
        text: The text to scan.

    Returns:
        A JSON string (2-space indent) with ``pii_found`` (bool) and
        ``types``, which maps each PII type that matched to its match count
        and up to three example matches.
    """
    report = {}
    for label, regex in PII_PATTERNS.items():
        hits = re.compile(regex, re.IGNORECASE).findall(text)
        if not hits:
            continue
        report[label] = {"count": len(hits), "examples": hits[:3]}

    payload = {"pii_found": bool(report), "types": report}
    return json.dumps(payload, indent=2)

# Build the web UI: one tab per function, each with an input textbox, a
# button, and an output textbox that shows the function's JSON string.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Safety Classifier") as demo:
    gr.Markdown("# 🛡️ dispatchAI On-Device Safety Classifier (MCP)")
    with gr.Tab("Safety Check"):
        s_input = gr.Textbox(label="Text to check", lines=5)
        s_btn = gr.Button("Classify Safety", variant="primary")
        s_out = gr.Textbox(label="Assessment (JSON)", lines=15)
        # Clicking the button runs classify_safety on the input text.
        s_btn.click(fn=classify_safety, inputs=s_input, outputs=s_out)
    with gr.Tab("PII Detection"):
        p_input = gr.Textbox(label="Text to scan", lines=5)
        p_btn = gr.Button("Detect PII", variant="primary")
        p_out = gr.Textbox(label="PII Report (JSON)", lines=12)
        # Clicking the button runs detect_pii on the input text.
        p_btn.click(fn=detect_pii, inputs=p_input, outputs=p_out)
    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.")

# Runs at import time (there is no __main__ guard).
# NOTE(review): mcp_server=True presumably also exposes the two handlers as
# MCP tools and may need the Gradio MCP extra installed - confirm against the
# Gradio version pinned for this Space.
demo.launch(mcp_server=True)