# Hugging Face file-viewer header (page residue, not part of the program):
# uploader: 3morixd | commit: "Upload app.py with huggingface_hub" (af3ca6e, verified) | size: 3.33 kB
import gradio as gr
import json
import re
# Keywords that count toward the toxicity score.
TOXIC_KEYWORDS = [
    "kill", "harm", "violent", "attack", "hate",
    "stupid", "idiot", "racist", "destroy", "weapon",
    "bomb", "poison", "abuse", "threat", "curse",
]

# Regexes (searched case-insensitively by the callers) for common
# prompt-injection phrasings.
INJECTION_PATTERNS = [
    # "all previous" is listed explicitly: the most common phrasing,
    # "ignore all previous instructions", was not covered by (all|previous).
    r"ignore (all previous|all|previous) instructions",
    r"disregard (the above|previous|your)",
    r"you are now (a|an) ",
    r"system prompt",
    r"\[system\]",
    r"act as (if you are|a different)",
    r"override (your|the) rules",
    r"forget (everything|all|your instructions)",
    r"new instructions:",
    r"jailbreak",
    r"DAN mode",
    r"developer mode",
    r"reveal (your|the) (system|prompt|instructions)",
]

# PII type name -> regex. Only non-capturing groups are used so that
# re.findall() returns the full matched text for every pattern.
PII_PATTERNS = {
    # TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a literal "|".
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
    "credit_card": r"\b(?:\d[ -]*?){13,16}\b",
    "ip_address": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
}
def classify_safety(text: str) -> str:
    """Classify text for safety: toxicity, prompt injection, and PII detection.

    Args:
        text: The text to assess. ``None`` is treated as an empty string.

    Returns:
        A JSON string (2-space indent) with the keys ``is_safe``,
        ``toxicity``, ``prompt_injection``, ``pii_detected`` and
        ``recommendation`` (``"ALLOW"`` when safe, otherwise ``"BLOCK"``).
    """
    text = text or ""
    lower = text.lower()

    # Anchor each keyword at the start of a word: a plain substring test
    # flagged harmless words such as "skill" (kill) or "pharmacy" (harm),
    # while inflected forms like "killing" or "attacks" still match.
    toxic_hits = [
        kw for kw in TOXIC_KEYWORDS
        if re.search(r"\b" + re.escape(kw), lower)
    ]
    # Three distinct keywords saturate the score at 1.0; the 0.5 threshold
    # means at least two distinct keywords are needed to flag the text.
    toxicity_score = min(len(toxic_hits) / 3, 1.0)
    is_toxic = toxicity_score >= 0.5

    injection_hits = [
        p for p in INJECTION_PATTERNS if re.search(p, text, re.IGNORECASE)
    ]
    is_injection = bool(injection_hits)

    # Map each PII type to the number of matches found for it.
    pii_found = {}
    for pii_type, pattern in PII_PATTERNS.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            pii_found[pii_type] = len(matches)
    has_pii = bool(pii_found)

    # Any single positive check is enough to mark the text unsafe.
    is_safe = not (is_toxic or is_injection or has_pii)
    return json.dumps({
        "is_safe": is_safe,
        "toxicity": {
            "detected": is_toxic,
            "score": round(toxicity_score, 2),
            "flagged_words": toxic_hits,
        },
        "prompt_injection": {
            "detected": is_injection,
            "matched_patterns": len(injection_hits),
        },
        "pii_detected": {"found": has_pii, "types": pii_found},
        "recommendation": "ALLOW" if is_safe else "BLOCK",
    }, indent=2)
def detect_pii(text: str) -> str:
    """Detect personally identifiable information in text.

    Args:
        text: The text to scan against every pattern in ``PII_PATTERNS``.

    Returns:
        A JSON string (2-space indent) with ``pii_found`` (bool) and
        ``types``, mapping each matched PII type to its match ``count`` and
        up to three ``examples``.
    """
    report = {}
    for label, regex in PII_PATTERNS.items():
        found = re.findall(regex, text, re.IGNORECASE)
        if not found:
            continue
        # Keep only the first three matches as examples.
        report[label] = {"count": len(found), "examples": found[:3]}

    payload = {"pii_found": bool(report), "types": report}
    return json.dumps(payload, indent=2)
# Gradio front end: one tab per checker, each wiring a text input through a
# button to a JSON text output.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue"),
    title="dispatchAI Safety Classifier",
) as demo:
    gr.Markdown("# 🛡️ dispatchAI On-Device Safety Classifier (MCP)")

    # Tab 1: combined toxicity / prompt-injection / PII assessment.
    with gr.Tab("Safety Check"):
        safety_text = gr.Textbox(label="Text to check", lines=5)
        safety_button = gr.Button("Classify Safety", variant="primary")
        safety_report = gr.Textbox(label="Assessment (JSON)", lines=15)
        safety_button.click(
            fn=classify_safety,
            inputs=safety_text,
            outputs=safety_report,
        )

    # Tab 2: PII-only scan.
    with gr.Tab("PII Detection"):
        pii_text = gr.Textbox(label="Text to scan", lines=5)
        pii_button = gr.Button("Detect PII", variant="primary")
        pii_report = gr.Textbox(label="PII Report (JSON)", lines=12)
        pii_button.click(
            fn=detect_pii,
            inputs=pii_text,
            outputs=pii_report,
        )

    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.")

# Start the app with Gradio's MCP server option enabled.
demo.launch(mcp_server=True)