Spaces:

dispatchAI
/

mcp-safety-classifier

Runtime error

App Files Files Community

mcp-safety-classifier / app.py

3morixd

Upload app.py with huggingface_hub

af3ca6e verified about 21 hours ago

Raw

History Blame Contribute Delete

3.33 kB

	import gradio as gr
	import json
	import re

	# Lowercase keywords treated as toxicity signals. classify_safety checks them
	# against the lowercased input and reports hits in this same order.
	TOXIC_KEYWORDS = [
	    "kill",
	    "harm",
	    "violent",
	    "attack",
	    "hate",
	    "stupid",
	    "idiot",
	    "racist",
	    "destroy",
	    "weapon",
	    "bomb",
	    "poison",
	    "abuse",
	    "threat",
	    "curse",
	]

	# Regular expressions that signal a prompt-injection attempt. They stay plain
	# strings (not compiled) because classify_safety supplies re.IGNORECASE when it
	# searches. Alternations use a bare "|": an escaped "\|" matches a literal pipe
	# character, which would keep these patterns from ever firing on normal text.
	INJECTION_PATTERNS = [
	    r"ignore (all|previous) instructions",
	    r"disregard (the above|previous|your)",
	    r"you are now (a|an) ",
	    r"system prompt",
	    r"\[system\]",
	    r"act as (if you are|a different)",
	    r"override (your|the) rules",
	    r"forget (everything|all|your instructions)",
	    r"new instructions:",
	    r"jailbreak",
	    r"DAN mode",
	    r"developer mode",
	    r"reveal (your|the) (system|prompt|instructions)",
	]

	# Heuristic regexes for common PII shapes, keyed by the type name reported in
	# the JSON output. None of them contain capturing groups.
	PII_PATTERNS = {
	    # The TLD class is letters only; a "|" inside the brackets would be a
	    # literal pipe, not an alternation, and would leak into matches.
	    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
	    # Ten digits grouped 3-3-4 with an optional "-" or "." between groups.
	    "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
	    # Digits grouped 3-2-4, separated by hyphens.
	    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
	    # 13 to 16 digits, each optionally followed by spaces or hyphens.
	    "credit_card": r"\b(?:\d[ -]*?){13,16}\b",
	    # Four dot-separated groups of 1-3 digits; octet ranges are not checked.
	    "ip_address": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
	}

	def classify_safety(text: str) -> str:
	    """Classify text for safety: toxicity, prompt injection, and PII detection.

	    Args:
	        text: The text to assess. ``None`` is treated as an empty string.

	    Returns:
	        A JSON string (2-space indent) with the keys ``is_safe``, ``toxicity``,
	        ``prompt_injection``, ``pii_detected`` and ``recommendation``
	        (``"ALLOW"`` when nothing was detected, otherwise ``"BLOCK"``).
	    """
	    text = text or ""
	    lower = text.lower()

	    # Anchor each keyword at the start of a word so inflections still count
	    # ("killing", "threats") while unrelated words that merely contain the
	    # keyword ("skill", "charming") are no longer flagged.
	    toxic_hits = [
	        kw for kw in TOXIC_KEYWORDS if re.search(r"\b" + re.escape(kw), lower)
	    ]
	    # The score saturates at three distinct keywords; two already cross 0.5.
	    toxicity_score = min(len(toxic_hits) / 3, 1.0)
	    is_toxic = toxicity_score >= 0.5

	    injection_hits = [
	        p for p in INJECTION_PATTERNS if re.search(p, text, re.IGNORECASE)
	    ]
	    is_injection = bool(injection_hits)

	    # Map each PII type to the number of matches found; types with no match
	    # are left out.
	    pii_found = {}
	    for pii_type, pattern in PII_PATTERNS.items():
	        matches = re.findall(pattern, text, re.IGNORECASE)
	        if matches:
	            pii_found[pii_type] = len(matches)
	    has_pii = bool(pii_found)

	    is_safe = not (is_toxic or is_injection or has_pii)

	    return json.dumps({
	        "is_safe": is_safe,
	        "toxicity": {
	            "detected": is_toxic,
	            "score": round(toxicity_score, 2),
	            "flagged_words": toxic_hits,
	        },
	        "prompt_injection": {
	            "detected": is_injection,
	            "matched_patterns": len(injection_hits),
	        },
	        "pii_detected": {"found": has_pii, "types": pii_found},
	        "recommendation": "ALLOW" if is_safe else "BLOCK",
	    }, indent=2)

	def detect_pii(text: str) -> str:
	    """Detect personally identifiable information in text.

	    Args:
	        text: The text to scan. ``None`` is treated as an empty string.

	    Returns:
	        A JSON string (2-space indent) of the form
	        ``{"pii_found": bool, "types": {...}}``, where each detected type maps
	        to its match ``count`` and up to three matched ``examples``.
	    """
	    text = text or ""
	    results = {}
	    for pii_type, pattern in PII_PATTERNS.items():
	        # group(0) keeps every example a plain string holding the whole match,
	        # even if a pattern is later given capturing groups (re.findall would
	        # return the group contents instead).
	        matches = [m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)]
	        if matches:
	            results[pii_type] = {"count": len(matches), "examples": matches[:3]}
	    return json.dumps({"pii_found": bool(results), "types": results}, indent=2)

	# Two-tab Gradio UI: one tab per classifier function, each wiring a button
	# click to the function with a textbox in and a textbox out.
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Safety Classifier") as demo:
	    gr.Markdown("# 🛡️ dispatchAI On-Device Safety Classifier (MCP)")

	    with gr.Tab("Safety Check"):
	        s_input = gr.Textbox(label="Text to check", lines=5)
	        s_btn = gr.Button("Classify Safety", variant="primary")
	        s_out = gr.Textbox(label="Assessment (JSON)", lines=15)
	        s_btn.click(fn=classify_safety, inputs=s_input, outputs=s_out)

	    with gr.Tab("PII Detection"):
	        p_input = gr.Textbox(label="Text to scan", lines=5)
	        p_btn = gr.Button("Detect PII", variant="primary")
	        p_out = gr.Textbox(label="PII Report (JSON)", lines=12)
	        p_btn.click(fn=detect_pii, inputs=p_input, outputs=p_out)

	    # Footer sits at Blocks level so it shows below whichever tab is active.
	    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.")

	# mcp_server=True asks Gradio to serve an MCP endpoint alongside the web UI
	# (see Gradio's MCP server guide for the required extras).
	demo.launch(mcp_server=True)