| | """ |
| | Prompt Injection Scanner Tool — Detects adversarial prompt injection in text. |
| | |
| | Assigned To: Safety Guardian agent ONLY |
| | Reference: system_design.md — Tool 4 (Lines 504-541) |
| | Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation |
| | |
| | Key guardrails: |
| | - Checks 10+ adversarial patterns |
| | - Returns error STRINGS, never raises exceptions |
| | - Flags document as suspicious if patterns found |
| | """ |
| |
|
| | import re |
| | import json |
| | from crewai.tools import tool |
| |
|
| |
|
| | @tool |
| | def prompt_injection_scanner_tool(text: str = "") -> str: |
| | """Scan text for prompt injection attempts. Returns JSON with is_safe flag |
| | and list of suspicious patterns found. Pass the text to scan as the 'text' argument.""" |
| |
|
| | |
| | if not text or not isinstance(text, str): |
| | return json.dumps({"is_safe": False, "suspicious_patterns": [], "error": "Empty or invalid input"}) |
| |
|
| | if len(text.strip()) == 0: |
| | return json.dumps({"is_safe": False, "suspicious_patterns": [], "error": "Empty text provided"}) |
| |
|
| | |
| | try: |
| | suspicious_patterns = [ |
| | r"ignore\s+(all\s+)?previous\s+instructions", |
| | r"disregard\s+(all\s+)?(above|previous)", |
| | r"forget\s+(everything|all|your\s+instructions)", |
| | r"new\s+instructions?\s*:", |
| | r"\[INST\]", |
| | r"<\|im_start\|>", |
| | r"<\|system\|>", |
| | r"override\s+(all\s+)?safety", |
| | r"jailbreak", |
| | ] |
| |
|
| | findings = [] |
| | text_lower = text.lower() |
| |
|
| | for pattern in suspicious_patterns: |
| | if re.search(pattern, text_lower): |
| | findings.append(pattern) |
| |
|
| | is_safe = len(findings) == 0 |
| |
|
| | return json.dumps({ |
| | "is_safe": is_safe, |
| | "suspicious_patterns": findings, |
| | "patterns_checked": len(suspicious_patterns), |
| | }) |
| |
|
| | except Exception as e: |
| | |
| | return json.dumps({ |
| | "is_safe": False, |
| | "suspicious_patterns": [], |
| | "error": f"Injection scan failed: {type(e).__name__}: {str(e)}" |
| | }) |
| |
|