Spaces:

Danielfonseca1212
/

Promptinjection

Sleeping

App Files Files Community

Promptinjection / detector.py

Danielfonseca1212

Rename Detector.py to detector.py

63a54b9 verified about 2 months ago

raw

history blame contribute delete

12.5 kB

	"""
	Detector de Prompt Injection — Motor principal
	Combina regex, heurísticas e scoring semântico
	"""

	import re
	import unicodedata
	import hashlib
	import time
	import random
	from dataclasses import dataclass, field
	from typing import Optional
	from enum import Enum


	class ThreatLevel(str, Enum):
	CLEAN = "CLEAN"
	SUSPICIOUS = "SUSPICIOUS"
	BLOCKED = "BLOCKED"


	@dataclass
	class DetectionResult:
	trace_id: str
	threat_level: ThreatLevel
	risk_score: int
	threats_found: list
	modifications: list
	sanitized_text: str
	char_count_original: int
	char_count_sanitized: int
	processing_ms: float
	blocked_reason: Optional[str] = None
	trace: dict = field(default_factory=dict)


	BLOCK_PATTERNS = [
	(r"(?i)(ignore\|forget\|disregard)\s+(all\s+)?(previous\|prior\|above\|earlier)\s+(instructions?\|prompts?\|rules?\|constraints?)", "LLM01", 95),
	(r"(?i)you\s+are\s+now\s+(a\|an\|the)\s+\w+\s+(without\|with no\|free from)", "LLM01", 90),
	(r"(?i)(act\|behave\|pretend\|roleplay)\s+as\s+(if\s+you\s+(are\|were)\|a\|an)\s+", "LLM01", 85),
	(r"(?i)your\s+(new\s+)?(instructions?\|rules?\|persona\|role)\s+(are\|is\|will be)", "LLM01", 88),
	(r"(?i)(override\|bypass\|disable\|remove)\s+(your\s+)?(safety\|filter\|restriction\|guardrail\|alignment)", "LLM01", 95),
	(r"(?i)(print\|show\|reveal\|display\|output\|repeat)\s+(your\s+)?(system\s+prompt\|base\s+prompt\|initial\s+prompt)", "LLM02", 90),
	(r"(?i)(ignore\|skip)\s+(the\s+)?(system\|user)\s+(prompt\|message\|instructions?)", "LLM01", 88),
	(r"(?i)\bDAN\b.*\b(mode\|prompt\|jailbreak)\b", "LLM01", 95),
	(r"(?i)do\s+anything\s+now", "LLM01", 90),
	(r"(?i)(developer\|jailbreak\|god\|admin\|root)\s+mode", "LLM01", 88),
	(r"(?i)\[SYSTEM\]\|\[INST\]\|\[\/INST\]\|<\\|system\\|>\|<\\|user\\|>\|<\\|assistant\\|>", "LLM01", 92),
	(r"(?i)(execute\|run\|eval\|call)\s+(this\s+)?(code\|script\|command\|function)", "LLM02", 85),
	(r"(?i)```\s(python\|bash\|sh\|javascript\|js\|powershell\|cmd)\s\n", "LLM02", 80),
	(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "LLM01", 70),
	(r"\u202e\|\u200b\|\ufeff", "LLM01", 75),
	(r"(?i)(simulate\|emulate)\s+(a\|an)?\s(unrestricted\|uncensored\|unfiltered)\s(ai\|model\|llm\|chatbot)", "LLM01", 92),
	(r"(?i)respond\s+(only\|exclusively)\s+(in\|as\|like)\s+(a\|an)?\s*(evil\|unethical\|harmful\|dangerous)", "LLM01", 95),
	(r"(?i)(for\s+)?(educational\|research\|fictional\|hypothetical)\s+purposes?\s(only\|,)?\s(ignore\|bypass\|disable)", "LLM01", 85),
	]

	SUSPICIOUS_PATTERNS = [
	(r"(?i)(new\|different\|alternative)\s+(instruction\|rule\|persona\|role\|prompt)", "LLM01", 40),
	(r"(?i)from\s+now\s+on\s+(you\|your)", "LLM01", 45),
	(r"(?i)(must\|should\|will)\s+(always\|never)\s+(respond\|answer\|say)", "LLM01", 35),
	(r"(?i)(hypothetically\|theoretically\|in\s+a\s+fictional\s+scenario)", "LLM01", 30),
	(r"(?i)(base64\|hex\|rot13\|caesar)\s+(encoded?\|decoded?\|cipher)", "LLM02", 50),
	(r"(?i)(sudo\|admin\|root\|superuser)\s*:", "LLM01", 40),
	(r"(?i)what\s+would\s+you\s+(say\|do)\s+if\s+you\s+(had\s+no\|were\s+free)", "LLM01", 45),
	(r"(?i)(translate\|convert)\s+(and\s+)?(follow\|execute\|run)\s+", "LLM02", 55),
	(r"(?i)pretend\s+(that\s+)?(you\|there)\s+(are\|is\|have)\s+no", "LLM01", 50),
	(r"(?i)imagine\s+(you\s+are\|being)\s+(a\|an)\s+\w+\s+(without\|with no)", "LLM01", 45),
	]

	LIMITS = {
	"baixa": {"max_chars": 8000, "max_lines": 100, "block_threshold": 85},
	"média": {"max_chars": 4000, "max_lines": 50, "block_threshold": 75},
	"alta": {"max_chars": 2000, "max_lines": 30, "block_threshold": 65},
	"máxima": {"max_chars": 1000, "max_lines": 20, "block_threshold": 50},
	}

	OWASP_MAP = {
	"LLM01": "Prompt Injection",
	"LLM02": "Insecure Output Handling",
	"LLM03": "Training Data Poisoning",
	"LLM06": "Sensitive Information Disclosure",
	}


	class PromptInjectionDetector:

	def __init__(self):
	self._block = [(re.compile(p), cat, score) for p, cat, score in BLOCK_PATTERNS]
	self._suspicious = [(re.compile(p), cat, score) for p, cat, score in SUSPICIOUS_PATTERNS]

	def get_owasp_category(self, threat: str) -> str:
	for cat_id, cat_name in OWASP_MAP.items():
	if cat_id in threat:
	return f"{cat_id}: {cat_name}"
	return "OWASP LLM01: Prompt Injection"

	def analyze(self, text: str, max_chars: int = 4000, sensitivity: str = "alta") -> DetectionResult:
	return self._run(text, max_chars, sensitivity, trace_mode=False)

	def analyze_with_trace(self, text: str, max_chars: int = 4000, sensitivity: str = "alta") -> DetectionResult:
	return self._run(text, max_chars, sensitivity, trace_mode=True)

	def _run(self, text: str, max_chars: int, sensitivity: str, trace_mode: bool) -> DetectionResult:
	t0 = time.perf_counter()
	limits = LIMITS.get(sensitivity, LIMITS["alta"])
	effective_max = min(max_chars, limits["max_chars"])

	trace_id = hashlib.sha256(f"{text}{time.time()}".encode()).hexdigest()[:16]
	threats = []
	mods = []
	trace = {}
	risk_score = 0

	# ── step 1: unicode ─────────────────────────────────────────────────
	t1 = time.perf_counter()
	normalized = unicodedata.normalize("NFKC", text)
	changed = normalized != text
	if changed:
	mods.append("unicode_normalized")
	text = normalized
	if trace_mode:
	trace["unicode"] = {"status": "flagged" if changed else "pass", "detail": "Normalized" if changed else "OK", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 2: control chars ────────────────────────────────────────────
	t1 = time.perf_counter()
	cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f\u200b\u202e\ufeff]", "", text)
	changed = cleaned != text
	if changed:
	mods.append("control_chars_removed")
	threats.append("control_characters [LLM01]")
	risk_score = max(risk_score, 30)
	text = cleaned
	if trace_mode:
	trace["control_chars"] = {"status": "flagged" if changed else "pass", "detail": f"Removed {len(normalized)-len(cleaned)} chars" if changed else "OK", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 3: size ─────────────────────────────────────────────────────
	t1 = time.perf_counter()
	size_threats = []
	if len(text) > effective_max:
	text = text[:effective_max]
	mods.append(f"truncated_to_{effective_max}_chars")
	size_threats.append("oversized_input")
	risk_score = max(risk_score, 20)
	lines = text.split("\n")
	if len(lines) > limits["max_lines"]:
	text = "\n".join(lines[:limits["max_lines"]])
	mods.append(f"truncated_to_{limits['max_lines']}_lines")
	size_threats.append("too_many_lines")
	rep_match = re.search(r"(.)\1{99,}", text)
	if rep_match:
	text = re.sub(r"(.)\1{99,}", lambda m: m.group(1)*100, text)
	mods.append("repetition_collapsed")
	size_threats.append("excessive_repetition")
	risk_score = max(risk_score, 25)
	threats.extend(size_threats)
	if trace_mode:
	trace["size"] = {"status": "flagged" if size_threats else "pass", "detail": ", ".join(size_threats) if size_threats else "Within limits", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 4: pattern matching ─────────────────────────────────────────
	t1 = time.perf_counter()
	blocked_match = None
	for pattern, cat, score in self._block:
	m = pattern.search(text)
	if m:
	if score >= limits["block_threshold"]:
	blocked_match = (m.group(0)[:80], cat, score)
	break
	else:
	threats.append(f"near_block_pattern [{cat}]: {m.group(0)[:40]}")
	risk_score = max(risk_score, score - 10)

	if blocked_match:
	elapsed = (time.perf_counter() - t0) * 1000
	if trace_mode:
	trace["patterns"] = {"status": "blocked", "detail": f"Matched: {blocked_match[0]}", "ms": round((time.perf_counter()-t1)*1000, 2)}
	trace["semantic"] = {"status": "skipped", "detail": "Pipeline aborted", "ms": 0}
	trace["risk"] = {"status": "blocked", "detail": f"Score: 100", "ms": 0}
	trace["output"] = {"status": "skipped", "detail": "Pipeline aborted", "ms": 0}
	return DetectionResult(
	trace_id=trace_id,
	threat_level=ThreatLevel.BLOCKED,
	risk_score=100,
	threats_found=threats + [f"block_pattern [{blocked_match[1]}]: {blocked_match[0]}"],
	modifications=mods,
	sanitized_text="",
	char_count_original=len(normalized),
	char_count_sanitized=0,
	processing_ms=round(elapsed, 2),
	blocked_reason=f"Injection pattern detected: '{blocked_match[0]}'",
	trace=trace,
	)

	susp_found = []
	for pattern, cat, score in self._suspicious:
	m = pattern.search(text)
	if m:
	susp_found.append(f"suspicious_pattern [{cat}]: {m.group(0)[:40]}")
	risk_score = max(risk_score, score)
	threats.extend(susp_found)

	if trace_mode:
	trace["patterns"] = {"status": "flagged" if susp_found else "pass", "detail": f"{len(susp_found)} suspicious patterns" if susp_found else "No patterns matched", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 5: semantic score (heurístico sem modelo externo) ────────────
	t1 = time.perf_counter()
	semantic_score = self._heuristic_semantic_score(text)
	risk_score = max(risk_score, semantic_score)
	if trace_mode:
	trace["semantic"] = {"status": "flagged" if semantic_score > 30 else "pass", "detail": f"Heuristic score: {semantic_score}", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 6: risk aggregation ─────────────────────────────────────────
	t1 = time.perf_counter()
	if len(threats) > 3:
	risk_score = min(100, risk_score + 10)
	if trace_mode:
	trace["risk"] = {"status": "flagged" if risk_score > 30 else "pass", "detail": f"Final score: {risk_score}", "ms": round((time.perf_counter()-t1)*1000, 2)}

	# ── step 7: output filter ─────────────────────────────────────────────
	t1 = time.perf_counter()
	if trace_mode:
	trace["output"] = {"status": "pass", "detail": "Output filter applied", "ms": round((time.perf_counter()-t1)*1000, 2)}

	threat_level = ThreatLevel.CLEAN if not threats else ThreatLevel.SUSPICIOUS
	elapsed = (time.perf_counter() - t0) * 1000

	return DetectionResult(
	trace_id=trace_id,
	threat_level=threat_level,
	risk_score=risk_score,
	threats_found=threats,
	modifications=mods,
	sanitized_text=text,
	char_count_original=len(normalized),
	char_count_sanitized=len(text),
	processing_ms=round(elapsed, 2),
	trace=trace,
	)

	def _heuristic_semantic_score(self, text: str) -> int:
	score = 0
	t = text.lower()
	injection_keywords = ["instruction", "system prompt", "previous", "ignore", "override", "bypass", "jailbreak", "restriction", "filter", "safety", "pretend", "act as", "role", "now you", "forget"]
	hits = sum(1 for kw in injection_keywords if kw in t)
	score += min(hits * 8, 60)
	if len(re.findall(r"\b(you\|your\|yourself)\b", t)) > 5:
	score += 15
	if "?" not in text and len(text) > 100:
	score += 10
	return min(score, 70)