Spaces:

1337XCode
/

personabot-api

Running

personabot-api / app /security /guard_classifier.py

GitHub Actions

Deploy 2aeaf16

661c2d6 about 1 month ago

8.53 kB

	import re
	import os

	from app.core.logging import get_logger

	logger = get_logger(__name__)


	class GuardClassifier:
	"""
	GuardClassifier integrates a fine-tuned DistilBERT instance aiming
	for strictly relevant inputs (>0.70 confidence threshold).
	If the fine-tuned model path does not exist (or torch is not installed),
	it falls back to permissive Regex heuristics.
	"""
	def __init__(self, model_path: str = "fine_tuning/guard_classifier/model"):
	self.model_path = model_path
	self._model = None
	self._tokenizer = None

	if os.path.exists(self.model_path) and os.listdir(self.model_path):
	try:
	import torch # noqa: F811 — lazy import, not installed in CI or prod API
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	logger.info("Loading GuardClassifier model from %s", self.model_path)
	self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)
	self._model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
	self._model.eval()
	except ImportError:
	logger.warning("torch/transformers not installed, falling back to rule-based guard.")
	self._model = None
	except Exception as e:
	logger.warning("Failed to load DistilBERT Guard model, falling back to rule-based: %s", e)
	self._model = None
	else:
	logger.info("GuardClassifier model path not found, falling back to rule-based.")

	def is_safe_and_relevant(self, query: str) -> bool:
	"""Wrapper to maintain existing pipeline signature."""
	safe, score = self.is_in_scope(query)
	return safe

	def is_in_scope(self, text: str) -> tuple[bool, float]:
	"""
	Returns (is_in_scope, confidence_score).
	Threshold: 0.70. Below threshold -> out of scope.
	"""
	if not self._model or not self._tokenizer:
	result = self._rule_based_check(text)
	return (result, 1.0 if result else 0.0)

	try:
	import torch

	inputs = self._tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
	with torch.no_grad():
	outputs = self._model(**inputs)

	probs = torch.softmax(outputs.logits, dim=-1)[0]
	in_scope_prob = probs[1].item()

	is_in_scope = in_scope_prob >= 0.70
	return (is_in_scope, float(in_scope_prob))

	except Exception as e:
	logger.warning("Inference error, reverting to rules: %s", e)
	result = self._rule_based_check(text)
	return (result, 1.0 if result else 0.0)

	# Compiled once at class load — cheaper than recompiling per call.
	_INJECTION_PATTERNS: list = []

	@classmethod
	def _build_patterns(cls) -> list:
	"""Compile and cache all injection-detection regexes."""
	if cls._INJECTION_PATTERNS:
	return cls._INJECTION_PATTERNS

	raw = [
	# ── Classic prompt injection ──────────────────────────────────────
	r"ignore\s+(all\s+)?(previous\|prior\|above\|earlier)\s+(instructions?\|prompts?\|rules?\|context)",
	r"disregard\s+(all\s+)?(previous\|prior\|above)\s+(instructions?\|prompts?\|rules?\|context)",
	r"forget\s+(everything\|all\s+(previous\|prior\|your))",
	r"override\s+(your\s+)?(instructions?\|rules?\|directives?\|constraints?)",
	r"bypass\s+your\s+(restrictions?\|safety\|filters?\|rules?\|instructions?)",
	r"(do\s+not\s+follow\|stop\s+following)\s+(your\s+)?(instructions?\|rules?\|guidelines?)",

	# ── System prompt extraction ──────────────────────────────────────
	r"(repeat\|print\|output\|reveal\|show\|display\|dump\|share)\s+(your\s+)?(system\s+)?(prompt\|instructions?\|rules?\|directives?\|constraints?\|message)",
	r"what\s+(are\|were)\s+your\s+(instructions?\|rules?\|system\s+prompt\|directives?)",
	r"(tell\|show)\s+me\s+(your\s+)?(system\|initial\|original\|hidden\|secret)\s+(prompt\|instructions?\|message)",
	r"\bsystem\s+message\b",

	# ── Role / persona jailbreaks ─────────────────────────────────────
	r"you\s+are\s+now\s+(a\s+\|an\s+)?(?!(darshan\|assistant))",
	r"(pretend\|act\|behave)\s+(like\|as\s+if)\s+you\s+(are\|have\s+no\|don.t\s+have)",
	r"(pretend\|imagine\|assume\|suppose)\s+you\s+(are\|were)\s+(a\s+\|an\s+)?(?!(darshan))",
	r"roleplay\s+as",
	r"(simulate\|impersonate)\s+(a\s+\|an\s+)?(different\|other\|unrestricted\|evil\|jailbroken)",
	r"(act\|respond)\s+as\s+if\s+you\s+(have\s+no\|don.t\s+have)\s+(restrictions?\|rules?\|guidelines?\|filters?\|safety)",
	r"you\s+(have\s+no\|don.t\s+have)\s+(restrictions?\|rules?\|limits?\|filters?)",
	r"\bdan\s+(mode\|prompt\|jailbreak)\b",
	r"developer\s+mode",
	r"jailbreak\b",
	r"unrestricted\s+(mode\|access\|version\|ai)",
	r"no\s+filter(s\|ed)?\s+(mode\|version\|response)",

	# ── Hypothetical / simulation bypass (meta-instruction targeted only) ─────
	# Note: kept narrow on purpose — Darshan has security/infosec repos and
	# visitors may legitimately ask about prompt injection, exploits, bypass
	# techniques, etc. as topics. These patterns only fire when they are
	# clearly attempts to change the bot's behaviour, not discuss a topic.
	r"in\s+a\s+(simulation\|hypothetical\|imaginary\|alternate)\s+(scenario\|world\|universe).{0,30}(no\s+rules?\|no\s+restrictions?\|you\s+can)",
	r"(act\|respond\|behave).{0,20}as\s+if.{0,20}(no\s+restrictions?\|no\s+rules?\|unrestricted\|jailbroken)",

	# ── User private-info extraction ──────────────────────────────────
	r"(what\|share\|give\|show\|tell).{0,20}(user.{0,10})?(email\|phone\|address\|password\|credit.?card\|ssn\|date.of.birth\|location\|ip.?address)",
	r"(collect\|store\|log\|extract\|retrieve\|access).{0,20}(user\|visitor\|personal)\s+(data\|info\|information\|details)",
	r"(do\s+you\s+have\|can\s+you\s+access).{0,20}(my\|the\s+user.s?)\s+(email\|phone\|data\|address\|password)",

	# ── Reputation / defamation attacks ──────────────────────────────
	r"(say\|write\|tell\|claim\|state)\s+(that\s+)?darshan\s+(is\|was\|has\s+been).{0,40}(bad\|stupid\|incompetent\|fraud\|liar\|criminal\|terrible\|fake\|cheat)",
	r"(make\|portray\|describe)\s+darshan.{0,20}(negatively\|badly\|unfavorably\|as\s+a\s+(fraud\|liar\|failure))",
	r"write\s+a\s+(negative\|bad\|false\|defamatory\|fake).{0,20}(review\|statement\|claim).{0,20}(about\|of)\s+darshan",
	r"(discredit\|slander\|defame\|insult\|mock)\s+darshan",

	# ── Instruction injection via delimiters ──────────────────────────
	r"<\\|\s(system\|user\|assistant\|im_start\|im_end)\s\\|>",
	r"<<\ssys\s>>",
	r"\[\sinst\s\]",
	r"---\ssystem\s---",
	r"#+\ssystem\sprompt",
	r"#+\s*new\s+instructions?",

	# ── Training-data poisoning signals ──────────────────────────────
	r"(add\|inject\|insert\|plant\|embed)\s+(this\|the\s+following\|text\|instructions?)\s+(into\|in)\s+(your\s+)?(training\|context\|memory\|knowledge)",
	r"remember\s+(this\|the\s+following)\s+(for\s+(future\|all\|every)\|always)",
	r"from\s+now\s+on\s+(you\s+)?(must\|will\|should\|always)",
	r"update\s+your\s+(instructions?\|rules?\|behaviour\|system\s+prompt)",
	]

	cls._INJECTION_PATTERNS = [re.compile(p, re.IGNORECASE) for p in raw]
	return cls._INJECTION_PATTERNS

	def _rule_based_check(self, text: str) -> bool:
	"""Block on any known injection pattern; permissive otherwise."""
	for pattern in self._build_patterns():
	if pattern.search(text):
	return False
	return True