import re
import os
from app.core.logging import get_logger
logger = get_logger(__name__)
class GuardClassifier:
    """
    Guardrail classifier for incoming chat queries.

    Primary path: a fine-tuned DistilBERT sequence classifier that accepts
    strictly relevant inputs (>= 0.70 confidence threshold). If the model
    directory is missing/empty, or torch/transformers are not installed,
    it falls back to permissive regex heuristics that only block known
    prompt-injection / abuse patterns.
    """

    def __init__(self, model_path: str = "fine_tuning/guard_classifier/model"):
        """Attempt to load the fine-tuned model; degrade to rules on any failure.

        Args:
            model_path: Directory containing a HuggingFace-format model/tokenizer.
        """
        self.model_path = model_path
        self._model = None
        self._tokenizer = None
        # isdir (not exists) guards against model_path being a stray file,
        # which would make os.listdir raise NotADirectoryError.
        if os.path.isdir(self.model_path) and os.listdir(self.model_path):
            try:
                import torch  # noqa: F811 — lazy import, not installed in CI or prod API
                from transformers import AutoTokenizer, AutoModelForSequenceClassification
                logger.info("Loading GuardClassifier model from %s", self.model_path)
                self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)
                self._model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
                self._model.eval()  # inference only — disable dropout etc.
            except ImportError:
                logger.warning("torch/transformers not installed, falling back to rule-based guard.")
                self._model = None
            except Exception as e:
                logger.warning("Failed to load DistilBERT Guard model, falling back to rule-based: %s", e)
                self._model = None
        else:
            logger.info("GuardClassifier model path not found, falling back to rule-based.")

    def is_safe_and_relevant(self, query: str) -> bool:
        """Wrapper to maintain existing pipeline signature."""
        safe, _score = self.is_in_scope(query)
        return safe

    def is_in_scope(self, text: str) -> tuple[bool, float]:
        """
        Classify *text* as in- or out-of-scope.

        Returns:
            (is_in_scope, confidence_score). Threshold: 0.70; below it the
            query is out of scope. When the ML model is unavailable, the
            regex fallback reports confidence 1.0 (pass) or 0.0 (block).
        """
        if not self._model or not self._tokenizer:
            return self._fallback(text)
        try:
            import torch
            inputs = self._tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            with torch.no_grad():
                outputs = self._model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]
            # Label index 1 is treated as the "in scope" class — presumably
            # matches the fine-tuning label order; TODO confirm against
            # the training script.
            in_scope_prob = probs[1].item()
            is_in_scope = in_scope_prob >= 0.70
            return (is_in_scope, float(in_scope_prob))
        except Exception as e:
            logger.warning("Inference error, reverting to rules: %s", e)
            return self._fallback(text)

    def _fallback(self, text: str) -> tuple[bool, float]:
        """Map the boolean rule check onto the (in_scope, score) contract."""
        ok = self._rule_based_check(text)
        return (ok, 1.0 if ok else 0.0)

    # Compiled once at class load — cheaper than recompiling per call.
    _INJECTION_PATTERNS: list = []

    @classmethod
    def _build_patterns(cls) -> list:
        """Compile and cache all injection-detection regexes."""
        if cls._INJECTION_PATTERNS:
            return cls._INJECTION_PATTERNS
        raw = [
            # ── Classic prompt injection ──────────────────────────────────────
            r"ignore\s+(all\s+)?(previous|prior|above|earlier)\s+(instructions?|prompts?|rules?|context)",
            r"disregard\s+(all\s+)?(previous|prior|above)\s+(instructions?|prompts?|rules?|context)",
            r"forget\s+(everything|all\s+(previous|prior|your))",
            r"override\s+(your\s+)?(instructions?|rules?|directives?|constraints?)",
            r"bypass\s+your\s+(restrictions?|safety|filters?|rules?|instructions?)",
            r"(do\s+not\s+follow|stop\s+following)\s+(your\s+)?(instructions?|rules?|guidelines?)",
            # ── System prompt extraction ──────────────────────────────────────
            r"(repeat|print|output|reveal|show|display|dump|share)\s+(your\s+)?(system\s+)?(prompt|instructions?|rules?|directives?|constraints?|message)",
            r"what\s+(are|were)\s+your\s+(instructions?|rules?|system\s+prompt|directives?)",
            r"(tell|show)\s+me\s+(your\s+)?(system|initial|original|hidden|secret)\s+(prompt|instructions?|message)",
            r"\bsystem\s+message\b",
            # ── Role / persona jailbreaks ─────────────────────────────────────
            r"you\s+are\s+now\s+(a\s+|an\s+)?(?!(darshan|assistant))",
            r"(pretend|act|behave)\s+(like|as\s+if)\s+you\s+(are|have\s+no|don.t\s+have)",
            r"(pretend|imagine|assume|suppose)\s+you\s+(are|were)\s+(a\s+|an\s+)?(?!(darshan))",
            r"roleplay\s+as",
            r"(simulate|impersonate)\s+(a\s+|an\s+)?(different|other|unrestricted|evil|jailbroken)",
            r"(act|respond)\s+as\s+if\s+you\s+(have\s+no|don.t\s+have)\s+(restrictions?|rules?|guidelines?|filters?|safety)",
            r"you\s+(have\s+no|don.t\s+have)\s+(restrictions?|rules?|limits?|filters?)",
            r"\bdan\s+(mode|prompt|jailbreak)\b",
            r"developer\s+mode",
            r"jailbreak\b",
            r"unrestricted\s+(mode|access|version|ai)",
            r"no\s+filter(s|ed)?\s+(mode|version|response)",
            # ── Hypothetical / simulation bypass (meta-instruction targeted only) ─────
            # Note: kept narrow on purpose — Darshan has security/infosec repos and
            # visitors may legitimately ask about prompt injection, exploits, bypass
            # techniques, etc. as topics. These patterns only fire when they are
            # clearly attempts to change the *bot's behaviour*, not discuss a topic.
            r"in\s+a\s+(simulation|hypothetical|imaginary|alternate)\s+(scenario|world|universe).{0,30}(no\s+rules?|no\s+restrictions?|you\s+can)",
            r"(act|respond|behave).{0,20}as\s+if.{0,20}(no\s+restrictions?|no\s+rules?|unrestricted|jailbroken)",
            # ── User private-info extraction ──────────────────────────────────
            r"(what|share|give|show|tell).{0,20}(user.{0,10})?(email|phone|address|password|credit.?card|ssn|date.of.birth|location|ip.?address)",
            r"(collect|store|log|extract|retrieve|access).{0,20}(user|visitor|personal)\s+(data|info|information|details)",
            r"(do\s+you\s+have|can\s+you\s+access).{0,20}(my|the\s+user.s?)\s+(email|phone|data|address|password)",
            # ── Reputation / defamation attacks ──────────────────────────────
            r"(say|write|tell|claim|state)\s+(that\s+)?darshan\s+(is|was|has\s+been).{0,40}(bad|stupid|incompetent|fraud|liar|criminal|terrible|fake|cheat)",
            r"(make|portray|describe)\s+darshan.{0,20}(negatively|badly|unfavorably|as\s+a\s+(fraud|liar|failure))",
            r"write\s+a\s+(negative|bad|false|defamatory|fake).{0,20}(review|statement|claim).{0,20}(about|of)\s+darshan",
            r"(discredit|slander|defame|insult|mock)\s+darshan",
            # ── Instruction injection via delimiters ──────────────────────────
            r"<\|\s*(system|user|assistant|im_start|im_end)\s*\|>",
            r"<<\s*sys\s*>>",
            r"\[\s*inst\s*\]",
            r"---\s*system\s*---",
            r"#+\s*system\s*prompt",
            r"#+\s*new\s+instructions?",
            # ── Training-data poisoning signals ──────────────────────────────
            r"(add|inject|insert|plant|embed)\s+(this|the\s+following|text|instructions?)\s+(into|in)\s+(your\s+)?(training|context|memory|knowledge)",
            r"remember\s+(this|the\s+following)\s+(for\s+(future|all|every)|always)",
            r"from\s+now\s+on\s+(you\s+)?(must|will|should|always)",
            r"update\s+your\s+(instructions?|rules?|behaviour|system\s+prompt)",
        ]
        cls._INJECTION_PATTERNS = [re.compile(p, re.IGNORECASE) for p in raw]
        return cls._INJECTION_PATTERNS

    def _rule_based_check(self, text: str) -> bool:
        """Block on any known injection pattern; permissive otherwise."""
        for pattern in self._build_patterns():
            if pattern.search(text):
                return False
        return True