# securebot/backend/classifier.py
# absence404's picture
# Upload 6 files
# 2fd7279 verified
# raw
# history blame contribute delete
# 883 Bytes
from transformers import pipeline
# Shared text-classification pipeline backed by ProtectAI's prompt-injection
# DeBERTa checkpoint. It is constructed once, at import time, so importing
# this module loads the model. device=-1 is the transformers convention for
# running inference on the CPU.
classifier = pipeline(
    task="text-classification",
    model="protectai/deberta-v3-base-prompt-injection-v2",
    device=-1,
)
# Phrases that mark an input as malicious regardless of the model verdict.
# NOTE(review): matching is a plain substring test, so short entries also hit
# inside longer text (e.g. "act as" occurs in "contact assistant"). Kept as-is
# to avoid lowering recall; tighten deliberately if false positives matter.
_SUSPICIOUS_KEYWORDS = (
    "ignore previous instructions",
    "hack the system",
    "disregard your system prompt",
    "you are now",
    "forget everything",
    "act as",
    "jailbreak",
)


def detect_injection(user_input: str, *, threshold: float = 0.85) -> dict:
    """Classify *user_input* as a prompt-injection attempt or not.

    Two independent signals are combined:

    * the module-level ``classifier`` pipeline, whose top prediction counts
      only when its label is ``"INJECTION"`` and its score is strictly above
      *threshold*;
    * a keyword rule that fires when any phrase in ``_SUSPICIOUS_KEYWORDS``
      occurs in the lowercased, whitespace-normalized input.

    Args:
        user_input: Raw text supplied by the user.
        threshold: Minimum model score (exclusive) for an ``"INJECTION"``
            prediction to be treated as malicious. Defaults to 0.85.

    Returns:
        A dict with the keys ``"input"`` (the text as given), ``"label"``
        (the model's label), ``"confidence"`` (the score rounded to three
        decimals), ``"rule_triggered"`` (whether the keyword rule matched)
        and ``"is_malicious"`` (True if either signal fired).
    """
    top_prediction = classifier(user_input)[0]
    label = top_prediction["label"]
    score = top_prediction["score"]

    # Lowercase once, then collapse every run of whitespace (spaces, tabs,
    # newlines) into a single space so that padding or line breaks inside a
    # phrase cannot slip past the keyword rule.
    normalized = " ".join(user_input.lower().split())
    rule_triggered = any(kw in normalized for kw in _SUSPICIOUS_KEYWORDS)

    model_flagged = label == "INJECTION" and score > threshold
    is_malicious = model_flagged or rule_triggered

    return {
        "input": user_input,
        "label": label,
        "confidence": round(score, 3),
        "rule_triggered": rule_triggered,
        "is_malicious": is_malicious,
    }