import unicodedata

from transformers import pipeline

# Classifier score above which an "INJECTION" label counts as malicious.
INJECTION_THRESHOLD = 0.85

# Phrases that flag an input on their own, regardless of the model verdict.
# Kept lowercase with single spaces so they can be compared directly against
# the output of _normalize().
SUSPICIOUS_KEYWORDS = (
    "ignore previous instructions",
    "hack the system",
    "disregard your system prompt",
    "you are now",
    "forget everything",
    "act as",
    "jailbreak",
)

# Loaded once at import time so every call reuses the same model.
# device=-1 selects the CPU in the transformers pipeline API.
# NOTE(review): truncation/max_length=512 cap long inputs to the model's
# window; confirm 512 against the model card before relying on it.
classifier = pipeline(
    "text-classification",
    model="protectai/deberta-v3-base-prompt-injection-v2",
    truncation=True,
    max_length=512,
    device=-1,
)


def _normalize(text: str) -> str:
    """Return *text* case-folded, NFKD-normalized, with whitespace collapsed."""
    # NFKD maps compatibility characters (e.g. fullwidth letters) to their
    # plain forms without composing anything, so ASCII letters stay intact.
    folded = unicodedata.normalize("NFKD", text).casefold()
    # split()/join turns every run of whitespace (spaces, tabs, newlines)
    # into a single space, so "ignore   previous\ninstructions" still matches.
    return " ".join(folded.split())


def _matches_rule(text: str) -> bool:
    """Return True if the normalized *text* contains any suspicious phrase."""
    normalized = _normalize(text)
    return any(keyword in normalized for keyword in SUSPICIOUS_KEYWORDS)


def detect_injection(
    user_input: str, *, threshold: float = INJECTION_THRESHOLD
) -> dict:
    """Classify *user_input* as a possible prompt injection.

    Two independent signals are combined: the text-classification model's
    verdict and a keyword rule. The input is reported as malicious when the
    model labels it "INJECTION" with a score strictly above *threshold*, or
    when the keyword rule fires.

    Args:
        user_input: The raw text to inspect.
        threshold: Minimum model score (exclusive) for an "INJECTION" label
            to count as malicious. Defaults to ``INJECTION_THRESHOLD``.

    Returns:
        A dict with the keys ``input`` (the text as given), ``label`` and
        ``confidence`` (the model's top label and its score rounded to three
        decimals), ``rule_triggered`` (keyword rule result) and
        ``is_malicious`` (the combined verdict).
    """
    # The pipeline returns a list of predictions; only the first is used.
    prediction = classifier(user_input)[0]
    label = prediction["label"]
    score = prediction["score"]

    rule_triggered = _matches_rule(user_input)
    model_flagged = label == "INJECTION" and score > threshold

    return {
        "input": user_input,
        "label": label,
        "confidence": round(score, 3),
        "rule_triggered": rule_triggered,
        "is_malicious": model_flagged or rule_triggered,
    }