Spaces:
Running
Running
| from transformers import pipeline | |
# Build the prompt-injection text classifier once, at import time, so every
# call to the detector reuses the same loaded model.
# device=-1 is the transformers convention for running on CPU.
classifier = pipeline(
    task="text-classification",
    model="protectai/deberta-v3-base-prompt-injection-v2",
    device=-1,
)
# Phrases that flag an input as malicious regardless of the model verdict.
# Defined once at module level instead of being rebuilt on every call.
_SUSPICIOUS_KEYWORDS = (
    "ignore previous instructions",
    "hack the system",
    "disregard your system prompt",
    "you are now",
    "forget everything",
    "act as",
    "jailbreak",
)

# An "INJECTION" label only counts when the model score is strictly above this.
_INJECTION_SCORE_THRESHOLD = 0.85


def detect_injection(user_input: str) -> dict:
    """Classify *user_input* as a prompt-injection attempt or not.

    Two independent signals are combined:

    * the module-level ``classifier`` pipeline, whose first result supplies
      a ``label`` and a ``score``;
    * a keyword rule that looks for known attack phrases as case-insensitive
      substrings of the input.

    The input is reported as malicious when the model labels it
    ``"INJECTION"`` with a score above 0.85, or when the keyword rule fires.

    Args:
        user_input: Raw text to inspect.

    Returns:
        A dict with the keys ``input`` (the text as given), ``label`` (model
        label), ``confidence`` (model score rounded to 3 decimals),
        ``rule_triggered`` (bool) and ``is_malicious`` (bool).
    """
    # NOTE(review): the text is passed to the pipeline without explicit
    # truncation arguments; confirm how over-long inputs should be handled.
    result = classifier(user_input)[0]
    label = result["label"]
    score = result["score"]

    # Lower-case once and collapse every run of whitespace (spaces, tabs,
    # newlines) to a single space. Without this, padding such as
    # "ignore   previous\ninstructions" slips past the substring check.
    # Anything the plain lower-cased text matched still matches here.
    normalized = " ".join(user_input.lower().split())
    rule_triggered = any(kw in normalized for kw in _SUSPICIOUS_KEYWORDS)

    model_flagged = label == "INJECTION" and score > _INJECTION_SCORE_THRESHOLD
    is_malicious = model_flagged or rule_triggered

    return {
        "input": user_input,
        "label": label,
        "confidence": round(score, 3),
        "rule_triggered": rule_triggered,
        "is_malicious": is_malicious,
    }