File size: 883 Bytes
2fd7279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from transformers import pipeline

# Module-wide text-classification pipeline, constructed once at import time and
# used by detect_injection() below. device=-1 asks transformers to run on CPU
# (per the transformers pipeline documentation).
classifier = pipeline(
    task="text-classification",
    device=-1,
    model="protectai/deberta-v3-base-prompt-injection-v2",
)

# Phrases that flag an input as malicious regardless of the model's verdict.
# Matched case-insensitively as substrings of the whitespace-normalized input.
# Defined at module level so the tuple is built once, not on every call.
_SUSPICIOUS_KEYWORDS = (
    "ignore previous instructions",
    "hack the system",
    "disregard your system prompt",
    "you are now",
    "forget everything",
    "act as",
    "jailbreak",
)

# Minimum model score (exclusive) for an "INJECTION" label to count on its own.
_INJECTION_THRESHOLD = 0.85


def detect_injection(user_input: str) -> dict:
    """Classify *user_input* as a possible prompt-injection attempt.

    Two independent signals are combined:

    * the module-level ``classifier`` pipeline, whose top result must be
      labelled ``"INJECTION"`` with a score above 0.85, and
    * a keyword rule that fires when any phrase in ``_SUSPICIOUS_KEYWORDS``
      occurs in the input.

    Args:
        user_input: The raw text to inspect.

    Returns:
        A dict with the keys:
        ``"input"`` (the original, unmodified text),
        ``"label"`` (the classifier's label for its first result),
        ``"confidence"`` (the classifier's score rounded to 3 decimals),
        ``"rule_triggered"`` (whether the keyword rule matched), and
        ``"is_malicious"`` (True if either signal fired).
    """
    # The pipeline returns a list of results; only the first entry is used.
    result = classifier(user_input)[0]
    label = result["label"]
    score = result["score"]

    # Normalize once, outside the keyword loop: lowercase for case-insensitive
    # matching, and collapse every run of whitespace (spaces, tabs, newlines)
    # to a single space so that "ignore   previous\ninstructions" cannot slip
    # past the rule merely by varying the spacing.
    normalized = " ".join(user_input.lower().split())
    rule_triggered = any(kw in normalized for kw in _SUSPICIOUS_KEYWORDS)

    model_flagged = label == "INJECTION" and score > _INJECTION_THRESHOLD
    is_malicious = model_flagged or rule_triggered

    return {
        "input": user_input,
        "label": label,
        "confidence": round(score, 3),
        "rule_triggered": rule_triggered,
        "is_malicious": is_malicious,
    }