File size: 1,429 Bytes
c4f4657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

from dataclasses import dataclass
from typing import Dict, Any, List, Optional

from .rules import match_rules


@dataclass
class DetectionResult:
    """Outcome of one jailbreak-detection pass over a prompt/output pair.

    Returned by ``JailbreakDetector.score``; immutable by convention.
    """

    # Normalized risk in [0, 1] (capped via min() in the scorer).
    risk_score: float
    # Every matched rule dict, prompt hits first, then output hits.
    fired_rules: List[Dict[str, Any]]
    # Bookkeeping about where rules fired (prompt vs. output buckets).
    metadata: Dict[str, Any]


class JailbreakDetector:
    """
    Lightweight, rule-based jailbreak detector.

    - Looks at prompt, output, or both.
    - Returns a normalized risk score 0–1 + which patterns fired.
    """

    def __init__(self, consider_output: bool = True, scale: float = 3.0):
        """
        Args:
            consider_output: If True, also run the rules over the model
                output (when one is supplied to ``score``).
            scale: Total rule weight at which the risk score saturates
                at 1.0. Defaults to 3.0, preserving prior behavior.
        """
        self.consider_output = consider_output
        self.scale = scale

    def score(self, prompt: str, output: Optional[str] = None) -> DetectionResult:
        """Run the rules over ``prompt`` (and optionally ``output``).

        Args:
            prompt: User prompt text; ``None``/empty is treated as "".
            output: Optional model output; only scanned when
                ``consider_output`` is enabled and the text is non-empty.

        Returns:
            A ``DetectionResult`` with the capped risk score, the list of
            fired rule dicts, and per-source metadata buckets.
        """
        source_flags: Dict[str, List[Dict[str, Any]]] = {
            "prompt_rules": [],
            "output_rules": [],
        }

        prompt_hits = match_rules(prompt or "")
        source_flags["prompt_rules"] = prompt_hits

        all_hits = list(prompt_hits)

        if self.consider_output and output:
            out_hits = match_rules(output)
            source_flags["output_rules"] = out_hits
            all_hits.extend(out_hits)

        # Sum rule weights and normalize: weights >= `scale` map to 1.0.
        total_weight = sum(h["weight"] for h in all_hits)
        risk_score = min(1.0, total_weight / self.scale)

        return DetectionResult(
            risk_score=risk_score,
            fired_rules=all_hits,
            metadata=source_flags,
        )