kriti0608 commited on
Commit
c4f4657
·
verified ·
1 Parent(s): 66e8062

Create src/detector.py

Browse files
Files changed (1) hide show
  1. src/detector.py +49 -0
src/detector.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from dataclasses import dataclass
3
+ from typing import Dict, Any, List, Optional
4
+
5
+ from .rules import match_rules
6
+
7
+
8
+ @dataclass
9
+ class DetectionResult:
10
+ risk_score: float
11
+ fired_rules: List[Dict[str, Any]]
12
+ metadata: Dict[str, Any]
13
+
14
+
15
+ class JailbreakDetector:
16
+ """
17
+ Lightweight, rule-based jailbreak detector.
18
+
19
+ - Looks at prompt, output, or both.
20
+ - Returns a normalized risk score 0–1 + which patterns fired.
21
+ """
22
+
23
+ def __init__(self, consider_output: bool = True):
24
+ self.consider_output = consider_output
25
+
26
+ def score(self, prompt: str, output: Optional[str] = None) -> DetectionResult:
27
+ combined_text = prompt or ""
28
+ source_flags = {"prompt_rules": [], "output_rules": []}
29
+
30
+ prompt_hits = match_rules(prompt or "")
31
+ source_flags["prompt_rules"] = prompt_hits
32
+
33
+ all_hits = list(prompt_hits)
34
+
35
+ if self.consider_output and output:
36
+ out_hits = match_rules(output)
37
+ source_flags["output_rules"] = out_hits
38
+ all_hits.extend(out_hits)
39
+
40
+ # Compute a simple normalized risk score
41
+ total_weight = sum(h["weight"] for h in all_hits)
42
+ # Cap score to [0,1] using a simple normalization
43
+ risk_score = min(1.0, total_weight / 3.0) # 3.0 is arbitrary scale
44
+
45
+ return DetectionResult(
46
+ risk_score=risk_score,
47
+ fired_rules=all_hits,
48
+ metadata=source_flags,
49
+ )