Spaces:
Running
Running
File size: 1,429 Bytes
c4f4657 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from dataclasses import dataclass
from typing import Dict, Any, List, Optional
from .rules import match_rules
@dataclass
class DetectionResult:
    """Outcome of a single jailbreak-detection pass."""

    # Normalized risk score in the range [0, 1].
    risk_score: float
    # The rule dicts that matched the scanned text.
    fired_rules: List[Dict[str, Any]]
    # Supplementary context, e.g. which rules fired per text source.
    metadata: Dict[str, Any]
class JailbreakDetector:
    """
    Lightweight, rule-based jailbreak detector.
    - Looks at prompt, output, or both.
    - Returns a normalized risk score 0–1 + which patterns fired.
    """

    # Divisor used to normalize the summed rule weights into [0, 1].
    # Arbitrary scale: a few medium-weight hits saturate the score.
    _SCORE_SCALE: float = 3.0

    def __init__(self, consider_output: bool = True):
        # When True, the model output (if supplied to score()) is scanned
        # in addition to the prompt.
        self.consider_output = consider_output

    def score(self, prompt: str, output: Optional[str] = None) -> DetectionResult:
        """Score a prompt (and optionally a model output) for jailbreak risk.

        Args:
            prompt: The user prompt; a falsy value is treated as "".
            output: Optional model output; scanned only when
                ``consider_output`` is True.

        Returns:
            DetectionResult with risk_score capped to [0, 1], the combined
            list of fired rules, and per-source rule hits in metadata.
        """
        source_flags: Dict[str, List[Dict[str, Any]]] = {
            "prompt_rules": [],
            "output_rules": [],
        }

        prompt_hits = match_rules(prompt or "")
        source_flags["prompt_rules"] = prompt_hits
        all_hits = list(prompt_hits)

        if self.consider_output and output:
            out_hits = match_rules(output)
            source_flags["output_rules"] = out_hits
            all_hits.extend(out_hits)

        # Compute a simple normalized risk score from the summed weights,
        # capped to [0, 1].  (Each hit dict is expected to carry a numeric
        # "weight" key — established by the sum below; verify in .rules.)
        total_weight = sum(h["weight"] for h in all_hits)
        risk_score = min(1.0, total_weight / self._SCORE_SCALE)

        return DetectionResult(
            risk_score=risk_score,
            fired_rules=all_hits,
            metadata=source_flags,
        )
|