"""Regex-based rules for spotting jailbreak-style phrasing in text."""

import re
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class JailbreakRule:
    """A single named regex rule.

    Each rule pairs a compiled pattern with a human-readable description
    and a numeric weight that is copied into every hit the rule produces.
    """

    # Short identifier reported as the "rule" key of a hit.
    name: str
    # Compiled pattern scanned over the input text.
    pattern: re.Pattern
    # Human-readable explanation reported alongside each hit.
    description: str
    # Numeric weight attached to each hit; how it is aggregated is up to
    # the caller (not defined in this module).
    weight: float = 1.0


# A small but realistic set of jailbreak-style patterns.
# All patterns are compiled once at import time and are case-insensitive.
_RAW_RULES = [
    JailbreakRule(
        name="ignore_safety",
        pattern=re.compile(
            r"\bignore (all )?(previous )?(rules|instructions|safety)\b",
            re.I,
        ),
        description="Tries to override or ignore prior safety instructions.",
        weight=1.2,
    ),
    JailbreakRule(
        name="dan_style",
        # NOTE(review): with re.I, r"\bDAN\b" also matches the given name
        # "Dan" / "dan" -- confirm that false-positive rate is acceptable.
        pattern=re.compile(r"\bDAN\b|\bdo anything now\b", re.I),
        description="Classic 'DAN' jailbreak pattern.",
        weight=1.5,
    ),
    JailbreakRule(
        name="uncensored_mode",
        pattern=re.compile(
            r"\buncensored\b|\bno restrictions\b|\bno filter\b",
            re.I,
        ),
        description="Requests uncensored / unrestricted behavior.",
        weight=1.3,
    ),
    JailbreakRule(
        name="jailbreak_word",
        pattern=re.compile(r"\bjailbreak\b", re.I),
        description="Explicitly mentions jailbreak.",
        weight=1.1,
    ),
    JailbreakRule(
        name="pretend_roleplay",
        pattern=re.compile(r"\bpretend to be\b|\broleplay as\b", re.I),
        description="Asks the model to roleplay as an unsafe persona.",
        weight=1.0,
    ),
    JailbreakRule(
        name="override_system",
        pattern=re.compile(
            r"\byou are not an (AI|assistant)\b|\byou must always answer\b",
            re.I,
        ),
        description="Attempts to override system-level identity or policy.",
        weight=1.4,
    ),
]


def get_rules() -> List[JailbreakRule]:
    """Return the configured rules as a new list.

    The list itself is a shallow copy, so callers may add or remove
    entries without affecting the module-level table; the rule objects
    inside it are shared, not copied.
    """
    return list(_RAW_RULES)


def match_rules(text: str) -> List[Dict[str, Any]]:
    """Return every rule hit found in *text*, with spans for debugging.

    Hits are ordered by rule (in table order) and, within a rule, by
    position in *text*. A rule that matches several times yields one hit
    per non-overlapping match.

    Each hit is a dict with the keys ``rule``, ``description``, ``start``,
    ``end``, ``match_text`` and ``weight``; ``start``/``end`` are the
    match offsets into *text*.
    """
    hits: List[Dict[str, Any]] = []
    for rule in _RAW_RULES:
        for m in rule.pattern.finditer(text):
            hits.append(
                {
                    "rule": rule.name,
                    "description": rule.description,
                    "start": m.start(),
                    "end": m.end(),
                    "match_text": m.group(0),
                    "weight": rule.weight,
                }
            )
    return hits