# kriti0608's picture
# Create src/rules.py
# 66e8062 verified
import re
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class JailbreakRule:
    """A single regex-based jailbreak detection rule.

    Attributes:
        name: Short identifier for the rule.
        pattern: Compiled regular expression that the rule searches for.
        description: Human-readable explanation of what the rule detects.
        weight: Numeric weight attached to the rule; defaults to 1.0.
    """

    name: str
    pattern: re.Pattern
    description: str
    weight: float = 1.0
# Built-in jailbreak heuristics, written as (name, regex, description, weight)
# rows. Every regex is compiled case-insensitively (re.I).
_RAW_RULES = [
    JailbreakRule(
        name=rule_name,
        pattern=re.compile(regex, re.I),
        description=summary,
        weight=score,
    )
    for rule_name, regex, summary, score in (
        (
            "ignore_safety",
            r"\bignore (all )?(previous )?(rules|instructions|safety)\b",
            "Tries to override or ignore prior safety instructions.",
            1.2,
        ),
        (
            "dan_style",
            r"\bDAN\b|\bdo anything now\b",
            "Classic 'DAN' jailbreak pattern.",
            1.5,
        ),
        (
            "uncensored_mode",
            r"\buncensored\b|\bno restrictions\b|\bno filter\b",
            "Requests uncensored / unrestricted behavior.",
            1.3,
        ),
        (
            "jailbreak_word",
            r"\bjailbreak\b",
            "Explicitly mentions jailbreak.",
            1.1,
        ),
        (
            "pretend_roleplay",
            r"\bpretend to be\b|\broleplay as\b",
            "Asks the model to roleplay as an unsafe persona.",
            1.0,
        ),
        (
            "override_system",
            r"\byou are not an (AI|assistant)\b|\byou must always answer\b",
            "Attempts to override system-level identity or policy.",
            1.4,
        ),
    )
]
def get_rules() -> List[JailbreakRule]:
    """Return a new list containing the built-in jailbreak rules.

    The returned list is a shallow copy of the module-level rule table, so
    adding or removing entries does not affect it. The ``JailbreakRule``
    objects themselves are shared, not copied.
    """
    return list(_RAW_RULES)
def match_rules(text: str) -> List[Dict[str, Any]]:
    """Return every rule match found in *text*, with spans for debugging.

    Rules are scanned in the order they appear in ``_RAW_RULES``. Within one
    rule, matches are reported in the order ``finditer`` yields them. A rule
    that matches several times produces one hit per match.

    Args:
        text: The string to scan. Empty or ``None`` input yields no hits.

    Returns:
        A list with one dict per match, holding the keys ``"rule"``,
        ``"description"``, ``"start"``, ``"end"``, ``"match_text"`` and
        ``"weight"``. ``start``/``end`` are the match offsets into *text*.
    """
    # Nothing can match empty input, and finditer(None) would raise TypeError.
    if not text:
        return []

    hits: List[Dict[str, Any]] = []
    for rule in _RAW_RULES:
        for m in rule.pattern.finditer(text):
            hits.append(
                {
                    "rule": rule.name,
                    "description": rule.description,
                    "start": m.start(),
                    "end": m.end(),
                    "match_text": m.group(0),
                    "weight": rule.weight,
                }
            )
    return hits