# Keyword blocklist grouped by harm category; matched case-insensitively
# as substrings (see _match_unsafe below).
UNSAFE_PATTERNS = {
    "violent_harm": ["kill", "murder", "assassinate", "hurt someone", "violently harm"],
    "child_sexual": ["csam", "child sexual", "underage porn", "minor sexual"],
    "sexual_explicit": ["explicit porn", "incest fetish", "graphic sexual"],
    "self_harm": ["kill myself", "commit suicide", "how to self-harm", "end my life"],
    "weapons_making": ["make a bomb", "build an explosive", "molotov", "ghost gun"],
    "serious_illegal": ["credit card generator", "ssn list", "hack bank", "counterfeit money", "sell drugs"],
    "dox_malware": ["doxx", "make ransomware", "build malware", "backdoor", "keylogger"],
    "bio_chem": ["bioweapon", "chemical weapon", "create nerve agent"],
}


def _match_unsafe(text):
    """Return the first matching category name, or None if nothing matches."""
    t = (text or "").lower()
    for category, needles in UNSAFE_PATTERNS.items():
        for n in needles:
            if n in t:
                return category
    return None


def safety_filter(text, mode="input"):
    """Return (text, flagged, category); `mode` ("input"/"output") currently does not change the check."""
    cat = _match_unsafe(text)
    if cat:
        return text, True, cat
    return text, False, None


def refusal_reply(category):
    """Build a brief refusal message naming the blocked category."""
    reasons = {
        "violent_harm": "violent harm",
        "child_sexual": "sexual content involving minors",
        "sexual_explicit": "explicit sexual content",
        "self_harm": "self-harm",
        "weapons_making": "weapon construction",
        "serious_illegal": "illegal activity",
        "dox_malware": "privacy or malware abuse",
        "bio_chem": "biological or chemical harm",
    }
    reason = reasons.get(category, "unsafe content")
    return (f"⚠️ I can’t help with {reason}. "
            "I can offer safer, high-level context or alternatives if you’d like.")