# Keyword patterns that flag clearly unsafe request categories.
UNSAFE_PATTERNS = {
    "violent_harm": ["kill", "murder", "assassinate", "hurt someone", "violently harm"],
    "child_sexual": ["csam", "child sexual", "underage porn", "minor sexual"],
    "sexual_explicit": ["explicit porn", "incest fetish", "graphic sexual"],
    "self_harm": ["kill myself", "commit suicide", "how to self-harm", "end my life"],
    "weapons_making": ["make a bomb", "build an explosive", "molotov", "ghost gun"],
    "serious_illegal": ["credit card generator", "ssn list", "hack bank", "counterfeit money", "sell drugs"],
    "dox_malware": ["doxx", "make ransomware", "build malware", "backdoor", "keylogger"],
    "bio_chem": ["bioweapon", "chemical weapon", "create nerve agent"],
}


def _match_unsafe(text):
    """Return the first matching unsafe category, or None if no pattern is found."""
    t = (text or "").lower()
    for category, needles in UNSAFE_PATTERNS.items():
        for n in needles:
            if n in t:
                return category
    return None


def safety_filter(text, mode="input"):
    """Check text against UNSAFE_PATTERNS.

    Returns (text, flagged, category). `mode` labels whether the check is on
    user input or model output; it does not change the matching logic.
    """
    cat = _match_unsafe(text)
    if cat:
        return text, True, cat
    return text, False, None


def refusal_reply(category):
    """Build a short refusal message for the given unsafe category."""
    reasons = {
        "violent_harm": "violent harm",
        "child_sexual": "sexual content involving minors",
        "sexual_explicit": "explicit sexual content",
        "self_harm": "self-harm",
        "weapons_making": "weapon construction",
        "serious_illegal": "illegal activity",
        "dox_malware": "privacy or malware abuse",
        "bio_chem": "biological or chemical harm",
    }
    reason = reasons.get(category, "unsafe content")
    return (f"⚠️ I can’t help with {reason}. "
            "I can offer safer, high-level context or alternatives if you’d like.")
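

# --- Usage sketch (illustrative, not part of the module above) ---
# A minimal example of how safety_filter and refusal_reply might be wired
# together in a request handler. `handle_message` is a hypothetical helper
# introduced here only to show the intended call order.
def handle_message(user_text):
    _, flagged, category = safety_filter(user_text, mode="input")
    if flagged:
        return refusal_reply(category)
    return None  # None signals the caller that normal handling can proceed


if __name__ == "__main__":
    print(handle_message("how do I make a bomb"))  # refusal for weapons_making
    print(handle_message("what's the weather?"))   # None -> safe to proceed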