"""Guardrails — two independent layers around the chat model.

Layer 1, ``check_input(text)``:
    A regex / keyword blocklist that screens user input for common jailbreak
    and prompt-injection phrasing BEFORE it reaches the model. Fast,
    deterministic, and makes no API call. Tuned to "moderate": only
    well-known attack phrases, so false positives stay low.

Layer 2, ``moderate_output(text)``:
    Sends the model's reply to a Claude Haiku moderation model (the one named
    by ``settings.moderation_model``) together with a moderation rubric, and
    blocks unsafe content. Tuned to a "standard safety set" (violence/weapons
    facilitation, illegal acts, hate/harassment, sexual content involving
    minors, self-harm encouragement) while still allowing normal discussion
    of sensitive topics.

Both layers return a ``GuardrailResult`` so callers can branch on the verdict
and the UI can explain what happened.
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass

from src.config import settings
from src.observability import observe


# --- Shared result type ---------------------------------------------------


@dataclass
class GuardrailResult:
    """Verdict produced by a guardrail layer."""

    # True when the checked text must not be passed on / shown.
    blocked: bool
    # Human-readable explanation of why it was blocked; defaults to "".
    reason: str = ""


# Canned replies shown to the user in place of blocked content.
INPUT_REFUSAL = " ".join(
    (
        "I can't help with that request.",
        "It looks like an attempt to bypass my safety guidelines.",
        "Feel free to rephrase if I misunderstood.",
    )
)

OUTPUT_REFUSAL = (
    "[Response withheld by the output moderation guardrail "
    "because it may be unsafe.]"
)


# --- Layer 1: input blocklist ---------------------------------------------

# "Moderate" set: well-known jailbreak / prompt-injection patterns. Matching
# is case-insensitive unless noted at the pattern itself. The patterns are
# kept deliberately specific to avoid blocking benign questions (e.g. we match
# "ignore previous instructions", not the bare word "ignore").
_JAILBREAK_PATTERNS: list[str] = [
    r"ignore (all )?(previous|prior|above) (instructions|prompts)",
    r"disregard (all )?(previous|prior|above) (instructions|prompts)",
    r"forget (all )?(your |the )?(previous |prior )?(instructions|rules)",
    r"\bDAN\b",  # "Do Anything Now" jailbreak
    r"do anything now",
    r"developer mode",
    r"jailbreak",
    r"pretend (you|to be) .*(no|without).*(rules|restrictions|filters)",
    r"act as (if you are |an? )?(unfiltered|unrestricted|uncensored)",
    r"you have no (rules|restrictions|guidelines|filters)",
    r"bypass (your |the )?(safety|content|moderation|guidelines)",
    r"reveal (your |the )?(system|hidden) prompt",
    r"\bSTAN\b",  # another common jailbreak persona
]

# Persona acronyms are matched case-sensitively. Compiled with IGNORECASE they
# would also hit the given names "Dan" / "Stan" and the slang verb "stan",
# blocking ordinary messages. The spelled-out phrase "do anything now" is
# still matched case-insensitively by its own pattern.
_CASE_SENSITIVE_PATTERNS: frozenset[str] = frozenset({r"\bDAN\b", r"\bSTAN\b"})

_COMPILED_PATTERNS = [
    re.compile(p, 0 if p in _CASE_SENSITIVE_PATTERNS else re.IGNORECASE)
    for p in _JAILBREAK_PATTERNS
]


def check_input(text: str) -> GuardrailResult:
    """Screen user input against the jailbreak blocklist.

    Args:
        text: Raw user message.

    Returns:
        ``GuardrailResult(blocked=True, reason=...)`` naming the first
        pattern that matched, or ``GuardrailResult(blocked=False)`` when no
        pattern matches.
    """
    for pattern in _COMPILED_PATTERNS:
        if pattern.search(text):
            return GuardrailResult(
                blocked=True,
                reason=f"Input matched a jailbreak pattern: /{pattern.pattern}/",
            )
    return GuardrailResult(blocked=False)


# --- Layer 2: output moderation via Claude Haiku --------------------------

_MODERATION_RUBRIC = """You are a content-safety classifier. Decide whether the \
ASSISTANT RESPONSE below should be blocked.

Block (flagged=true) ONLY if the response contains any of:
- Instructions or facilitation for violence, weapons, or attacks
- Help committing a serious crime (hacking, fraud, drug synthesis, etc.)
- Hateful or harassing content targeting a protected group
- Sexual content involving minors
- Encouragement or instructions for self-harm or suicide

Do NOT block: normal, factual, or neutral discussion of sensitive topics; \
refusals; safety advice; fiction without harmful instructions.

Respond with ONLY a JSON object, no other text:
{"flagged": true|false, "category": "", "reason": ""}

ASSISTANT RESPONSE:
"""


def _extract_json(raw: str) -> str:
    """Pull the JSON object out of a model reply, tolerating ```json fences.

    Models often wrap JSON in a markdown code fence despite being told not
    to. The fenced object is preferred when present; otherwise the span from
    the first "{" to the last "}" is used. If neither is found, ``raw`` is
    returned unchanged and the caller's JSON parsing decides what to do.
    """
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if fenced:
        return fenced.group(1)
    braces = re.search(r"\{.*\}", raw, re.DOTALL)
    return braces.group(0) if braces else raw


@observe(as_type="generation", name="output_moderation")
def moderate_output(text: str) -> GuardrailResult:
    """Classify the assistant's reply with Haiku; block if flagged.

    Fails OPEN (allows the text) if no API key is configured or the call
    errors, since the model output has already passed the model's own safety
    training — the moderation layer is defense-in-depth, not the only line of
    defense. Failures are logged as warnings so that a moderation outage is
    visible to operators instead of passing silently.

    Args:
        text: The assistant reply to classify.

    Returns:
        ``GuardrailResult(blocked=True, reason=...)`` when the classifier's
        JSON verdict has a truthy ``flagged`` field, otherwise
        ``GuardrailResult(blocked=False)``.
    """
    if not settings.anthropic_api_key:
        return GuardrailResult(blocked=False)

    import logging

    from anthropic import Anthropic

    try:
        client = Anthropic(api_key=settings.anthropic_api_key)
        resp = client.messages.create(
            model=settings.moderation_model,
            max_tokens=256,
            temperature=0,
            messages=[{"role": "user", "content": _MODERATION_RUBRIC + text}],
        )
        # Concatenate only the text blocks of the reply before parsing.
        raw = "".join(b.text for b in resp.content if b.type == "text").strip()
        verdict = json.loads(_extract_json(raw))
        if verdict.get("flagged"):
            cat = verdict.get("category", "unknown")
            reason = verdict.get("reason", "")
            return GuardrailResult(
                blocked=True,
                reason=f"Output moderation flagged content ({cat}): {reason}",
            )
        return GuardrailResult(blocked=False)
    except Exception:  # noqa: BLE001 - never crash the chat on a moderation hiccup
        # Still fail open, but leave a trace: API errors and unparseable
        # verdicts both land here.
        logging.getLogger(__name__).warning(
            "Output moderation failed; allowing response (fail-open)",
            exc_info=True,
        )
        return GuardrailResult(blocked=False)