# LangGraph-Agent / app/nodes/guardrails.py
# Author: Pawan Mane — commit "Code optimization" (2633945)
"""
app/nodes/guardrails.py β€” CHECKPOINT 8: Guardrails
Final backstop β€” checks the LLM's response for harmful output.
Safety node handles query intent. Guardrails handles response content.
Stricter than before β€” flags self-defense instructions targeting body parts,
detailed fight techniques, and similar content that safety node may have missed.
"""
import json
from langchain_core.messages import SystemMessage, HumanMessage
from app.state import AgentState
from app.utils.llm import llm
# Canned refusal substituted for the model's answer when the guardrail blocks it.
SAFE_FALLBACK = "I'm sorry, I can't help with that request."
# System prompt framing the checker LLM as a strict response-safety judge.
GUARDRAIL_SYSTEM = """You are a strict AI response safety checker.
Your job is to catch harmful content in AI responses before they reach users."""
# User-prompt template; {response} receives the (truncated) LLM answer.
# The doubled braces {{ }} survive str.format() as literal JSON braces.
GUARDRAIL_PROMPT = """Does this AI response contain harmful content?
Flag if the response provides ANY of:
- Instructions to physically harm people (including "self-defense" techniques targeting body parts)
- Hacking, credential theft, or unauthorized access instructions
- Weapon construction or acquisition methods
- Drug synthesis or procurement
- Detailed methods to stalk, harass, or surveil someone
- Suicide or self-harm methods
DO NOT flag:
- General conceptual explanations (e.g., "SQL injection is a type of attack")
- Refusals and safety warnings
- Medical information from a health perspective
- Historical or news context
Response to check:
"{response}"
JSON only: {{"harmful": true/false, "reason": "one sentence"}}"""
def guardrails_node(state: AgentState) -> AgentState:
    """CHECKPOINT 8: final backstop that screens the LLM's *response*.

    The safety node judges the user's query; this node judges the generated
    answer. It asks the checker LLM for a JSON verdict and, when the answer
    is flagged, replaces it with ``SAFE_FALLBACK``.

    FIX: repaired mojibake (UTF-8 read as latin-1) in the print/log strings,
    restoring the intended em dash and checkmark characters.

    Args:
        state: Current agent state; reads ``response``, ``node_log`` and
            ``is_harmful``.

    Returns:
        The state unchanged when the safety node already flagged the query;
        otherwise a new state with ``guardrail_passed`` set and, on a block,
        ``response`` replaced and the reason appended to ``node_log``.
    """
    response = state.get("response", "")
    log = state.get("node_log", [])

    # Safety node already refused this query — nothing left to screen.
    if state.get("is_harmful"):
        return state

    try:
        check = llm.invoke([
            SystemMessage(content=GUARDRAIL_SYSTEM),
            # Truncate to 2000 chars to keep the checker prompt bounded.
            HumanMessage(content=GUARDRAIL_PROMPT.format(response=response[:2000])),
        ])
        # Strip an optional ```json ... ``` fence before parsing the verdict.
        raw = check.content.strip().removeprefix("```json").removesuffix("```").strip()
        result = json.loads(raw)
        if result.get("harmful", False):
            reason = result.get("reason", "")
            print(f"[GUARDRAILS] ⚠️ Response blocked — {reason}")
            return {
                **state,
                "guardrail_passed": False,
                "response": SAFE_FALLBACK,
                "node_log": log + [f"guardrails (BLOCKED: {reason})"],
            }
    except Exception as e:
        # Fail open: a broken checker must never take down the whole pipeline.
        print(f"[GUARDRAILS] Check error ({e}) — passing through.")

    print("[GUARDRAILS] ✅ Passed.")
    return {**state, "guardrail_passed": True, "node_log": log + ["guardrails ✅"]}