Spaces:
Running
Running
"""
app/nodes/guardrails.py — CHECKPOINT 8: Guardrails

Final backstop — checks the LLM's response for harmful output.
Safety node handles query intent. Guardrails handles response content.
Stricter than before — flags self-defense instructions targeting body parts,
detailed fight techniques, and similar content that safety node may have missed.
"""
| import json | |
| from langchain_core.messages import SystemMessage, HumanMessage | |
| from app.state import AgentState | |
| from app.utils.llm import llm | |
# Canned refusal substituted for any response the guardrail flags as harmful.
SAFE_FALLBACK = "I'm sorry, I can't help with that request."

# System role for the checker call: frames the model as a strict auditor of
# AI output rather than a conversational assistant.
GUARDRAIL_SYSTEM = """You are a strict AI response safety checker.
Your job is to catch harmful content in AI responses before they reach users."""

# Per-response check prompt. {response} is filled with the (truncated) AI
# response text; the checker must reply with a bare JSON verdict object.
GUARDRAIL_PROMPT = """Does this AI response contain harmful content?
Flag if the response provides ANY of:
- Instructions to physically harm people (including "self-defense" techniques targeting body parts)
- Hacking, credential theft, or unauthorized access instructions
- Weapon construction or acquisition methods
- Drug synthesis or procurement
- Detailed methods to stalk, harass, or surveil someone
- Suicide or self-harm methods
DO NOT flag:
- General conceptual explanations (e.g., "SQL injection is a type of attack")
- Refusals and safety warnings
- Medical information from a health perspective
- Historical or news context
Response to check:
"{response}"
JSON only: {{"harmful": true/false, "reason": "one sentence"}}"""
def guardrails_node(state: AgentState) -> AgentState:
    """Final safety backstop: audit the generated response for harmful content.

    Skips the check entirely when the safety node already flagged the query
    (``is_harmful``). Otherwise asks the LLM to classify the response; if the
    verdict is harmful, the response is replaced with ``SAFE_FALLBACK`` and
    ``guardrail_passed`` is set to False.

    Fail-open by design: if the checker call or its JSON parsing raises, the
    response passes through unchanged rather than blocking the user.

    Args:
        state: Current agent state; reads ``response``, ``node_log``,
            ``is_harmful``.

    Returns:
        Updated state with ``guardrail_passed`` and an appended ``node_log``
        entry (or the original state untouched when ``is_harmful`` is set).
    """
    response = state.get("response", "")
    log = state.get("node_log", [])

    # Safety node already blocked this query upstream — nothing left to check.
    if state.get("is_harmful"):
        return state

    # Keep the try body to the only operations that can plausibly raise:
    # the LLM call and the JSON parse. The blocked-path logic lives in `else`.
    try:
        check = llm.invoke([
            SystemMessage(content=GUARDRAIL_SYSTEM),
            # Truncate to bound the checker prompt's token cost.
            HumanMessage(content=GUARDRAIL_PROMPT.format(response=response[:2000])),
        ])
        # Models often wrap JSON in a ```json fence despite instructions.
        raw = check.content.strip().removeprefix("```json").removesuffix("```").strip()
        result = json.loads(raw)
    except Exception as e:  # fail-open: a broken checker must not block users
        print(f"[GUARDRAILS] Check error ({e}) — passing through.")
    else:
        if result.get("harmful", False):
            reason = result.get("reason", "")
            print(f"[GUARDRAILS] ⚠️ Response blocked — {reason}")
            return {
                **state,
                "guardrail_passed": False,
                "response": SAFE_FALLBACK,
                "node_log": log + [f"guardrails (BLOCKED: {reason})"],
            }
        print("[GUARDRAILS] ✅ Passed.")

    return {**state, "guardrail_passed": True, "node_log": log + ["guardrails ✅"]}