Spaces:
Running
Running
"""
app/nodes/guardrails.py — CHECKPOINT 8: Guardrails

Final backstop — checks the LLM's response for harmful output.
Safety node handles query intent. Guardrails handles response content.
Stricter than before — flags self-defense instructions targeting body parts,
detailed fight techniques, and similar content that safety node may have missed.
"""
| import json | |
| from langchain_core.messages import SystemMessage, HumanMessage | |
| from app.state import AgentState | |
| from app.utils.llm import llm | |
# Canned refusal substituted for any response the guardrail flags as harmful.
SAFE_FALLBACK = "I'm sorry, I can't help with that request."

# System role for the checker call: frames the model as a strict auditor of
# AI output rather than a conversational assistant.
GUARDRAIL_SYSTEM = """You are a strict AI response safety checker.
Your job is to catch harmful content in AI responses before they reach users."""

# Per-response check prompt. {response} is filled with the (truncated) AI
# response text; the checker must reply with a bare JSON verdict object.
GUARDRAIL_PROMPT = """Does this AI response contain harmful content?
Flag if the response provides ANY of:
- Instructions to physically harm people (including "self-defense" techniques targeting body parts)
- Hacking, credential theft, or unauthorized access instructions
- Weapon construction or acquisition methods
- Drug synthesis or procurement
- Detailed methods to stalk, harass, or surveil someone
- Suicide or self-harm methods
DO NOT flag:
- General conceptual explanations (e.g., "SQL injection is a type of attack")
- Refusals and safety warnings
- Medical information from a health perspective
- Historical or news context
Response to check:
"{response}"
JSON only: {{"harmful": true/false, "reason": "one sentence"}}"""
def guardrails_node(state: AgentState) -> AgentState:
    """Final safety backstop: audit the generated response for harmful content.

    Skips the check entirely when the safety node already flagged the query
    (``is_harmful``). Otherwise asks the LLM to classify the response; if the
    verdict is harmful, the response is replaced with ``SAFE_FALLBACK`` and
    ``guardrail_passed`` is set to False.

    Fail-open by design: if the checker call or its JSON parsing raises, the
    response passes through unchanged rather than blocking the user.

    Args:
        state: Current agent state; reads ``response``, ``node_log``,
            ``is_harmful``.

    Returns:
        Updated state with ``guardrail_passed`` and an appended ``node_log``
        entry (or the original state untouched when ``is_harmful`` is set).
    """
    response = state.get("response", "")
    log = state.get("node_log", [])

    # Safety node already blocked this query upstream — nothing left to check.
    if state.get("is_harmful"):
        return state

    # Keep the try body to the only operations that can plausibly raise:
    # the LLM call and the JSON parse. The blocked-path logic lives in `else`.
    try:
        check = llm.invoke([
            SystemMessage(content=GUARDRAIL_SYSTEM),
            # Truncate to bound the checker prompt's token cost.
            HumanMessage(content=GUARDRAIL_PROMPT.format(response=response[:2000])),
        ])
        # Models often wrap JSON in a ```json fence despite instructions.
        raw = check.content.strip().removeprefix("```json").removesuffix("```").strip()
        result = json.loads(raw)
    except Exception as e:  # fail-open: a broken checker must not block users
        print(f"[GUARDRAILS] Check error ({e}) — passing through.")
    else:
        if result.get("harmful", False):
            reason = result.get("reason", "")
            print(f"[GUARDRAILS] ⚠️ Response blocked — {reason}")
            return {
                **state,
                "guardrail_passed": False,
                "response": SAFE_FALLBACK,
                "node_log": log + [f"guardrails (BLOCKED: {reason})"],
            }
        print("[GUARDRAILS] ✅ Passed.")

    return {**state, "guardrail_passed": True, "node_log": log + ["guardrails ✅"]}