Pawan Mane committed on
Commit
ceb563c
Β·
1 Parent(s): d80f659

Code optimization

Browse files
app/frontend/gradio_app.py CHANGED
@@ -14,25 +14,26 @@ from app.nodes.hitl import HITLPauseException
14
 
15
  _graph = build_graph()
16
  _thread_config = {"configurable": {"thread_id": "gradio-session-001"}}
17
- _conversation_history = []
 
 
18
  _pending_hitl_state: AgentState | None = None
19
 
20
 
21
  def run_graph(query: str) -> AgentState:
22
- global _conversation_history
23
- _conversation_history.append(HumanMessage(content=query))
24
  initial_state: AgentState = {
25
- "messages": _conversation_history.copy(), "query": query,
 
26
  "route": "", "rag_context": "", "tool_calls": [], "tool_results": [],
27
  "response": "", "retry_count": 0, "hitl_approved": False,
28
- "evaluation_score": 0.0, "guardrail_passed": True,
29
  "memory_summary": "", "node_log": [],
30
  }
31
  return _graph.invoke(initial_state, config=_thread_config)
32
 
33
 
34
  def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
35
- global _conversation_history
36
  from app.nodes.evaluation import evaluation_node, eval_route
37
  from app.nodes.guardrails import guardrails_node
38
  from app.nodes.output import output_node
@@ -44,7 +45,6 @@ def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
44
  s = llm_node(s)
45
  s = guardrails_node(s)
46
  s = output_node(s)
47
- _conversation_history = s["messages"]
48
  return s
49
 
50
 
@@ -81,13 +81,7 @@ def handle_submit(user_message, chat_history):
81
  score = fs.get("evaluation_score", 0.0)
82
  g_ok = fs.get("guardrail_passed", True)
83
 
84
- # Guardrail blocked β€” remove this exchange from history so it
85
- # doesn't poison the memory summary for future innocent queries
86
- if not g_ok:
87
- global _conversation_history
88
- if _conversation_history:
89
- _conversation_history.pop()
90
-
91
  chat_history = chat_history + [bot_msg(fs.get("response", ""))]
92
  meta = f"**Route:** {route.upper() or 'β€”'} Β· **Eval:** {score:.2f} Β· **Guardrail:** {'βœ… Passed' if g_ok else '🚫 Blocked'}"
93
  return (chat_history, "", format_trace(fs.get("node_log", [])),
@@ -130,8 +124,8 @@ def handle_reject(chat_history):
130
 
131
 
132
  def handle_clear():
133
- global _conversation_history, _pending_hitl_state
134
- _conversation_history, _pending_hitl_state = [], None
135
  return [], "", "*Waiting for a query...*", "", gr.update(visible=False)
136
 
137
 
 
14
 
15
  _graph = build_graph()
16
  _thread_config = {"configurable": {"thread_id": "gradio-session-001"}}
17
+ # Frontend holds NO conversation history.
18
+ # All message history is managed inside the graph via output_node.
19
+ # LangGraph MemorySaver persists state across invocations automatically.
20
  _pending_hitl_state: AgentState | None = None
21
 
22
 
23
  def run_graph(query: str) -> AgentState:
24
+ # Just pass the query β€” graph manages its own message history via state
 
25
  initial_state: AgentState = {
26
+ "messages": [], # MemorySaver restores history; safety_node adds HumanMessage
27
+ "query": query,
28
  "route": "", "rag_context": "", "tool_calls": [], "tool_results": [],
29
  "response": "", "retry_count": 0, "hitl_approved": False,
30
+ "evaluation_score": 0.0, "guardrail_passed": True, "is_harmful": False,
31
  "memory_summary": "", "node_log": [],
32
  }
33
  return _graph.invoke(initial_state, config=_thread_config)
34
 
35
 
36
  def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
 
37
  from app.nodes.evaluation import evaluation_node, eval_route
38
  from app.nodes.guardrails import guardrails_node
39
  from app.nodes.output import output_node
 
45
  s = llm_node(s)
46
  s = guardrails_node(s)
47
  s = output_node(s)
 
48
  return s
49
 
50
 
 
81
  score = fs.get("evaluation_score", 0.0)
82
  g_ok = fs.get("guardrail_passed", True)
83
 
84
+ # History is managed entirely by output_node inside the graph
 
 
 
 
 
 
85
  chat_history = chat_history + [bot_msg(fs.get("response", ""))]
86
  meta = f"**Route:** {route.upper() or 'β€”'} Β· **Eval:** {score:.2f} Β· **Guardrail:** {'βœ… Passed' if g_ok else '🚫 Blocked'}"
87
  return (chat_history, "", format_trace(fs.get("node_log", [])),
 
124
 
125
 
126
  def handle_clear():
127
+ global _pending_hitl_state
128
+ _pending_hitl_state = None
129
  return [], "", "*Waiting for a query...*", "", gr.update(visible=False)
130
 
131
 
app/frontend/gradio_app_hf.py CHANGED
@@ -1,34 +1,29 @@
1
  """
2
  app/frontend/gradio_app_hf.py
3
  ──────────────────────────────
4
- HuggingFace Spaces entry point.
5
 
6
- Key differences from local gradio_app.py:
7
  - Reads all config from environment variables (HF injects secrets as env vars)
8
- - No .env file available on HF Spaces β€” dotenv is silenced gracefully
9
- - Runs on port 7860 (HF Spaces requirement)
10
- - PYTHONPATH=/app must be set in Dockerfile so `from app.*` imports resolve
 
 
 
11
  """
12
 
13
  import os
14
 
15
- # ── Set env flags before any app imports ──────────────────────────────────
16
- os.environ["GRADIO_MODE"] = "true"
17
- os.environ["PYTHONPATH"] = "/app"
18
-
19
- # HITL defaults to false on public spaces β€” override via HF Space Variables
20
- # All other secrets (GROQ_API_KEY, WEATHER_API_KEY, LLM_MODEL etc.)
21
- # are set in HuggingFace Space β†’ Settings β†’ Variables and Secrets
22
 
23
- # ── Silence dotenv β€” no .env file exists on HF Spaces ─────────────────────
24
- # app/config.py calls load_dotenv() which would print a warning if .env
25
- # is missing. We patch it to a no-op before config is imported.
26
  import sys
27
  from unittest.mock import MagicMock
28
  if "dotenv" not in sys.modules:
29
  sys.modules["dotenv"] = MagicMock()
30
 
31
- # ── Import the full app (config, graph, nodes all load here) ───────────────
32
  import gradio as gr
33
  from langchain_core.messages import HumanMessage
34
 
@@ -39,19 +34,18 @@ from app.frontend.css import CSS
39
 
40
 
41
  # ── Graph singleton ────────────────────────────────────────────────────────
42
- _graph = build_graph()
43
  _thread_config = {"configurable": {"thread_id": "hf-session-001"}}
44
- _conversation_history = []
45
  _pending_hitl_state: AgentState | None = None
46
 
47
 
48
  # ── Core runner ────────────────────────────────────────────────────────────
49
 
50
  def run_graph(query: str) -> AgentState:
51
- global _conversation_history
52
- _conversation_history.append(HumanMessage(content=query))
53
  initial_state: AgentState = {
54
- "messages": _conversation_history.copy(),
55
  "query": query,
56
  "route": "",
57
  "rag_context": "",
@@ -62,15 +56,14 @@ def run_graph(query: str) -> AgentState:
62
  "hitl_approved": False,
63
  "evaluation_score": 0.0,
64
  "guardrail_passed": True,
 
65
  "memory_summary": "",
66
  "node_log": [],
67
- "is_harmful": False,
68
  }
69
  return _graph.invoke(initial_state, config=_thread_config)
70
 
71
 
72
  def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
73
- global _conversation_history
74
  from app.nodes.evaluation import evaluation_node, eval_route
75
  from app.nodes.guardrails import guardrails_node
76
  from app.nodes.output import output_node
@@ -82,7 +75,6 @@ def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
82
  s = llm_node(s)
83
  s = guardrails_node(s)
84
  s = output_node(s)
85
- _conversation_history = s["messages"]
86
  return s
87
 
88
 
@@ -112,7 +104,7 @@ def bot_msg(t): return {"role": "assistant", "content": t}
112
  # ── Event handlers ─────────────────────────────────────────────────────────
113
 
114
  def handle_submit(user_message, chat_history):
115
- global _pending_hitl_state, _conversation_history
116
  if not user_message.strip():
117
  return chat_history, "", "*Waiting for a query...*", "", gr.update(visible=False), gr.update(value="")
118
 
@@ -123,10 +115,7 @@ def handle_submit(user_message, chat_history):
123
  score = fs.get("evaluation_score", 0.0)
124
  g_ok = fs.get("guardrail_passed", True)
125
 
126
- # Drop blocked exchange from history to prevent memory poisoning
127
- if not g_ok and _conversation_history:
128
- _conversation_history.pop()
129
-
130
  chat_history = chat_history + [bot_msg(fs.get("response", ""))]
131
  meta = f"**Route:** {route.upper() or 'β€”'} Β· **Eval:** {score:.2f} Β· **Guardrail:** {'βœ… Passed' if g_ok else '🚫 Blocked'}"
132
  return (chat_history, "", format_trace(fs.get("node_log", [])),
@@ -169,8 +158,8 @@ def handle_reject(chat_history):
169
 
170
 
171
  def handle_clear():
172
- global _conversation_history, _pending_hitl_state
173
- _conversation_history, _pending_hitl_state = [], None
174
  return [], "", "*Waiting for a query...*", "", gr.update(visible=False)
175
 
176
 
@@ -183,7 +172,6 @@ def build_ui():
183
 
184
  with gr.Row(equal_height=True):
185
 
186
- # ══ Main chat ═════════════════════════════════════════════
187
  with gr.Column(scale=4):
188
 
189
  with gr.Group(elem_classes="section-box"):
@@ -222,7 +210,6 @@ def build_ui():
222
  label="Examples",
223
  )
224
 
225
- # ══ Right sidebar ══════════════════════════════════════════
226
  with gr.Column(scale=1):
227
 
228
  with gr.Group(elem_classes="section-box"):
@@ -232,15 +219,17 @@ def build_ui():
232
  with gr.Group(elem_classes="section-box"):
233
  gr.Markdown("""**πŸ—Ί Graph Topology**
234
  ```
235
- START β†’ router
236
- β”œβ”€ rag β†’ llm
237
- └─ tool/general β†’ llm
238
- β”œβ”€ tool_executor
239
- └─ memory β†’ hitl
240
- β”œβ”€ evaluation
241
- β”‚ β”œβ”€ retry β†’ llm
242
- β”‚ └─ guardrails β†’ output
243
- └─ END
 
 
244
  ```""")
245
 
246
  submit_outs = [chatbot, user_input, trace_display, meta_display, hitl_panel, hitl_content]
 
1
  """
2
  app/frontend/gradio_app_hf.py
3
  ──────────────────────────────
4
+ HuggingFace Spaces entry point β€” fully synced with gradio_app.py.
5
 
6
+ Key differences from gradio_app.py:
7
  - Reads all config from environment variables (HF injects secrets as env vars)
8
+ - No .env file β€” dotenv silenced gracefully
9
+ - Port 7860 (HF Spaces requirement)
10
+ - PYTHONPATH=/app set in Dockerfile
11
+
12
+ History management: entirely inside the graph (output_node + MemorySaver).
13
+ Frontend is stateless β€” no _conversation_history here.
14
  """
15
 
16
  import os
17
 
18
+ os.environ["GRADIO_MODE"] = "true"
19
+ os.environ["PYTHONPATH"] = "/app"
 
 
 
 
 
20
 
21
+ # Silence dotenv β€” no .env on HF Spaces
 
 
22
  import sys
23
  from unittest.mock import MagicMock
24
  if "dotenv" not in sys.modules:
25
  sys.modules["dotenv"] = MagicMock()
26
 
 
27
  import gradio as gr
28
  from langchain_core.messages import HumanMessage
29
 
 
34
 
35
 
36
  # ── Graph singleton ────────────────────────────────────────────────────────
37
+ _graph = build_graph()
38
  _thread_config = {"configurable": {"thread_id": "hf-session-001"}}
39
+ # No _conversation_history β€” graph manages all history via output_node + MemorySaver
40
  _pending_hitl_state: AgentState | None = None
41
 
42
 
43
  # ── Core runner ────────────────────────────────────────────────────────────
44
 
45
  def run_graph(query: str) -> AgentState:
46
+ # messages=[] β€” MemorySaver restores prior history; safety_node adds HumanMessage
 
47
  initial_state: AgentState = {
48
+ "messages": [],
49
  "query": query,
50
  "route": "",
51
  "rag_context": "",
 
56
  "hitl_approved": False,
57
  "evaluation_score": 0.0,
58
  "guardrail_passed": True,
59
+ "is_harmful": False,
60
  "memory_summary": "",
61
  "node_log": [],
 
62
  }
63
  return _graph.invoke(initial_state, config=_thread_config)
64
 
65
 
66
  def resume_graph_after_hitl(state: AgentState, approved: bool) -> AgentState:
 
67
  from app.nodes.evaluation import evaluation_node, eval_route
68
  from app.nodes.guardrails import guardrails_node
69
  from app.nodes.output import output_node
 
75
  s = llm_node(s)
76
  s = guardrails_node(s)
77
  s = output_node(s)
 
78
  return s
79
 
80
 
 
104
  # ── Event handlers ─────────────────────────────────────────────────────────
105
 
106
  def handle_submit(user_message, chat_history):
107
+ global _pending_hitl_state
108
  if not user_message.strip():
109
  return chat_history, "", "*Waiting for a query...*", "", gr.update(visible=False), gr.update(value="")
110
 
 
115
  score = fs.get("evaluation_score", 0.0)
116
  g_ok = fs.get("guardrail_passed", True)
117
 
118
+ # History managed entirely by output_node inside the graph
 
 
 
119
  chat_history = chat_history + [bot_msg(fs.get("response", ""))]
120
  meta = f"**Route:** {route.upper() or 'β€”'} Β· **Eval:** {score:.2f} Β· **Guardrail:** {'βœ… Passed' if g_ok else '🚫 Blocked'}"
121
  return (chat_history, "", format_trace(fs.get("node_log", [])),
 
158
 
159
 
160
  def handle_clear():
161
+ global _pending_hitl_state
162
+ _pending_hitl_state = None
163
  return [], "", "*Waiting for a query...*", "", gr.update(visible=False)
164
 
165
 
 
172
 
173
  with gr.Row(equal_height=True):
174
 
 
175
  with gr.Column(scale=4):
176
 
177
  with gr.Group(elem_classes="section-box"):
 
210
  label="Examples",
211
  )
212
 
 
213
  with gr.Column(scale=1):
214
 
215
  with gr.Group(elem_classes="section-box"):
 
219
  with gr.Group(elem_classes="section-box"):
220
  gr.Markdown("""**πŸ—Ί Graph Topology**
221
  ```
222
+ START β†’ safety
223
+ β”œβ”€ blocked β†’ output β†’ END
224
+ └─ continue β†’ router
225
+ β”œβ”€ rag β†’ llm
226
+ └─ tool/general β†’ llm
227
+ β”œβ”€ tool_executor
228
+ └─ memory β†’ hitl
229
+ β”œβ”€ evaluation
230
+ β”‚ β”œβ”€ retry β†’ llm
231
+ β”‚ └─ guardrails β†’ output
232
+ └─ END
233
  ```""")
234
 
235
  submit_outs = [chatbot, user_input, trace_display, meta_display, hitl_panel, hitl_content]
app/nodes/llm_node.py CHANGED
@@ -36,25 +36,33 @@ def llm_node(state: AgentState) -> AgentState:
36
  try:
37
  # Build system prompt
38
  system_parts = [
39
- "You are a helpful AI assistant. Answer ONLY the current query concisely.",
40
- "If a query is vague or has multiple possible meanings, ask ONE short clarifying question instead of assuming.",
41
- "Do not reference or answer previous questions.",
42
  ]
43
  if state.get("rag_context"):
44
  system_parts.append(f"\nUse the following context to answer:\n{state['rag_context']}")
45
  if state.get("memory_summary"):
46
- system_parts.append(f"\nFor background context only (do NOT repeat or expand on this):\n{state['memory_summary']}")
47
 
48
  system_msg = SystemMessage(content="\n".join(system_parts))
49
 
50
- # Always send only the current query β€” never full history
51
- # Full history causes the LLM to blend previous topics
52
- current_msg = HumanMessage(content=state["query"])
 
 
 
 
 
 
53
 
54
  if state["route"] == "tool":
55
- ai_msg = _llm_with_tools.invoke([system_msg, current_msg])
 
56
  else:
57
- ai_msg = llm.invoke([system_msg, current_msg])
 
58
 
59
  tool_calls = getattr(ai_msg, "tool_calls", []) or []
60
  response_text = ai_msg.content or ""
 
36
  try:
37
  # Build system prompt
38
  system_parts = [
39
+ "You are a helpful AI assistant.",
40
+ "Answer the current query using the conversation history for context.",
41
+ "Keep responses concise and relevant.",
42
  ]
43
  if state.get("rag_context"):
44
  system_parts.append(f"\nUse the following context to answer:\n{state['rag_context']}")
45
  if state.get("memory_summary"):
46
+ system_parts.append(f"\nConversation summary so far:\n{state['memory_summary']}")
47
 
48
  system_msg = SystemMessage(content="\n".join(system_parts))
49
 
50
+ # state["messages"] = prior safe history (from MemorySaver) + current HumanMessage
51
+ # Scrub tool noise, then build: [system, h1, a1, h2, a2, ..., current_query]
52
+ from langchain_core.messages import ToolMessage, AIMessage as AI
53
+ clean = [
54
+ m for m in state["messages"]
55
+ if not isinstance(m, ToolMessage)
56
+ and not (isinstance(m, AI) and getattr(m, "tool_calls", []))
57
+ ]
58
+ messages = [system_msg] + clean
59
 
60
  if state["route"] == "tool":
61
+ # Tool route: only current query to avoid re-firing old tool calls
62
+ ai_msg = _llm_with_tools.invoke([system_msg, HumanMessage(content=state["query"])])
63
  else:
64
+ # RAG / general: full clean history for context
65
+ ai_msg = llm.invoke(messages)
66
 
67
  tool_calls = getattr(ai_msg, "tool_calls", []) or []
68
  response_text = ai_msg.content or ""
app/nodes/output.py CHANGED
@@ -1,11 +1,38 @@
1
- """app/nodes/output.py β€” Final output node"""
2
- from langchain_core.messages import AIMessage
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from app.state import AgentState
4
 
5
 
6
  def output_node(state: AgentState) -> AgentState:
7
- ai_message = AIMessage(content=state["response"])
8
- updated_messages = state["messages"] + [ai_message]
9
- log = state.get("node_log", []) + ["output"]
10
- print(f"\nπŸ€– {state['response']}\n")
11
- return {**state, "messages": updated_messages, "node_log": log}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app/nodes/output.py
3
+ ────────────────────
4
+ Single source of truth for message history.
5
+
6
+ Flow per turn:
7
+ gradio sends: messages=[] (empty β€” MemorySaver restores checkpoint history)
8
+ safety adds: HumanMessage(query) to messages
9
+ output_node:
10
+ - harmful/blocked β†’ drop the HumanMessage, keep prior history clean
11
+ - safe β†’ keep HumanMessage + append AIMessage(response)
12
+
13
+ MemorySaver then persists the updated messages for next turn.
14
+ """
15
+ from langchain_core.messages import AIMessage, HumanMessage
16
  from app.state import AgentState
17
 
18
 
19
  def output_node(state: AgentState) -> AgentState:
20
+ log = state.get("node_log", []) + ["output"]
21
+ response = state["response"]
22
+ messages = list(state["messages"])
23
+ is_harmful = state.get("is_harmful", False)
24
+ guardrail_ok = state.get("guardrail_passed", True)
25
+
26
+ if is_harmful or not guardrail_ok:
27
+ # Drop the HumanMessage for this turn β€” never pollute history
28
+ messages = [m for m in messages
29
+ if not (isinstance(m, HumanMessage) and m.content == state["query"])]
30
+ print(f"\nπŸ€– {response}\n")
31
+ print("[OUTPUT] Harmful turn scrubbed from history.")
32
+ else:
33
+ # Safe β€” HumanMessage already in messages (added by safety_node)
34
+ # Just append the assistant response
35
+ messages = messages + [AIMessage(content=response)]
36
+ print(f"\nπŸ€– {response}\n")
37
+
38
+ return {**state, "messages": messages, "node_log": log}
app/nodes/safety.py CHANGED
@@ -1,20 +1,12 @@
1
  """
2
  app/nodes/safety.py
3
  ────────────────────
4
- Production safety pattern: Constitutional AI approach.
 
 
5
 
6
- The LLM that generates responses is also the best judge of whether
7
- a query is harmful β€” it has full language understanding and context.
8
-
9
- Pattern used by: Anthropic, Cohere, Mistral AI
10
- β†’ System prompt + structured classification prompt
11
- β†’ Confidence threshold to avoid false positives
12
- β†’ Fail-open on errors (don't block legitimate users)
13
-
14
- Two checks:
15
- 1. Query intent β€” is the user asking for something harmful?
16
- 2. Response check β€” did the LLM accidentally produce harmful output?
17
- (guardrails.py handles this after response is generated)
18
  """
19
  import json
20
  from langchain_core.messages import SystemMessage, HumanMessage
@@ -23,7 +15,7 @@ from app.utils.llm import llm
23
 
24
  SAFE_FALLBACK = "I'm sorry, I can't help with that request."
25
 
26
- SYSTEM_PROMPT = """You are a content safety classifier.
27
  Your only job is to analyze user queries and determine if they request harmful content.
28
  Be precise β€” do not over-block. Only flag clearly harmful requests."""
29
 
@@ -32,7 +24,7 @@ CLASSIFICATION_PROMPT = """Analyze this query and classify it.
32
  HARMFUL categories (flag these):
33
  - Violence: instructions to physically harm, kill, or assault people
34
  - Unauthorized access: hacking, stealing credentials, bypassing security
35
- - Weapons: making bombs, explosives, illegal weapons
36
  - Illegal substances: synthesizing or manufacturing drugs
37
  - Harassment: stalking, doxxing, non-consensual tracking
38
  - Self-harm: methods to hurt oneself or suicide instructions
@@ -41,27 +33,35 @@ HARMFUL categories (flag these):
41
  NOT harmful (do not flag these):
42
  - Competitive contexts: "beat my friend at chess", "win an argument"
43
  - Security education: conceptual explanations of how attacks work
44
- - Fiction: stories, games, hypotheticals with dark themes
45
- - Legitimate anger: "I'm so frustrated I could scream"
46
  - Medical: drug interactions, symptoms, treatments
47
  - History/news: discussing past violent events
48
 
49
  Query: "{query}"
50
 
51
- Respond ONLY in this exact JSON format:
52
- {{"harmful": true/false, "category": "violence|hacking|weapons|drugs|harassment|self_harm|hate|safe", "confidence": 0.0-1.0, "reason": "one sentence"}}"""
53
 
54
 
55
  def safety_node(state: AgentState) -> AgentState:
56
- query = state.get("query", "")
57
- log = state.get("node_log", [])
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  try:
60
  response = llm.invoke([
61
  SystemMessage(content=SYSTEM_PROMPT),
62
  HumanMessage(content=CLASSIFICATION_PROMPT.format(query=query)),
63
  ])
64
-
65
  raw = response.content.strip().removeprefix("```json").removesuffix("```").strip()
66
  result = json.loads(raw)
67
 
@@ -70,18 +70,11 @@ def safety_node(state: AgentState) -> AgentState:
70
  category = result.get("category", "safe")
71
  reason = result.get("reason", "")
72
 
73
- # IST timestamp for every query
74
- from datetime import datetime, timezone, timedelta
75
- IST = timezone(timedelta(hours=5, minutes=30))
76
- ts = datetime.now(IST).strftime("%d %b %Y %I:%M:%S %p IST")
77
- print(f"[{ts}] [User Query] β€” {query}")
78
-
79
- # Require high confidence to avoid false positives on edge cases
80
- # e.g. "how to beat someone at chess" should NOT be blocked
81
  if harmful and confidence >= 0.85:
82
  print(f"[SAFETY] 🚫 Blocked β€” {category} ({confidence:.0%}): {reason}")
83
  return {
84
  **state,
 
85
  "is_harmful": True,
86
  "guardrail_passed": False,
87
  "response": SAFE_FALLBACK,
@@ -89,17 +82,24 @@ def safety_node(state: AgentState) -> AgentState:
89
  }
90
 
91
  if harmful and confidence < 0.85:
92
- # Ambiguous β€” let it through, LLM + guardrails will handle
93
- print(f"[SAFETY] ⚠️ Ambiguous ({category} {confidence:.0%}) β€” allowing through: {reason}")
94
 
95
  print(f"[SAFETY] βœ… Safe β€” {reason}")
96
- return {**state, "is_harmful": False, "node_log": log + ["safety βœ…"]}
 
 
 
 
 
97
 
98
  except Exception as e:
99
- # Fail-open: if classifier breaks, don't block legitimate users
100
- # Guardrails backstop will still catch harmful responses
101
- print(f"[SAFETY] Classifier error ({e}) β€” fail-open, passing through")
102
- return {**state, "is_harmful": False, "node_log": log + ["safety (error→allowed)"]}
 
 
 
103
 
104
 
105
  def safety_route(state: AgentState) -> str:
 
1
  """
2
  app/nodes/safety.py
3
  ────────────────────
4
+ FIRST node in graph. Does two things:
5
+ 1. Adds HumanMessage(query) to messages β€” so history is always updated here
6
+ 2. Classifies query intent β€” harmful queries short-circuit to output_node
7
 
8
+ output_node will scrub the HumanMessage if harmful.
9
+ output_node will keep it + add AIMessage if safe.
 
 
 
 
 
 
 
 
 
 
10
  """
11
  import json
12
  from langchain_core.messages import SystemMessage, HumanMessage
 
15
 
16
  SAFE_FALLBACK = "I'm sorry, I can't help with that request."
17
 
18
+ SYSTEM_PROMPT = """You are a content safety classifier.
19
  Your only job is to analyze user queries and determine if they request harmful content.
20
  Be precise β€” do not over-block. Only flag clearly harmful requests."""
21
 
 
24
  HARMFUL categories (flag these):
25
  - Violence: instructions to physically harm, kill, or assault people
26
  - Unauthorized access: hacking, stealing credentials, bypassing security
27
+ - Weapons: making bombs, explosives, illegal weapons
28
  - Illegal substances: synthesizing or manufacturing drugs
29
  - Harassment: stalking, doxxing, non-consensual tracking
30
  - Self-harm: methods to hurt oneself or suicide instructions
 
33
  NOT harmful (do not flag these):
34
  - Competitive contexts: "beat my friend at chess", "win an argument"
35
  - Security education: conceptual explanations of how attacks work
36
+ - Fiction/creative writing with dark themes
 
37
  - Medical: drug interactions, symptoms, treatments
38
  - History/news: discussing past violent events
39
 
40
  Query: "{query}"
41
 
42
+ JSON only: {{"harmful": true/false, "category": "violence|hacking|weapons|drugs|harassment|self_harm|hate|safe", "confidence": 0.0-1.0, "reason": "one sentence"}}"""
 
43
 
44
 
45
  def safety_node(state: AgentState) -> AgentState:
46
+ query = state.get("query", "")
47
+ messages = list(state.get("messages", []))
48
+ log = state.get("node_log", [])
49
+
50
+ # ── Add HumanMessage to history first ────────────────────────────────
51
+ # output_node will scrub it if harmful, keep it if safe
52
+ messages = messages + [HumanMessage(content=query)]
53
+
54
+ # ── IST timestamp ─────────────────────────────────────────────────────
55
+ from datetime import datetime, timezone, timedelta
56
+ IST = timezone(timedelta(hours=5, minutes=30))
57
+ ts = datetime.now(IST).strftime("%d %b %Y %I:%M:%S %p IST")
58
+ print(f"[{ts}] [User Query] β€” {query}")
59
 
60
  try:
61
  response = llm.invoke([
62
  SystemMessage(content=SYSTEM_PROMPT),
63
  HumanMessage(content=CLASSIFICATION_PROMPT.format(query=query)),
64
  ])
 
65
  raw = response.content.strip().removeprefix("```json").removesuffix("```").strip()
66
  result = json.loads(raw)
67
 
 
70
  category = result.get("category", "safe")
71
  reason = result.get("reason", "")
72
 
 
 
 
 
 
 
 
 
73
  if harmful and confidence >= 0.85:
74
  print(f"[SAFETY] 🚫 Blocked β€” {category} ({confidence:.0%}): {reason}")
75
  return {
76
  **state,
77
+ "messages": messages, # HumanMessage included β€” output_node will scrub
78
  "is_harmful": True,
79
  "guardrail_passed": False,
80
  "response": SAFE_FALLBACK,
 
82
  }
83
 
84
  if harmful and confidence < 0.85:
85
+ print(f"[SAFETY] ⚠️ Ambiguous ({category} {confidence:.0%}) β€” allowing: {reason}")
 
86
 
87
  print(f"[SAFETY] βœ… Safe β€” {reason}")
88
+ return {
89
+ **state,
90
+ "messages": messages,
91
+ "is_harmful": False,
92
+ "node_log": log + ["safety βœ…"],
93
+ }
94
 
95
  except Exception as e:
96
+ print(f"[SAFETY] Classifier error ({e}) β€” fail-open")
97
+ return {
98
+ **state,
99
+ "messages": messages,
100
+ "is_harmful": False,
101
+ "node_log": log + ["safety (error→allowed)"],
102
+ }
103
 
104
 
105
  def safety_route(state: AgentState) -> str: