Pawan Mane committed
Commit · 1d6b948
Parent(s): 18deb87

Memory Poisoning Error Fixing
Browse files
- app/frontend/gradio_app_hf.py  +1  -0
- app/graph/builder.py           +40 -30
- app/nodes/evaluation.py        +18 -45
- app/nodes/guardrails.py        +37 -11
- app/nodes/llm_node.py          +7  -0
- app/nodes/safety.py            +98 -0
- app/state.py                   +2  -1
app/frontend/gradio_app_hf.py
CHANGED

@@ -64,6 +64,7 @@ def run_graph(query: str) -> AgentState:
         "guardrail_passed": True,
         "memory_summary": "",
         "node_log": [],
+        "is_harmful": False,
     }
     return _graph.invoke(initial_state, config=_thread_config)
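Note on this hunk: run_graph presumably seeds every field the new safety path reads, so a value left over from an earlier checkpointed turn cannot leak into a fresh run (with a MemorySaver checkpointer, keys not supplied on invoke keep their previous values, which is likely the "memory poisoning" the commit title refers to). A minimal sketch of the resulting dict, with only the keys visible in this hunk; other required AgentState keys such as query and route are abbreviated here:

    # Sketch only – keys besides those shown in the hunk are assumed from app/state.py.
    initial_state = {
        "query": query,            # the incoming user message
        "guardrail_passed": True,  # optimistic default; safety/guardrails may flip it
        "memory_summary": "",
        "node_log": [],
        "is_harmful": False,       # new field: reset every turn so a stale block can't persist
    }
    result = _graph.invoke(initial_state, config=_thread_config)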
app/graph/builder.py
CHANGED

@@ -1,39 +1,41 @@
 """
 app/graph/builder.py
 ─────────────────────
-Assembles the LangGraph StateGraph from all nodes and edges.
-This is the only file that knows about graph topology.
-
 Graph topology:
+
+    [START] ──▶ safety ──(blocked)───▶ output ───▶ END
+                  │
+              (continue)
+                  │
+                router
+               /   |   \
+            rag    |   tool/general
+               \   |   /
+                ▼  ▼  ▼
+                   llm
+                  /    \
+          tool_calls?  none
+               |         |
+         tool_executor   |
+               |         |
+               ▼         ▼
+                 memory
+                   |
+                 hitl ──(rejected)───▶ END
+                   |
+              evaluation ──(retry)───▶ llm
+                   |
+               guardrails
+                   |
+                 output
+                   |
+                  END
 """

 from langgraph.graph import StateGraph, END
 from langgraph.checkpoint.memory import MemorySaver
 from app.state import AgentState
+from app.nodes.safety import safety_node, safety_route
 from app.nodes import (
     router_node, route_selector,
     rag_node,

@@ -52,6 +54,7 @@ def build_graph():
     builder = StateGraph(AgentState)

     # ── Register nodes ────────────────────────────────────────────────────
+    builder.add_node("safety", safety_node)
     builder.add_node("router", router_node)
     builder.add_node("rag", rag_node)
     builder.add_node("llm", llm_node)

@@ -62,10 +65,17 @@ def build_graph():
     builder.add_node("guardrails", guardrails_node)
     builder.add_node("output", output_node)

-    # ── Entry ─────────────────────────────────────────────────────────────
-    builder.set_entry_point("router")
+    # ── Entry: safety first ───────────────────────────────────────────────
+    builder.set_entry_point("safety")
+
+    # Safety gate – blocked queries skip everything and go straight to output
+    builder.add_conditional_edges(
+        "safety",
+        safety_route,
+        {"blocked": "output", "continue": "router"},
+    )

+    # ── Routing ───────────────────────────────────────────────────────────
     builder.add_conditional_edges(
         "router",
         route_selector,
app/nodes/evaluation.py
CHANGED

@@ -1,65 +1,40 @@
 """
 app/nodes/evaluation.py – CHECKPOINT 7: Evaluation

+Clean, simple quality scoring with no hardcoded safety logic.
+Safety is handled entirely by safety_node (before the graph runs)
+and guardrails_node (after the response is generated).
+
+Auto-pass cases:
+  - tool route: tool outputs are always valid, not prose
+  - general route: subjective responses (jokes, opinions) can't be scored fairly
 """
 from langchain_core.messages import HumanMessage
 from app.state import AgentState
 from app.utils.llm import llm
 from app.config import settings

-# Phrases that indicate the LLM intentionally refused – don't retry these
-REFUSAL_PHRASES = [
-    "sensitive", "harmful", "hate", "threat", "negative", "i can't help with that."
-    "i cannot provide information on",
-    "i can't help", "i cannot help", "i'm unable", "i am unable",
-    "i won't", "i will not", "not able to assist", "can't assist",
-    "cannot assist", "i'm sorry, i can't", "i'm not able",
-    "i must decline", "i'd rather not", "i don't think i should",
-    "i cannot provide", "i can't provide", "i'm not comfortable",
-    "that's not something i", "not something i can",
-    "i'm an ai", "as an ai",
-]
-
-
-def _is_refusal(response: str) -> bool:
-    low = response.lower()
-    return any(phrase in low for phrase in REFUSAL_PHRASES)
-

 def evaluation_node(state: AgentState) -> AgentState:
     log = state.get("node_log", [])
     response = state.get("response", "")
     route = state.get("route", "")

+    # Tool outputs are structured data, not prose – always pass
     if route == "tool" or state.get("tool_results"):
         print("[EVAL] Tool response → auto-passed.")
         return {**state, "evaluation_score": 1.0,
                 "node_log": log + ["evaluation (tool auto-pass ✅)"]}

-    # Jokes, greetings, opinions are subjective – LLM scorer would unfairly
-    # give 0.2 to a perfectly good joke. Let guardrails handle safety.
+    # General/casual – subjective, not scoreable fairly
     if route == "general":
         print("[EVAL] General route → auto-passed.")
         return {**state, "evaluation_score": 1.0,
                 "node_log": log + ["evaluation (general auto-pass ✅)"]}

-        return {**state, "evaluation_score": 1.0,
-                "node_log": log + ["evaluation (refusal auto-pass ✅)"]}
-
-    # ── 4. RAG responses – score with LLM ─────────────────────────────────
-    eval_prompt = f"""Rate the following AI response on a scale of 0.0 to 1.0
-for relevance and quality relative to the query.
-Return ONLY a float number between 0.0 and 1.0 – no other text.
+    # RAG responses – score relevance and quality
+    eval_prompt = f"""Rate this AI response for relevance and quality on a scale of 0.0 to 1.0.
+Return ONLY a float between 0.0 and 1.0 – nothing else.

 Query: {state['query']}
 Response: {response}

@@ -70,29 +45,27 @@ Score:"""
         raw = llm.invoke([HumanMessage(content=eval_prompt)]).content.strip()
         score = max(0.0, min(1.0, float(raw)))
     except Exception:
-        score = 0.8
+        score = 0.8  # safe fallback

     current_retries = state.get("retry_count", 0)
+    below = score < settings.EVAL_THRESHOLD
+    new_retries = (current_retries + 1) if below else current_retries

     print(f"[EVAL] Score: {score:.2f} (threshold: {settings.EVAL_THRESHOLD}, retries: {current_retries})")
     return {
         **state,
         "evaluation_score": score,
+        "retry_count": new_retries,
+        "node_log": log + [f"evaluation (score={score:.2f}, retry={new_retries})"],
     }


 def eval_route(state: AgentState) -> str:
     score = state["evaluation_score"]
     retry_count = state.get("retry_count", 0)

     if score < settings.EVAL_THRESHOLD and retry_count <= settings.MAX_RETRIES:
         print(f"[EVAL] Score {score:.2f} below threshold → retry {retry_count}/{settings.MAX_RETRIES}")
         return "retry"

     if score < settings.EVAL_THRESHOLD:
+        print(f"[EVAL] Max retries reached → proceeding.")
         return "guardrails"
app/nodes/guardrails.py
CHANGED

@@ -1,18 +1,44 @@
-"""
+"""
+app/nodes/guardrails.py – CHECKPOINT 8: Guardrails
+
+Simplified – safety_node already handled query-level intent.
+This node only checks the final response for accidental harmful content
+using the LLM (not keywords) as a final backstop.
+"""
+from langchain_core.messages import HumanMessage
 from app.state import AgentState
-from app.
+from app.utils.llm import llm

 SAFE_FALLBACK = "I'm sorry, I can't help with that request."

+GUARDRAIL_PROMPT = """Does the following AI response contain harmful, dangerous, or illegal instructions?
+Answer ONLY with "yes" or "no".
+
+Response: """
+

 def guardrails_node(state: AgentState) -> AgentState:
+    response = state.get("response", "")
+    log = state.get("node_log", [])
+
+    # If already blocked by safety_node, just pass through
+    if not state.get("guardrail_passed", True) or state.get("is_harmful"):
+        return state
+
+    try:
+        check = llm.invoke([HumanMessage(content=GUARDRAIL_PROMPT + response)])
+        verdict = check.content.strip().lower()
+
+        if verdict.startswith("yes"):
+            print("[GUARDRAILS] ⚠️ Response contains harmful content – blocked.")
+            return {
+                **state,
+                "guardrail_passed": False,
+                "response": SAFE_FALLBACK,
+                "node_log": log + ["guardrails (RESPONSE BLOCKED ⚠️)"],
+            }
+    except Exception as e:
+        print(f"[GUARDRAILS] Check failed ({e}) – passing through.")
+
     print("[GUARDRAILS] ✅ Passed.")
-    return {**state, "guardrail_passed": True, "node_log": log}
+    return {**state, "guardrail_passed": True, "node_log": log + ["guardrails ✅"]}
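One property of the new node worth noting: when safety_node has already blocked the turn, guardrails_node returns the state untouched and never calls the LLM. A tiny sketch of that branch, with a plain dict standing in for AgentState:

    blocked = {
        "response": "I'm sorry, I can't help with that request.",
        "guardrail_passed": False,
        "is_harmful": True,
        "node_log": ["safety (BLOCKED: violence 90%)"],
    }
    # Pass-through branch: the same object comes back and llm.invoke is never reached.
    assert guardrails_node(blocked) is blocked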
app/nodes/llm_node.py
CHANGED

@@ -43,6 +43,9 @@ def llm_node(state: AgentState) -> AgentState:

     system_msg = SystemMessage(content="\n".join(system_parts))

+    print("\n\n\n")
+    print(system_msg)
+    print("\n\n\n")
     # Always send only the current query – never full history
     # Full history causes the LLM to blend previous topics
     current_msg = HumanMessage(content=state["query"])

@@ -55,6 +58,10 @@ def llm_node(state: AgentState) -> AgentState:
     tool_calls = getattr(ai_msg, "tool_calls", []) or []
     response_text = ai_msg.content or ""

+    print("\n\n\n")
+    print(response_text)
+    print("\n\n\n")
+
     print(f"[LLM] Attempt {attempt} succeeded. Tool calls: {len(tool_calls)}")
     print(f"[LLM] Generated Output for User Query ({state['query'][:60]}) : {response_text[:200]}")
     log = state.get("node_log", []) + [f"llm (attempt={attempt}, route={state['route']})"]
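The three added print calls dump the full system prompt and raw completion to stdout on every request. If they are only needed while chasing the memory issue, one option is to gate them; the sketch below assumes a hypothetical settings.DEBUG flag and that llm_node imports settings from app.config, neither of which appears in this diff:

    if getattr(settings, "DEBUG", False):  # hypothetical flag, not part of this commit
        print("\n\n\n")
        print(system_msg)
        print("\n\n\n")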
app/nodes/safety.py
ADDED

@@ -0,0 +1,98 @@
+"""
+app/nodes/safety.py – SAFETY CLASSIFIER (runs first in graph)
+
+Single source of truth for all safety decisions.
+Uses the LLM to dynamically judge intent – no hardcoded phrase lists.
+
+Returns:
+    is_harmful: bool – True if query should be blocked
+    reason: str – why it was flagged
+    safe_response: str – what to tell the user if blocked
+
+By doing this ONCE at the start, every other node stays clean:
+    - router – just routes, no safety logic
+    - llm_node – just generates, no safety logic
+    - evaluation – just scores quality, no safety logic
+    - guardrails – just checks response quality, no safety logic
+"""
+import json
+from langchain_core.messages import HumanMessage
+from app.state import AgentState
+from app.utils.llm import llm
+
+SAFE_FALLBACK = "I'm sorry, I can't help with that request."
+
+SAFETY_PROMPT = """You are a safety classifier for an AI assistant.
+
+Analyze the user query and determine if it requests harmful, illegal, or dangerous content.
+
+Categories to BLOCK:
+- Violence: instructions to physically harm, attack, beat, assault, or kill people or animals
+- Hacking: unauthorized access, stealing credentials, bypassing security systems
+- Weapons: making bombs, explosives, or illegal weapons
+- Illegal drugs: synthesizing or manufacturing controlled substances
+- Harassment: stalking, doxxing, spying on people without consent
+- Self-harm: methods to harm oneself or commit suicide
+- Hate speech: content targeting people based on race, religion, gender, etc.
+
+Do NOT block:
+- Competitive contexts ("how to beat my friend at chess")
+- Security education ("explain how SQL injection works conceptually")
+- Medical questions ("what drugs interact with aspirin")
+- Fiction/creative writing with dark themes
+- News/historical discussion of violence
+- Legitimate anger expressions ("I'm so mad I could scream")
+
+Think carefully about INTENT and CONTEXT before deciding.
+
+Respond ONLY with valid JSON:
+{
+  "is_harmful": true or false,
+  "category": "violence|hacking|weapons|drugs|harassment|self_harm|hate_speech|safe",
+  "reason": "one sentence explanation",
+  "confidence": 0.0 to 1.0
+}
+
+User query: """
+
+
+def safety_node(state: AgentState) -> AgentState:
+    query = state.get("query", "")
+    log = state.get("node_log", [])
+
+    try:
+        response = llm.invoke([HumanMessage(content=SAFETY_PROMPT + query)])
+        raw = response.content.strip().removeprefix("```json").removesuffix("```").strip()
+        result = json.loads(raw)
+
+        is_harmful = result.get("is_harmful", False)
+        category = result.get("category", "safe")
+        reason = result.get("reason", "")
+        confidence = float(result.get("confidence", 0.0))
+
+        if is_harmful and confidence >= 0.7:
+            print(f"[SAFETY] 🚫 Blocked – {category} ({confidence:.0%}): {reason}")
+            return {
+                **state,
+                "is_harmful": True,
+                "guardrail_passed": False,
+                "response": SAFE_FALLBACK,
+                "node_log": log + [f"safety (BLOCKED: {category} {confidence:.0%})"],
+            }
+
+        print(f"[SAFETY] ✅ Safe – {reason}")
+        return {
+            **state,
+            "is_harmful": False,
+            "node_log": log + ["safety ✅"],
+        }
+
+    except Exception as e:
+        # On parse failure, allow through – LLM + guardrails still backstop
+        print(f"[SAFETY] Parse error ({e}) – allowing through")
+        return {**state, "is_harmful": False, "node_log": log + ["safety (parse error – allowed)"]}
+
+
+def safety_route(state: AgentState) -> str:
+    """Edge function – short-circuit to output if harmful."""
+    return "blocked" if state.get("is_harmful") else "continue"
app/state.py
CHANGED

@@ -21,4 +21,5 @@ class AgentState(TypedDict):
     evaluation_score: float  # LLM self-evaluation score 0.0 → 1.0
     guardrail_passed: bool   # Did the safety filter pass?
     memory_summary: str      # Compressed summary of older conversation turns
-    node_log: List[str]      # Ordered list of nodes visited – shown in Gradio UI
+    node_log: List[str]      # Ordered list of nodes visited – shown in Gradio UI
+    is_harmful: bool         # Did safety_node flag this query as harmful?