""" app/nodes/evaluation.py — CHECKPOINT 7: Evaluation Clean, simple quality scoring with no hardcoded safety logic. Safety is handled entirely by safety_node (before the graph runs) and guardrails_node (after the response is generated). Auto-pass cases: - tool route: tool outputs are always valid, not prose - general route: subjective responses (jokes, opinions) can't be scored fairly """ from langchain_core.messages import HumanMessage from app.state import AgentState from app.utils.llm import llm from app.config import settings def evaluation_node(state: AgentState) -> AgentState: log = state.get("node_log", []) response = state.get("response", "") route = state.get("route", "") # Tool outputs are structured data, not prose — always pass if route == "tool" or state.get("tool_results"): print("[EVAL] Tool response — auto-passed.") return {**state, "evaluation_score": 1.0, "node_log": log + ["evaluation (tool auto-pass ✅)"]} # General/casual — subjective, not scoreable fairly if route == "general": print("[EVAL] General route — auto-passed.") return {**state, "evaluation_score": 1.0, "node_log": log + ["evaluation (general auto-pass ✅)"]} # RAG responses — score relevance and quality eval_prompt = f"""Rate this AI response for relevance and quality on a scale of 0.0 to 1.0. Return ONLY a float between 0.0 and 1.0 — nothing else. Query: {state['query']} Response: {response} Score:""" try: raw = llm.invoke([HumanMessage(content=eval_prompt)]).content.strip() score = max(0.0, min(1.0, float(raw))) except Exception: score = 0.8 # safe fallback current_retries = state.get("retry_count", 0) below = score < settings.EVAL_THRESHOLD new_retries = (current_retries + 1) if below else current_retries print(f"[EVAL] Score: {score:.2f} (threshold: {settings.EVAL_THRESHOLD}, retries: {current_retries})") return { **state, "evaluation_score": score, "retry_count": new_retries, "node_log": log + [f"evaluation (score={score:.2f}, retry={new_retries})"], } def eval_route(state: AgentState) -> str: score = state["evaluation_score"] retry_count = state.get("retry_count", 0) if score < settings.EVAL_THRESHOLD and retry_count <= settings.MAX_RETRIES: print(f"[EVAL] Score {score:.2f} below threshold — retry {retry_count}/{settings.MAX_RETRIES}") return "retry" if score < settings.EVAL_THRESHOLD: print(f"[EVAL] Max retries reached — proceeding.") return "guardrails"