Spaces:
Sleeping
Sleeping
| """ | |
| app/nodes/evaluation.py β CHECKPOINT 7: Evaluation | |
| Clean, simple quality scoring with no hardcoded safety logic. | |
| Safety is handled entirely by safety_node (before the graph runs) | |
| and guardrails_node (after the response is generated). | |
| Auto-pass cases: | |
| - tool route: tool outputs are always valid, not prose | |
| - general route: subjective responses (jokes, opinions) can't be scored fairly | |
| """ | |
| from langchain_core.messages import HumanMessage | |
| from app.state import AgentState | |
| from app.utils.llm import llm | |
| from app.config import settings | |
def _extract_score(text: str) -> float:
    """Parse a float score out of an LLM judge reply.

    Tries the whole string first, then individual whitespace/colon-separated
    tokens, so replies like "Score: 0.85" or "0.9." still parse instead of
    falling through to the caller's fallback. Raises ValueError when no
    numeric token is present.
    """
    try:
        return float(text)
    except ValueError:
        pass
    for token in text.replace(":", " ").split():
        try:
            # strip trailing punctuation such as "0.85." or "0.9,"
            return float(token.strip(".,"))
        except ValueError:
            continue
    raise ValueError(f"no numeric score in {text!r}")


def evaluation_node(state: AgentState) -> AgentState:
    """Score the generated response for relevance and quality.

    Only RAG responses are actually scored (via an LLM judge). Tool and
    general routes auto-pass: tool output is structured data, not prose,
    and casual/general replies are too subjective to grade fairly.

    Returns the state updated with:
      - evaluation_score: float clamped to [0.0, 1.0]
      - retry_count: incremented only when the score is below threshold
      - node_log: appended with an evaluation entry
    """
    log = state.get("node_log", [])
    response = state.get("response", "")
    route = state.get("route", "")

    # Tool outputs are structured data, not prose — always pass.
    if route == "tool" or state.get("tool_results"):
        print("[EVAL] Tool response → auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (tool auto-pass ✓)"]}

    # General/casual — subjective, not scoreable fairly.
    if route == "general":
        print("[EVAL] General route → auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (general auto-pass ✓)"]}

    # RAG responses — ask the LLM judge for a 0.0–1.0 relevance/quality score.
    eval_prompt = f"""Rate this AI response for relevance and quality on a scale of 0.0 to 1.0.
Return ONLY a float between 0.0 and 1.0 — nothing else.
Query: {state['query']}
Response: {response}
Score:"""

    try:
        raw = llm.invoke([HumanMessage(content=eval_prompt)]).content.strip()
        # Models often ignore "ONLY a float" and wrap the number
        # ("Score: 0.85"); _extract_score salvages those replies instead of
        # silently collapsing every such case into the 0.8 fallback below.
        score = max(0.0, min(1.0, _extract_score(raw)))
    except Exception:
        score = 0.8  # safe fallback: a judge failure shouldn't block the pipeline

    current_retries = state.get("retry_count", 0)
    below = score < settings.EVAL_THRESHOLD
    # Bump the retry counter only when this attempt actually failed.
    new_retries = (current_retries + 1) if below else current_retries

    print(f"[EVAL] Score: {score:.2f} (threshold: {settings.EVAL_THRESHOLD}, retries: {current_retries})")
    return {
        **state,
        "evaluation_score": score,
        "retry_count": new_retries,
        "node_log": log + [f"evaluation (score={score:.2f}, retry={new_retries})"],
    }
def eval_route(state: AgentState) -> str:
    """Conditional-edge router executed after evaluation_node.

    Returns "retry" when the score is below threshold and retries remain,
    or "guardrails" once retries are exhausted.

    NOTE(review): no return is visible here for the passing case
    (score >= EVAL_THRESHOLD) — presumably a final `return "guardrails"`
    follows beyond this excerpt; confirm, otherwise this falls through
    and returns None.
    """
    score = state["evaluation_score"]
    retry_count = state.get("retry_count", 0)
    # Below threshold with retries left — loop back for another attempt.
    if score < settings.EVAL_THRESHOLD and retry_count <= settings.MAX_RETRIES:
        print(f"[EVAL] Score {score:.2f} below threshold β retry {retry_count}/{settings.MAX_RETRIES}")
        return "retry"
    # Below threshold but retries exhausted — give up and continue downstream.
    if score < settings.EVAL_THRESHOLD:
        print(f"[EVAL] Max retries reached β proceeding.")
        return "guardrails"