"""
app/nodes/evaluation.py — CHECKPOINT 7: Evaluation
Clean, simple quality scoring with no hardcoded safety logic.
Safety is handled entirely by safety_node (before the graph runs)
and guardrails_node (after the response is generated).
Auto-pass cases:
- tool route: tool outputs are always valid, not prose
- general route: subjective responses (jokes, opinions) can't be scored fairly
"""
import re

from langchain_core.messages import HumanMessage

from app.state import AgentState
from app.utils.llm import llm
from app.config import settings
def evaluation_node(state: AgentState) -> AgentState:
    """Score the generated response for relevance and quality (0.0-1.0).

    Tool and general routes are auto-passed (see module docstring); only
    RAG responses are actually scored by the LLM. A score below
    ``settings.EVAL_THRESHOLD`` increments ``retry_count`` so the graph
    can route back for another generation attempt.

    Returns the state updated with ``evaluation_score``, ``retry_count``
    (unchanged on a passing score) and an appended ``node_log`` entry.
    """
    log = state.get("node_log", [])
    response = state.get("response", "")
    route = state.get("route", "")

    # Tool outputs are structured data, not prose -- always pass.
    if route == "tool" or state.get("tool_results"):
        print("[EVAL] Tool response -> auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (tool auto-pass)"]}

    # General/casual -> subjective (jokes, opinions), not scoreable fairly.
    if route == "general":
        print("[EVAL] General route -> auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (general auto-pass)"]}

    # RAG responses -> ask the LLM for a single float score.
    eval_prompt = f"""Rate this AI response for relevance and quality on a scale of 0.0 to 1.0.
Return ONLY a float between 0.0 and 1.0 - nothing else.

Query: {state['query']}
Response: {response}

Score:"""
    try:
        raw = llm.invoke([HumanMessage(content=eval_prompt)]).content.strip()
        # Models often wrap the number in extra text ("Score: 0.85").
        # Extract the first numeric token instead of failing outright,
        # then clamp into [0.0, 1.0].
        match = re.search(r"\d+(?:\.\d+)?", raw)
        score = max(0.0, min(1.0, float(match.group() if match else raw)))
    except Exception:
        # Safe fallback: treat an unparseable/failed evaluation as a pass
        # (0.8) rather than triggering spurious retries.
        score = 0.8

    current_retries = state.get("retry_count", 0)
    below = score < settings.EVAL_THRESHOLD
    # Only a failing score consumes a retry.
    new_retries = (current_retries + 1) if below else current_retries

    print(f"[EVAL] Score: {score:.2f} (threshold: {settings.EVAL_THRESHOLD}, retries: {current_retries})")
    return {
        **state,
        "evaluation_score": score,
        "retry_count": new_retries,
        "node_log": log + [f"evaluation (score={score:.2f}, retry={new_retries})"],
    }
def eval_route(state: AgentState) -> str:
score = state["evaluation_score"]
retry_count = state.get("retry_count", 0)
if score < settings.EVAL_THRESHOLD and retry_count <= settings.MAX_RETRIES:
print(f"[EVAL] Score {score:.2f} below threshold β retry {retry_count}/{settings.MAX_RETRIES}")
return "retry"
if score < settings.EVAL_THRESHOLD:
print(f"[EVAL] Max retries reached β proceeding.")
return "guardrails" |