File size: 2,700 Bytes
8986591
 
 
1d6b948
 
 
 
 
 
 
8986591
 
 
 
 
 
 
18deb87
 
 
 
8986591
1d6b948
18deb87
 
 
 
 
1d6b948
18deb87
 
 
 
 
1d6b948
 
 
8986591
18deb87
 
8986591
18deb87
8986591
 
 
 
 
1d6b948
8986591
 
1d6b948
 
8986591
 
 
 
 
1d6b948
 
8986591
 
 
 
 
 
 
 
 
 
1d6b948
8986591
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
app/nodes/evaluation.py β€” CHECKPOINT 7: Evaluation

Clean, simple quality scoring with no hardcoded safety logic.
Safety is handled entirely by safety_node (before the graph runs)
and guardrails_node (after the response is generated).

Auto-pass cases:
  - tool route: tool outputs are always valid, not prose
  - general route: subjective responses (jokes, opinions) can't be scored fairly
"""
import re

from langchain_core.messages import HumanMessage

from app.state import AgentState
from app.config import settings
from app.utils.llm import llm


def evaluation_node(state: AgentState) -> AgentState:
    """Score the generated response for relevance/quality on [0.0, 1.0].

    Tool and general routes auto-pass (see module docstring). Only RAG
    responses are LLM-scored; a score below ``settings.EVAL_THRESHOLD``
    increments ``retry_count`` so ``eval_route`` can trigger a retry.

    Returns a new state dict with ``evaluation_score``, ``retry_count``
    (RAG path only) and an appended ``node_log`` entry.
    """
    log      = state.get("node_log", [])
    response = state.get("response", "")
    route    = state.get("route", "")

    # Tool outputs are structured data, not prose β€” always pass
    if route == "tool" or state.get("tool_results"):
        print("[EVAL] Tool response β€” auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (tool auto-pass βœ…)"]}

    # General/casual β€” subjective, not scoreable fairly
    if route == "general":
        print("[EVAL] General route β€” auto-passed.")
        return {**state, "evaluation_score": 1.0,
                "node_log": log + ["evaluation (general auto-pass βœ…)"]}

    # RAG responses β€” score relevance and quality
    eval_prompt = f"""Rate this AI response for relevance and quality on a scale of 0.0 to 1.0.
Return ONLY a float between 0.0 and 1.0 β€” nothing else.

Query: {state['query']}
Response: {response}

Score:"""

    try:
        raw = llm.invoke([HumanMessage(content=eval_prompt)]).content.strip()
        # LLMs often wrap the number in extra text ("Score: 0.85") despite the
        # prompt. Extract the first numeric token so we score it instead of
        # silently falling back to 0.8 (which would auto-pass a bad answer).
        match = re.search(r"\d*\.?\d+", raw)
        score = max(0.0, min(1.0, float(match.group() if match else raw)))
    except Exception:
        score = 0.8  # safe fallback: never loop forever on an unscoreable reply

    current_retries = state.get("retry_count", 0)
    below           = score < settings.EVAL_THRESHOLD
    # Bump the counter only on failure; eval_route reads it to cap retries.
    new_retries     = (current_retries + 1) if below else current_retries

    print(f"[EVAL] Score: {score:.2f} (threshold: {settings.EVAL_THRESHOLD}, retries: {current_retries})")
    return {
        **state,
        "evaluation_score": score,
        "retry_count":      new_retries,
        "node_log":         log + [f"evaluation (score={score:.2f}, retry={new_retries})"],
    }


def eval_route(state: AgentState) -> str:
    """Route after evaluation: ``"retry"`` to regenerate, else ``"guardrails"``.

    A below-threshold score triggers a retry while the retry budget
    (``settings.MAX_RETRIES``) is not exhausted; otherwise we proceed.
    """
    score   = state["evaluation_score"]
    retries = state.get("retry_count", 0)

    # Passing score: nothing to do, continue the pipeline.
    if score >= settings.EVAL_THRESHOLD:
        return "guardrails"

    # Failed score with budget left: ask for another generation pass.
    if retries <= settings.MAX_RETRIES:
        print(f"[EVAL] Score {score:.2f} below threshold β€” retry {retries}/{settings.MAX_RETRIES}")
        return "retry"

    # Failed score, budget spent: give up and let guardrails handle it.
    print("[EVAL] Max retries reached β€” proceeding.")
    return "guardrails"