File size: 2,848 Bytes
7a24d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd3972e
7a24d7f
 
 
dd3972e
7a24d7f
 
 
 
 
dd3972e
7a24d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd3972e
7a24d7f
 
 
 
 
 
 
dd3972e
7a24d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# app/graph/nodes/evaluator.py

from app.core.llm_engine import eval_llm
from app.core.prompts.evaluator_prompt import evaluator_prompt
from langchain_core.output_parsers import StrOutputParser
import json, re

# Evaluation pipeline: the evaluator prompt is piped into the evaluation LLM,
# and StrOutputParser turns the model output into a plain string.
# Built once at import time and reused by every evaluator_node call.
chain = evaluator_prompt | eval_llm | StrOutputParser()


def _extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in an LLM response.

    The response may wrap the JSON in reasoning blocks
    (``<thinking>``/``<thought>``), markdown code fences, or free-form prose.
    Reasoning blocks and fence markers are removed first; then every ``{`` in
    the remaining text is tried, in order, as the start of a JSON object and
    the first one that decodes is returned. Unlike a greedy first-brace to
    last-brace match, stray braces or additional objects after the payload
    do not break parsing.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If braces are present but none of them starts a
            valid JSON object (a ``ValueError`` subclass; the error from the
            first candidate is re-raised).
    """
    # Drop model reasoning blocks so braces inside them are never tried as JSON.
    text = re.sub(r"<thinking>.*?</thinking>", "", text, flags=re.DOTALL)
    text = re.sub(r"<thought>.*?</thought>", "", text, flags=re.DOTALL)

    # Remove markdown fence markers (``` or ```json); the fenced content stays.
    text = re.sub(r"```(?:json)?", "", text).strip()

    decoder = json.JSONDecoder()
    first_error = None
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start`
            # and ignores whatever follows it.
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
            start = text.find("{", start + 1)
        else:
            return parsed

    if first_error is not None:
        raise first_error
    raise ValueError(f"No JSON found. Raw: {text[:300]}")


def _fallback_evaluation():
    """Build the default evaluation used when no real evaluation is available.

    Returns a fresh dict on every call with mid-range scores (0.5), the
    hallucination flag set, and the route marked as "rag".
    """
    return dict(
        relevance_score=0.5,
        context_usage=0.5,
        hallucination=True,
        route="rag",
    )



# Maximum number of context characters forwarded to the evaluator prompt.
_EVAL_CONTEXT_CHARS = 600

# String spellings interpreted as a negative ("no hallucination") verdict.
_FALSE_STRINGS = frozenset({"", "false", "no", "0", "none", "null"})


def _coerce_score(value):
    """Convert a raw score to a float clamped to [0, 1] and rounded to 3 places.

    Raises TypeError/ValueError when the value is not numeric or is NaN, so
    the caller can fall back instead of storing an unusable score.
    """
    import math  # local import keeps this block independent of file-level imports

    score = float(value)
    if math.isnan(score):
        # NaN passes through min/max unchanged, so reject it explicitly.
        raise ValueError(f"Score is not a number: {value!r}")
    return round(min(max(score, 0.0), 1.0), 3)


def _coerce_flag(value):
    """Interpret a raw hallucination verdict as a bool.

    Strings are judged by their spelling, because bool("false") is True.
    Any other value uses normal truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    return bool(value)


def evaluator_node(state):
    """Score the final answer and attach the result under ``"evaluation"``.

    Reads ``query``, ``final_answer``, ``context`` (default ``""``) and
    ``route`` (default ``"general"``) from ``state``.

    * When the route is ``"general"`` or the context is empty, the evaluator
      LLM is not called and a fixed evaluation is returned with
      ``context_usage`` set to ``None``.
    * Otherwise the evaluation chain is invoked with the query, the answer and
      the first ``_EVAL_CONTEXT_CHARS`` characters of the context; the JSON in
      its reply is parsed and normalised.
    * Any exception on that path is printed and replaced by the fallback
      evaluation, so this function always returns a dict with an
      ``"evaluation"`` entry.

    Args:
        state: Mapping holding the graph state.

    Returns:
        A new dict containing every entry of ``state`` plus ``"evaluation"``;
        ``state`` itself is not modified.
    """
    query = state.get("query")
    answer = state.get("final_answer")
    context = state.get("context", "")
    route = state.get("route", "general")

    # General answers are not scored against retrieved context; per the
    # original author's note they would always score 0 there.
    if route == "general" or not context:
        return {
            **state,
            "evaluation": {
                "relevance_score": 1.0,
                "context_usage": None,  # not applicable without RAG context
                "hallucination": False,
                "route": "general",
            },
        }

    try:
        raw_response = chain.invoke({
            "query": query,
            "answer": answer,
            "context": context[:_EVAL_CONTEXT_CHARS],
        }).strip()

        # Log the first 300 characters of the raw reply for debugging.
        print(f"EVALUATOR RAW β†’ {raw_response[:300]}")

        parsed = _extract_json(raw_response)

        evaluation = {
            "relevance_score": _coerce_score(parsed.get("relevance_score", 0)),
            "context_usage": _coerce_score(parsed.get("context_usage", 0)),
            # A missing verdict is treated as a hallucination.
            "hallucination": _coerce_flag(parsed.get("hallucination", True)),
            "route": "rag",
        }

        print(f"EVALUATOR SUCCESS β†’ {evaluation}")

        return {**state, "evaluation": evaluation}

    except Exception as e:
        # Deliberate best-effort boundary: a failed evaluation must not break
        # the graph, so report the error and return the fallback values.
        print("EVALUATOR ERROR β†’", e)
        return {**state, "evaluation": _fallback_evaluation()}