File size: 4,089 Bytes
e266561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
analyze_node.py — Evaluates the user's reasoning and identifies the primary gap.

Improvements over v1:
  - Uses llm.with_structured_output() for guaranteed schema compliance
  - Clamps gap_magnitude to [0, 10] as a safety guard
  - Runs sandboxed code evaluation and blends result into hybrid gap score
  - Loads and updates UserProfile in SQLite for persistent memory
  - Populates explain-why-wrong fields (mistake, why_wrong, correct_thinking)
"""

from agent.models import AgentState, EvaluationOutput
from agent.llm_factory import get_llm
from agent.prompts import ANALYZE_PROMPT
from agent.knowledge import get_misconceptions
from agent.memory import load_profile, update_profile, persist_profile
from agent.sandbox import run_code_safely, get_test_cases_for_topic

# Created once at import time so every node invocation shares the same client.
_llm = get_llm()
# function_calling method forces the provider to emit EvaluationOutput's schema,
# so .invoke() returns a validated EvaluationOutput instead of raw text.
_structured_llm = _llm.with_structured_output(EvaluationOutput, method="function_calling")


def evaluate_reasoning(state: AgentState) -> dict:
    """
    Analyze the user's thought process and submitted code for a problem.

    Pipeline:
      1. If code was submitted, run it in the sandbox against built-in test
         cases for the topic (when available) and record the pass rate.
      2. Fetch known misconceptions for the topic as extra LLM context.
      3. Ask the LLM for a structured evaluation, clamp its gap score to
         [0, 10], and blend it 60/40 with the sandbox pass rate when present.
      4. Persist the updated gap score into the user's profile (best-effort:
         a storage failure is logged but never blocks the evaluation).

    Args:
        state: Agent state; reads "problem_topic", "code", "session_id",
               "problem", and "user_thought".

    Returns:
        dict with keys "identified_gap", "gap_magnitude", "mistake",
        "why_wrong", "correct_thinking", and "test_pass_rate"
        (test_pass_rate is None when no code or no test cases exist).
    """
    topic = state.get("problem_topic", "Unknown")
    code = state.get("code", "") or ""
    session_id = state.get("session_id", "anonymous")

    # ── 1. Run sandbox evaluation if code is provided ───────────────────────
    test_results_summary = "No code submitted."
    test_pass_rate = None
    if code.strip():
        test_cases = get_test_cases_for_topic(topic)
        if test_cases:
            run_result = run_code_safely(code, test_cases)
            test_pass_rate = run_result["pass_rate"]
            test_results_summary = (
                f"Passed {run_result['passed']}/{run_result['total']} test cases. "
                # Cap at two errors so the prompt stays short.
                f"Errors: {run_result['errors'][:2]}"
            )
        else:
            test_results_summary = "No built-in test cases for this topic β€” using LLM evaluation only."

    # ── 2. Fetch misconceptions for topic context ────────────────────────────
    misconceptions = "; ".join(get_misconceptions(topic))

    # ── 3. LLM evaluation with structured output ────────────────────────────
    try:
        result: EvaluationOutput = _structured_llm.invoke(
            ANALYZE_PROMPT.format_messages(
                topic=topic,
                problem=state["problem"],
                thought=state["user_thought"],
                code=code or "No code provided",
                misconceptions=misconceptions,
                test_results=test_results_summary,
            )
        )
        gap = max(0, min(10, result.gap_magnitude))  # Clamp to [0, 10]

        # ── 4. Hybrid scoring: blend LLM gap with code test pass rate ───────
        # 60% LLM judgement, 40% objective failure rate (pass_rate is in [0, 1]).
        if test_pass_rate is not None:
            gap = int(round(0.6 * gap + 0.4 * (10 - test_pass_rate * 10)))
            gap = max(0, min(10, gap))

    except Exception as e:
        print(f"[analyze_node] Structured output error: {e}")
        gap = 5  # Neutral midpoint when the LLM output is unusable.
        result = EvaluationOutput(
            problem_topic=topic,
            identified_gap="Could not parse analysis",
            gap_magnitude=5,
            reasoning="Parse error fallback",
        )

    # ── 5. Update persistent UserProfile ────────────────────────────────────
    # Best-effort: a memory failure must not break the evaluation response.
    try:
        profile = load_profile(session_id)
        profile = update_profile(profile, topic, gap, solved=(gap == 0))
        persist_profile(profile)
    except Exception as e:
        print(f"[analyze_node] Memory update error: {e}")

    return {
        "identified_gap":   result.identified_gap,
        "gap_magnitude":    gap,
        # The fallback EvaluationOutput above does not set the explain-why-wrong
        # fields; getattr keeps this return safe if the model has no defaults
        # for them (previously a potential AttributeError on the error path).
        "mistake":          getattr(result, "mistake", None),
        "why_wrong":        getattr(result, "why_wrong", None),
        "correct_thinking": getattr(result, "correct_thinking", None),
        "test_pass_rate":   test_pass_rate,
    }