""" analyze_node.py — Evaluates the user's reasoning and identifies the primary gap. Improvements over v1: - Uses llm.with_structured_output() for guaranteed schema compliance - Clamps gap_magnitude to [0, 10] as a safety guard - Runs sandboxed code evaluation and blends result into hybrid gap score - Loads and updates UserProfile in SQLite for persistent memory - Populates explain-why-wrong fields (mistake, why_wrong, correct_thinking) """ from agent.models import AgentState, EvaluationOutput from agent.llm_factory import get_llm from agent.prompts import ANALYZE_PROMPT from agent.knowledge import get_misconceptions from agent.memory import load_profile, update_profile, persist_profile from agent.sandbox import run_code_safely, get_test_cases_for_topic _llm = get_llm() _structured_llm = _llm.with_structured_output(EvaluationOutput, method="function_calling") def evaluate_reasoning(state: AgentState) -> dict: """ Analyzes user's thought process and code. Updates the UserProfile in the DB with the latest gap scores. Returns identified_gap, gap_magnitude, explain-why-wrong fields, and test_pass_rate. """ topic = state.get("problem_topic", "Unknown") code = state.get("code", "") or "" session_id = state.get("session_id", "anonymous") # ── 1. Run sandbox evaluation if code is provided ─────────────────────── test_results_summary = "No code submitted." test_pass_rate = None if code.strip(): test_cases = get_test_cases_for_topic(topic) if test_cases: run_result = run_code_safely(code, test_cases) test_pass_rate = run_result["pass_rate"] test_results_summary = ( f"Passed {run_result['passed']}/{run_result['total']} test cases. " f"Errors: {run_result['errors'][:2]}" ) else: test_results_summary = "No built-in test cases for this topic — using LLM evaluation only." # ── 2. Fetch misconceptions for topic context ──────────────────────────── misconceptions = "; ".join(get_misconceptions(topic)) # ── 3. LLM evaluation with structured output ──────────────────────────── try: result: EvaluationOutput = _structured_llm.invoke( ANALYZE_PROMPT.format_messages( topic=topic, problem=state["problem"], thought=state["user_thought"], code=code or "No code provided", misconceptions=misconceptions, test_results=test_results_summary, ) ) gap = max(0, min(10, result.gap_magnitude)) # Clamp to [0, 10] # ── 4. Hybrid scoring: blend LLM gap with code test pass rate ─────── if test_pass_rate is not None: gap = int(round(0.6 * gap + 0.4 * (10 - test_pass_rate * 10))) gap = max(0, min(10, gap)) except Exception as e: print(f"[analyze_node] Structured output error: {e}") gap = 5 result = EvaluationOutput( problem_topic=topic, identified_gap="Could not parse analysis", gap_magnitude=5, reasoning="Parse error fallback", ) # ── 5. Update persistent UserProfile ──────────────────────────────────── try: profile = load_profile(session_id) profile = update_profile(profile, topic, gap, solved=(gap == 0)) persist_profile(profile) except Exception as e: print(f"[analyze_node] Memory update error: {e}") return { "identified_gap": result.identified_gap, "gap_magnitude": gap, "mistake": result.mistake, "why_wrong": result.why_wrong, "correct_thinking": result.correct_thinking, "test_pass_rate": test_pass_rate, }