Spaces:
Sleeping
Sleeping
"""
analyze_node.py — Evaluates the user's reasoning and identifies the primary gap.

Improvements over v1:
- Uses llm.with_structured_output() for guaranteed schema compliance
- Clamps gap_magnitude to [0, 10] as a safety guard
- Runs sandboxed code evaluation and blends result into hybrid gap score
- Loads and updates UserProfile in SQLite for persistent memory
- Populates explain-why-wrong fields (mistake, why_wrong, correct_thinking)
"""
| from agent.models import AgentState, EvaluationOutput | |
| from agent.llm_factory import get_llm | |
| from agent.prompts import ANALYZE_PROMPT | |
| from agent.knowledge import get_misconceptions | |
| from agent.memory import load_profile, update_profile, persist_profile | |
| from agent.sandbox import run_code_safely, get_test_cases_for_topic | |
# Module-level LLM client, created once at import time and shared by all calls.
_llm = get_llm()
# Wrapper that constrains responses to the EvaluationOutput schema using the
# provider's function-calling mode, so parsing the reply cannot silently drift.
_structured_llm = _llm.with_structured_output(EvaluationOutput, method="function_calling")
def evaluate_reasoning(state: AgentState) -> dict:
    """
    Analyze the user's thought process and code and identify the primary gap.

    Pipeline: (1) optionally run the submitted code in a sandbox against
    topic test cases, (2) ask the LLM for a structured gap evaluation,
    (3) blend the LLM score with the test pass rate into a hybrid gap score,
    (4) persist the result to the user's profile (best-effort).

    Args:
        state: Agent state. Reads "problem_topic", "code", "session_id",
            "problem", and "user_thought".

    Returns:
        dict with keys: identified_gap, gap_magnitude (int, 0-10), mistake,
        why_wrong, correct_thinking, and test_pass_rate (float or None).
    """
    topic = state.get("problem_topic", "Unknown")
    code = state.get("code", "") or ""
    session_id = state.get("session_id", "anonymous")

    # ── 1. Run sandbox evaluation if code is provided ───────────────────────
    test_results_summary = "No code submitted."
    test_pass_rate = None
    if code.strip():
        test_cases = get_test_cases_for_topic(topic)
        if test_cases:
            run_result = run_code_safely(code, test_cases)
            test_pass_rate = run_result["pass_rate"]
            test_results_summary = (
                f"Passed {run_result['passed']}/{run_result['total']} test cases. "
                f"Errors: {run_result['errors'][:2]}"
            )
        else:
            test_results_summary = "No built-in test cases for this topic — using LLM evaluation only."

    # ── 2. Fetch misconceptions for topic context ───────────────────────────
    misconceptions = "; ".join(get_misconceptions(topic))

    # ── 3. LLM evaluation with structured output ────────────────────────────
    try:
        result: EvaluationOutput = _structured_llm.invoke(
            ANALYZE_PROMPT.format_messages(
                topic=topic,
                problem=state["problem"],
                thought=state["user_thought"],
                code=code or "No code provided",
                misconceptions=misconceptions,
                test_results=test_results_summary,
            )
        )
        gap = max(0, min(10, result.gap_magnitude))  # Clamp to [0, 10]

        # ── 4. Hybrid scoring: blend LLM gap with code test pass rate ───────
        # 60% LLM judgement, 40% inverted pass rate (0% passed -> gap 10).
        if test_pass_rate is not None:
            gap = int(round(0.6 * gap + 0.4 * (10 - test_pass_rate * 10)))
            gap = max(0, min(10, gap))
    except Exception as e:
        print(f"[analyze_node] Structured output error: {e}")
        gap = 5
        result = EvaluationOutput(
            problem_topic=topic,
            identified_gap="Could not parse analysis",
            gap_magnitude=5,
            reasoning="Parse error fallback",
        )

    # ── 5. Update persistent UserProfile (best-effort; never fail the node) ─
    try:
        profile = load_profile(session_id)
        profile = update_profile(profile, topic, gap, solved=(gap == 0))
        persist_profile(profile)
    except Exception as e:
        print(f"[analyze_node] Memory update error: {e}")

    return {
        "identified_gap": result.identified_gap,
        "gap_magnitude": gap,
        # The fallback EvaluationOutput above is built without these three
        # fields, so read them defensively instead of crashing on the
        # error-recovery path with an AttributeError/ValidationError.
        "mistake": getattr(result, "mistake", None),
        "why_wrong": getattr(result, "why_wrong", None),
        "correct_thinking": getattr(result, "correct_thinking", None),
        "test_pass_rate": test_pass_rate,
    }