"""Offline eval runner. Validates the *deterministic* parts of the system end-to-end without paying for Gemini calls. Walks each fixture, builds the calculations the Medical agent would emit, runs :func:`agents.check_plan` over a hand-built plan, and asserts that the per-fixture ``expected`` dict matches. This is the same ``check_plan`` the Planner runs internally after the LP solver — keeping the eval surface aligned with production behaviour. Two ways to invoke: * As a script: ``python -m evals.runner`` -> prints a per-fixture report. * From pytest: imported and called by ``tests/test_evals.py``. """ from __future__ import annotations import sys from dataclasses import dataclass, field from typing import Any, Dict, List from agents import check_plan from evals.fixtures import all_fixtures from guardrails import detect_prompt_injection from nutrition_formulas import full_assessment from observability import get_metrics from utils import get_parse_metrics @dataclass class FixtureResult: name: str passed: bool failures: List[str] = field(default_factory=list) info: Dict[str, Any] = field(default_factory=dict) def _build_assessment(fixture: Dict[str, Any]) -> Dict[str, Any]: p = fixture["user_profile"] return full_assessment( weight_kg=p["weight"], height_cm=p["height"], age_years=p["age"], sex=p["sex"], activity_level=p["activity_level"], goal=p["goal"], ) def _build_plan_from_assessment(assessment: Dict[str, Any]) -> Dict[str, Any]: """Hand-build a plan that hits the assessment targets within tolerance.""" target_cal = assessment["daily_target_calories"] macros = assessment["macro_targets"] return { "days": [ { "name": "balanced day", "calories": target_cal, "protein_g": macros["protein_g"], "fat_g": macros["fat_g"], "carbohydrates_g": macros["carbohydrates_g"], } ], "trace": "eval-fixture: hand-built plan hitting targets exactly.", } def _check_expected( fixture: Dict[str, Any], assessment: Dict[str, Any], issues: List[Dict[str, Any]], requires_human_review: bool, ) -> List[str]: failures: List[str] = [] expected = fixture.get("expected", {}) cal = assessment["daily_target_calories"] if "min_calories" in expected and cal < expected["min_calories"]: failures.append(f"calories {cal} below expected min {expected['min_calories']}") if "max_calories" in expected and cal > expected["max_calories"]: failures.append(f"calories {cal} above expected max {expected['max_calories']}") if "min_protein_g" in expected: actual = assessment["macro_targets"]["protein_g"] if actual < expected["min_protein_g"]: failures.append( f"protein_g {actual} below expected min {expected['min_protein_g']}" ) if expected.get("requires_human_review") and not requires_human_review: failures.append("expected requires_human_review=True") # No deterministic issue should fire for a hand-built plan that hits # targets exactly — if one does, the check_plan contract has drifted. if issues: failures.append( f"unexpected post-LP issues on a target-hitting plan: " f"{[i['code'] for i in issues]}" ) return failures def run_offline() -> List[FixtureResult]: results: List[FixtureResult] = [] for fixture in all_fixtures(): name = fixture["user_profile"]["name"] # 1. Sanity-check the user-question for prompt injection. injected = False for q in fixture["questions"]: verdict = detect_prompt_injection(q) if verdict.is_attempt: results.append( FixtureResult( name=name, passed=False, failures=[ f"fixture question flagged as injection: {verdict.matches}" ], ) ) injected = True break if injected: continue # 2. Compute the assessment deterministically. assessment = _build_assessment(fixture) # 3. Hand-build a plan that hits targets and run the deterministic # plan check — the same code the Planner runs internally. plan = _build_plan_from_assessment(assessment) requires_human_review = fixture["expected"].get("requires_human_review", False) memory: Dict[str, Any] = { "user_profile": fixture["user_profile"], "medical_history": fixture["medical_history"], "flags_and_assessments": { "assessment_status": "assessment_complete", "calculations": assessment, "flags": fixture["medical_history"]["conditions"], "recommendations": [], "requires_professional_consultation": requires_human_review, }, "plans": {"current_plan": plan}, } issues = check_plan(plan, memory) failures = _check_expected(fixture, assessment, issues, requires_human_review) results.append( FixtureResult( name=name, passed=not failures, failures=failures, info={ "calories": assessment["daily_target_calories"], "protein_g": assessment["macro_targets"]["protein_g"], "issues": [i["code"] for i in issues], "requires_human_review": requires_human_review, }, ) ) return results def print_report(results: List[FixtureResult]) -> int: passes = sum(1 for r in results if r.passed) fails = len(results) - passes for r in results: status = "[PASS]" if r.passed else "[FAIL]" print(f"{status} {r.name} -> {r.info}") for f in r.failures: print(f" - {f}") pm = get_parse_metrics() metrics = get_metrics().snapshot() print() print(f"Summary: {passes}/{len(results)} fixtures passed.") print( f"Parse metrics — native={pm.native_parses} fallback={pm.fallback_parses} " f"failure={pm.schema_failures}" ) print(f"Agent timings: {metrics['agents']}") return 0 if fails == 0 else 1 def main() -> int: # pragma: no cover return print_report(run_offline()) if __name__ == "__main__": # pragma: no cover sys.exit(main())