Spaces:
Sleeping
Sleeping
| """Offline eval runner. | |
| Validates the *deterministic* parts of the system end-to-end without paying | |
| for Gemini calls. Walks each fixture, builds the calculations the Medical | |
| agent would emit, runs :func:`agents.check_plan` over a hand-built plan, | |
| and asserts that the per-fixture ``expected`` dict matches. | |
| This is the same ``check_plan`` the Planner runs internally after the LP | |
| solver — keeping the eval surface aligned with production behaviour. | |
| Two ways to invoke: | |
| * As a script: ``python -m evals.runner`` -> prints a per-fixture report. | |
| * From pytest: imported and called by ``tests/test_evals.py``. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, List | |
| from agents import check_plan | |
| from evals.fixtures import all_fixtures | |
| from guardrails import detect_prompt_injection | |
| from nutrition_formulas import full_assessment | |
| from observability import get_metrics | |
| from utils import get_parse_metrics | |
@dataclass
class FixtureResult:
    """Outcome of evaluating a single fixture.

    The ``@dataclass`` decorator is required: instances are built with
    keyword arguments, and the mutable defaults are declared through
    ``field(default_factory=...)``, which only a dataclass interprets.
    Without it the class accepts no constructor arguments at all.
    """

    # Fixture name, taken from the fixture's user profile.
    name: str
    # True when no expectation failed for this fixture.
    passed: bool
    # Human-readable description of each failed expectation.
    failures: List[str] = field(default_factory=list)
    # Extra values shown next to the fixture in the printed report.
    info: Dict[str, Any] = field(default_factory=dict)
def _build_assessment(fixture: Dict[str, Any]) -> Dict[str, Any]:
    """Run ``full_assessment`` on the fixture's ``user_profile`` entry.

    Args:
        fixture: Fixture dict; its ``"user_profile"`` mapping must provide
            ``weight``, ``height``, ``age``, ``sex``, ``activity_level``
            and ``goal``.

    Returns:
        Whatever ``full_assessment`` returns for that profile.

    Raises:
        KeyError: If the profile or one of the listed keys is missing.
    """
    profile = fixture["user_profile"]
    # Translate the profile's key names into full_assessment's keywords.
    assessment_kwargs = {
        "weight_kg": profile["weight"],
        "height_cm": profile["height"],
        "age_years": profile["age"],
        "sex": profile["sex"],
        "activity_level": profile["activity_level"],
        "goal": profile["goal"],
    }
    return full_assessment(**assessment_kwargs)
| def _build_plan_from_assessment(assessment: Dict[str, Any]) -> Dict[str, Any]: | |
| """Hand-build a plan that hits the assessment targets within tolerance.""" | |
| target_cal = assessment["daily_target_calories"] | |
| macros = assessment["macro_targets"] | |
| return { | |
| "days": [ | |
| { | |
| "name": "balanced day", | |
| "calories": target_cal, | |
| "protein_g": macros["protein_g"], | |
| "fat_g": macros["fat_g"], | |
| "carbohydrates_g": macros["carbohydrates_g"], | |
| } | |
| ], | |
| "trace": "eval-fixture: hand-built plan hitting targets exactly.", | |
| } | |
| def _check_expected( | |
| fixture: Dict[str, Any], | |
| assessment: Dict[str, Any], | |
| issues: List[Dict[str, Any]], | |
| requires_human_review: bool, | |
| ) -> List[str]: | |
| failures: List[str] = [] | |
| expected = fixture.get("expected", {}) | |
| cal = assessment["daily_target_calories"] | |
| if "min_calories" in expected and cal < expected["min_calories"]: | |
| failures.append(f"calories {cal} below expected min {expected['min_calories']}") | |
| if "max_calories" in expected and cal > expected["max_calories"]: | |
| failures.append(f"calories {cal} above expected max {expected['max_calories']}") | |
| if "min_protein_g" in expected: | |
| actual = assessment["macro_targets"]["protein_g"] | |
| if actual < expected["min_protein_g"]: | |
| failures.append( | |
| f"protein_g {actual} below expected min {expected['min_protein_g']}" | |
| ) | |
| if expected.get("requires_human_review") and not requires_human_review: | |
| failures.append("expected requires_human_review=True") | |
| # No deterministic issue should fire for a hand-built plan that hits | |
| # targets exactly — if one does, the check_plan contract has drifted. | |
| if issues: | |
| failures.append( | |
| f"unexpected post-LP issues on a target-hitting plan: " | |
| f"{[i['code'] for i in issues]}" | |
| ) | |
| return failures | |
def run_offline() -> List[FixtureResult]:
    """Evaluate every fixture offline and return one result per fixture.

    For each fixture yielded by ``all_fixtures()``:

    1. Each entry of ``fixture["questions"]`` is passed through
       ``detect_prompt_injection``. The first flagged question fails the
       fixture and the remaining steps are skipped for it.
    2. The assessment is computed with ``_build_assessment``.
    3. A plan that copies the assessment targets is hand-built, wrapped in
       a memory dict and passed to ``check_plan``; the outcome is compared
       with the fixture's expectations by ``_check_expected``.

    A fixture without an ``"expected"`` entry is treated as having no
    expectations, which matches how ``_check_expected`` reads it.

    Returns:
        The ``FixtureResult`` objects in fixture order.
    """
    results: List[FixtureResult] = []
    for fixture in all_fixtures():
        name = fixture["user_profile"]["name"]
        # Use .get so a fixture without expectations does not crash here
        # while _check_expected would have tolerated it.
        expected = fixture.get("expected", {})

        # 1. Sanity-check the user questions for prompt injection; stop at
        #    the first flagged question.
        flagged = None
        for question in fixture["questions"]:
            verdict = detect_prompt_injection(question)
            if verdict.is_attempt:
                flagged = verdict
                break
        if flagged is not None:
            results.append(
                FixtureResult(
                    name=name,
                    passed=False,
                    failures=[
                        f"fixture question flagged as injection: {flagged.matches}"
                    ],
                )
            )
            continue

        # 2. Compute the assessment deterministically.
        assessment = _build_assessment(fixture)

        # 3. Hand-build a plan that hits targets and run the deterministic
        #    plan check — the same code the Planner runs internally.
        plan = _build_plan_from_assessment(assessment)
        # NOTE(review): this flag is read from the fixture's own
        # expectations, so the requires_human_review comparison in
        # _check_expected cannot fail for it — confirm whether it should
        # come from the assessment instead.
        requires_human_review = expected.get("requires_human_review", False)
        medical_history = fixture["medical_history"]
        memory: Dict[str, Any] = {
            "user_profile": fixture["user_profile"],
            "medical_history": medical_history,
            "flags_and_assessments": {
                "assessment_status": "assessment_complete",
                "calculations": assessment,
                "flags": medical_history["conditions"],
                "recommendations": [],
                "requires_professional_consultation": requires_human_review,
            },
            "plans": {"current_plan": plan},
        }
        issues = check_plan(plan, memory)
        failures = _check_expected(fixture, assessment, issues, requires_human_review)
        results.append(
            FixtureResult(
                name=name,
                passed=not failures,
                failures=failures,
                info={
                    "calories": assessment["daily_target_calories"],
                    "protein_g": assessment["macro_targets"]["protein_g"],
                    "issues": [issue["code"] for issue in issues],
                    "requires_human_review": requires_human_review,
                },
            )
        )
    return results
def print_report(results: List[FixtureResult]) -> int:
    """Print a per-fixture report and summary, then return an exit code.

    Each fixture gets one status line followed by one line per failure.
    The summary shows the pass count, the parse metrics from
    ``get_parse_metrics()`` and the ``"agents"`` entry of the metrics
    snapshot from ``get_metrics()``.

    Args:
        results: Fixture results to report, printed in the given order.

    Returns:
        ``0`` when every fixture passed, otherwise ``1``.
    """
    total = len(results)
    passed_count = len([result for result in results if result.passed])
    failed_count = total - passed_count

    for result in results:
        label = "[PASS]" if result.passed else "[FAIL]"
        print(f"{label} {result.name} -> {result.info}")
        for failure in result.failures:
            print(f" - {failure}")

    parse_metrics = get_parse_metrics()
    snapshot = get_metrics().snapshot()

    print()
    print(f"Summary: {passed_count}/{total} fixtures passed.")
    print(
        f"Parse metrics — native={parse_metrics.native_parses} "
        f"fallback={parse_metrics.fallback_parses} "
        f"failure={parse_metrics.schema_failures}"
    )
    print(f"Agent timings: {snapshot['agents']}")

    if failed_count == 0:
        return 0
    return 1
def main() -> int:  # pragma: no cover
    """Run the offline evals, print the report and return its exit code."""
    fixture_results = run_offline()
    exit_code = print_report(fixture_results)
    return exit_code
# Script entry point: exit with the status code computed by main().
if __name__ == "__main__":  # pragma: no cover
    status = main()
    sys.exit(status)