Spaces:

moazeldegwy
/

mealgraph

Sleeping

App Files Files Community

mealgraph / evals /runner.py

moazeldegwy

Simplify topology to 3 agents + 2 tools

1933348 9 days ago

raw

history blame contribute delete

6.69 kB

	"""Offline eval runner.

	Validates the deterministic parts of the system end-to-end without paying
	for Gemini calls. Walks each fixture, builds the calculations the Medical
	agent would emit, runs :func:`agents.check_plan` over a hand-built plan,
	and asserts that the per-fixture ``expected`` dict matches.

	This is the same ``check_plan`` the Planner runs internally after the LP
	solver — keeping the eval surface aligned with production behaviour.

	Two ways to invoke:

	* As a script: ``python -m evals.runner`` -> prints a per-fixture report.
	* From pytest: imported and called by ``tests/test_evals.py``.
	"""

	from __future__ import annotations

	import sys
	from dataclasses import dataclass, field
	from typing import Any, Dict, List

	from agents import check_plan
	from evals.fixtures import all_fixtures
	from guardrails import detect_prompt_injection
	from nutrition_formulas import full_assessment
	from observability import get_metrics
	from utils import get_parse_metrics


	@dataclass
	class FixtureResult:
	    """Outcome of evaluating one fixture in the offline run.

	    Instances are created by ``run_offline`` and consumed by
	    ``print_report``. The two collection fields use ``default_factory`` so
	    every instance owns its own list/dict.
	    """

	    # Taken from the fixture's ``user_profile["name"]`` by ``run_offline``.
	    name: str
	    # True when no failure message was recorded for the fixture.
	    passed: bool
	    # Human-readable failure descriptions; empty for a passing fixture.
	    failures: List[str] = field(default_factory=list)
	    # Free-form diagnostic values shown next to the status in the report.
	    info: Dict[str, Any] = field(default_factory=dict)


	def _build_assessment(fixture: Dict[str, Any]) -> Dict[str, Any]:
	    """Run ``full_assessment`` on the fixture's user profile.

	    Reads ``weight``, ``height``, ``age``, ``sex``, ``activity_level`` and
	    ``goal`` from ``fixture["user_profile"]`` and forwards them under the
	    keyword names ``full_assessment`` expects. A missing key raises
	    ``KeyError``.
	    """
	    profile = fixture["user_profile"]
	    # Map profile field names onto the formula's keyword arguments.
	    assessment_kwargs = {
	        "weight_kg": profile["weight"],
	        "height_cm": profile["height"],
	        "age_years": profile["age"],
	        "sex": profile["sex"],
	        "activity_level": profile["activity_level"],
	        "goal": profile["goal"],
	    }
	    return full_assessment(**assessment_kwargs)


	def _build_plan_from_assessment(assessment: Dict[str, Any]) -> Dict[str, Any]:
	"""Hand-build a plan that hits the assessment targets within tolerance."""
	target_cal = assessment["daily_target_calories"]
	macros = assessment["macro_targets"]
	return {
	"days": [
	{
	"name": "balanced day",
	"calories": target_cal,
	"protein_g": macros["protein_g"],
	"fat_g": macros["fat_g"],
	"carbohydrates_g": macros["carbohydrates_g"],
	}
	],
	"trace": "eval-fixture: hand-built plan hitting targets exactly.",
	}


	def _check_expected(
	fixture: Dict[str, Any],
	assessment: Dict[str, Any],
	issues: List[Dict[str, Any]],
	requires_human_review: bool,
	) -> List[str]:
	failures: List[str] = []
	expected = fixture.get("expected", {})

	cal = assessment["daily_target_calories"]
	if "min_calories" in expected and cal < expected["min_calories"]:
	failures.append(f"calories {cal} below expected min {expected['min_calories']}")
	if "max_calories" in expected and cal > expected["max_calories"]:
	failures.append(f"calories {cal} above expected max {expected['max_calories']}")

	if "min_protein_g" in expected:
	actual = assessment["macro_targets"]["protein_g"]
	if actual < expected["min_protein_g"]:
	failures.append(
	f"protein_g {actual} below expected min {expected['min_protein_g']}"
	)

	if expected.get("requires_human_review") and not requires_human_review:
	failures.append("expected requires_human_review=True")

	# No deterministic issue should fire for a hand-built plan that hits
	# targets exactly — if one does, the check_plan contract has drifted.
	if issues:
	failures.append(
	f"unexpected post-LP issues on a target-hitting plan: "
	f"{[i['code'] for i in issues]}"
	)

	return failures


	def _first_flagged_verdict(questions: List[str]) -> Any:
	    """Return the first injection verdict with ``is_attempt`` set, else ``None``."""
	    for question in questions:
	        verdict = detect_prompt_injection(question)
	        if verdict.is_attempt:
	            return verdict
	    return None


	def run_offline() -> List[FixtureResult]:
	    """Evaluate every fixture without calling the model.

	    For each fixture returned by ``all_fixtures()``:

	    1. Screen its questions with ``detect_prompt_injection``; a flagged
	       question fails the fixture immediately and skips the later steps.
	    2. Compute the assessment from the user profile.
	    3. Build a plan that hits the assessment targets, run ``check_plan``
	       on it, and compare the outcome with the fixture's ``expected``.

	    Returns:
	        One ``FixtureResult`` per fixture, in fixture order.
	    """
	    results: List[FixtureResult] = []
	    for fixture in all_fixtures():
	        name = fixture["user_profile"]["name"]

	        # 1. Sanity-check the user questions for prompt injection.
	        flagged = _first_flagged_verdict(fixture["questions"])
	        if flagged is not None:
	            results.append(
	                FixtureResult(
	                    name=name,
	                    passed=False,
	                    failures=[
	                        f"fixture question flagged as injection: {flagged.matches}"
	                    ],
	                )
	            )
	            continue

	        # 2. Compute the assessment deterministically.
	        assessment = _build_assessment(fixture)

	        # 3. Hand-build a plan that hits targets and run the deterministic
	        #    plan check.
	        plan = _build_plan_from_assessment(assessment)

	        # ``expected`` is optional: _check_expected already reads it with
	        # .get(), so a fixture without it must not raise KeyError here.
	        expected = fixture.get("expected", {})
	        # NOTE(review): this flag is read from ``expected`` itself, so the
	        # requires_human_review comparison in _check_expected cannot fail
	        # on this path. Confirm whether it should come from the assessment.
	        requires_human_review = expected.get("requires_human_review", False)

	        memory: Dict[str, Any] = {
	            "user_profile": fixture["user_profile"],
	            "medical_history": fixture["medical_history"],
	            "flags_and_assessments": {
	                "assessment_status": "assessment_complete",
	                "calculations": assessment,
	                "flags": fixture["medical_history"]["conditions"],
	                "recommendations": [],
	                "requires_professional_consultation": requires_human_review,
	            },
	            "plans": {"current_plan": plan},
	        }

	        issues = check_plan(plan, memory)

	        failures = _check_expected(fixture, assessment, issues, requires_human_review)
	        results.append(
	            FixtureResult(
	                name=name,
	                passed=not failures,
	                failures=failures,
	                info={
	                    "calories": assessment["daily_target_calories"],
	                    "protein_g": assessment["macro_targets"]["protein_g"],
	                    "issues": [issue["code"] for issue in issues],
	                    "requires_human_review": requires_human_review,
	                },
	            )
	        )
	    return results


	def print_report(results: List[FixtureResult]) -> int:
	    """Print a per-fixture report plus run metrics and return an exit code.

	    Each result gets a ``[PASS]``/``[FAIL]`` line with its ``info`` dict,
	    followed by one line per failure message. A summary, the parse metrics
	    and the agent timings come last.

	    Returns:
	        ``0`` when every result passed, ``1`` otherwise.
	    """
	    total = len(results)
	    passed_count = 0
	    for result in results:
	        if result.passed:
	            passed_count += 1
	            label = "[PASS]"
	        else:
	            label = "[FAIL]"
	        print(f"{label} {result.name} -> {result.info}")
	        for message in result.failures:
	            print(f" - {message}")

	    # Fetch both metric sources before printing the trailer.
	    pm = get_parse_metrics()
	    snapshot = get_metrics().snapshot()
	    print()
	    print(f"Summary: {passed_count}/{total} fixtures passed.")
	    print(
	        f"Parse metrics — native={pm.native_parses} fallback={pm.fallback_parses} "
	        f"failure={pm.schema_failures}"
	    )
	    print(f"Agent timings: {snapshot['agents']}")

	    if passed_count == total:
	        return 0
	    return 1


	def main() -> int:  # pragma: no cover
	    """Run the offline evals, print the report, and return its exit code."""
	    results = run_offline()
	    return print_report(results)


	if __name__ == "__main__":  # pragma: no cover
	    # Script entry point: hand the report's status code to the shell.
	    exit_status = main()
	    sys.exit(exit_status)