elder-care-copilot / src /app_kit /eval_runner.py
Abhishek
Add all folders and files
f9a9b47
Raw
History Blame Contribute Delete
4.2 kB
from __future__ import annotations

import argparse
import importlib
import json
import logging
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from .config import load_app_config
from .demo_packs import load_demo_pack
from .logging_utils import setup_logging
from .storage import SQLiteStore
from .tracing import utc_now, write_trace_artifact
@dataclass
class EvalResult:
    """Outcome of evaluating one demo pack against one project.

    Attributes:
        project: Key of the project spec that was evaluated.
        pack_id: Identifier of the demo pack that was run.
        passed: ``True`` when the comparison produced no findings.
        findings: Human-readable mismatch messages, one per failed expectation.
        result: The mapping returned by the project's pack run.
        trace_path: String form of the path of the written trace artifact.
    """

    project: str
    pack_id: str
    passed: bool
    findings: list[str]
    result: dict[str, Any]
    trace_path: str
def _expected_subset(expected: dict[str, Any], actual: dict[str, Any]) -> list[str]:
issues = []
for key, value in expected.items():
if key not in actual:
issues.append(f'missing key: {key}')
elif actual[key] != value:
issues.append(f'{key}: expected {value!r}, got {actual[key]!r}')
return issues
def run_eval_for_project(project_module: str, pack_path: str | Path, db_path: str | Path | None = None) -> EvalResult:
    """Run one demo pack through a project and compare the output to the pack's expectations.

    The project module is imported by its dotted path and must expose a
    ``create_project_spec()`` factory. The pack is run against a
    ``SQLiteStore``, the result is checked against ``pack.expected_signals``
    with ``_expected_subset``, and a trace artifact describing the run is
    written to the configured artifact directory.

    Args:
        project_module: Dotted import path of the module that provides
            ``create_project_spec``.
        pack_path: Location of the demo pack, passed to ``load_demo_pack``.
        db_path: Optional SQLite path that replaces the configured
            ``sqlite_path`` for this run.

    Returns:
        An ``EvalResult`` holding the pass/fail verdict, the findings, the raw
        run result and the trace artifact path (as a string).

    Raises:
        ImportError: If ``project_module`` cannot be imported.
        AttributeError: If the module has no ``create_project_spec``.
        Exception: Anything raised while running the pack or writing the
            trace propagates after the store has been closed.
    """
    # import_module returns the leaf module of a dotted path directly, which
    # is what the ``__import__(..., fromlist=[...])`` idiom was emulating.
    mod = importlib.import_module(project_module)
    spec = mod.create_project_spec()
    pack = load_demo_pack(pack_path)
    config = load_app_config(project_key=spec.key, data_subdir=spec.data_subdir)
    if db_path is not None:
        # Rebuild the config with only ``sqlite_path`` swapped out.
        # NOTE(review): any config field not listed here would be dropped;
        # if the config class is a dataclass, ``dataclasses.replace`` may be
        # a safer way to do this -- confirm against the config module.
        config = type(config)(
            project_key=config.project_key,
            app_mode=config.app_mode,
            root_dir=config.root_dir,
            data_dir=config.data_dir,
            sqlite_path=Path(db_path),
            artifact_dir=config.artifact_dir,
            cache_dir=config.cache_dir,
            model_registry_path=config.model_registry_path,
        )
    started_at = utc_now()
    store = SQLiteStore(config.sqlite_path, config.artifact_dir)
    try:
        result = spec.run_pack(pack, store, config)
        findings = _expected_subset(pack.expected_signals, result)
        # The eval passes only when no expectation was violated.
        passed = not findings
        finished_at = utc_now()
        trace_path = write_trace_artifact(
            config.artifact_dir,
            {
                'kind': 'eval',
                'project': spec.key,
                'pack_id': pack.pack_id,
                'pack_path': str(pack_path),
                'started_at': started_at,
                'finished_at': finished_at,
                'passed': passed,
                'findings': findings,
                'result': result,
            },
        )
    finally:
        # Always release the store, whether the run succeeded or raised.
        store.close()
    return EvalResult(
        project=spec.key,
        pack_id=pack.pack_id,
        passed=passed,
        findings=findings,
        result=result,
        trace_path=str(trace_path),
    )
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: run one eval and print its result as JSON.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour, so
            existing ``main()`` callers are unaffected; passing a list lets
            the CLI be driven programmatically.

    Returns:
        ``0`` when the eval passed, ``1`` when it produced findings.

    Raises:
        SystemExit: Raised by ``argparse`` for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Run golden-scenario evals for the ALL4 kit')
    parser.add_argument('project_module', help='Python module path, e.g. apps.p1_elder_paperwork.app')
    parser.add_argument('pack_path', help='Path to a demo pack folder')
    parser.add_argument('--db-path', help='Optional SQLite path for the run')
    parser.add_argument(
        '--json-only',
        action='store_true',
        help='Emit exactly one JSON object to stdout (no logging, no pretty-print).',
    )
    parser.add_argument(
        '--quiet',
        '--no-log',
        dest='quiet',
        action='store_true',
        help='Disable JSONL logging (useful when piping stdout).',
    )
    parser.add_argument(
        '--log-level',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        default='INFO',
        help='Logging threshold for the JSONL status line.',
    )
    args = parser.parse_args(argv)

    # Logging goes to stderr and is skipped entirely for --quiet/--json-only,
    # so stdout only ever carries the JSON result.
    logger = None
    if not args.quiet and not args.json_only:
        logger = setup_logging('app_kit.eval_runner', level=getattr(logging, args.log_level), stream=sys.stderr)

    result = run_eval_for_project(args.project_module, args.pack_path, args.db_path)

    if logger is not None:
        logger.info('eval completed: %s', json.dumps(result.__dict__, ensure_ascii=False))

    if args.json_only:
        # Compact single-line form for machine consumers.
        print(json.dumps(result.__dict__, ensure_ascii=False))
    else:
        print(json.dumps(result.__dict__, indent=2, ensure_ascii=False))

    # Exit status mirrors the eval verdict so shells and CI can gate on it.
    return 0 if result.passed else 1
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())