| """Program evaluation framework. |
| |
| Evaluator hierarchy (pick one per benchmark):: |
| |
    Evaluator                 Python function: evaluate(program_path) -> dict
    ContainerizedEvaluator    Docker: Dockerfile + evaluate.sh -> JSON on stdout
    └── HarborEvaluator       Harbor protocol: instruction.md + tests/test.sh + environment/
| |
| ``create_evaluator()`` auto-detects which one to use based on the benchmark |
| directory contents. Detection order: Harbor > Containerized > Python. |
| |
| Supporting modules: |
| |
- ``evaluation_result`` — ``EvaluationResult`` dataclass (metrics + artifacts)
- ``llm_judge`` — optional LLM-as-a-judge scorer (adds ``llm_*`` metrics)
- ``wrapper`` — CLI bridge so old ``evaluate(path)->dict`` functions
  can emit container-protocol JSON on stdout
| """ |
|
|
| import os |
| from typing import Dict, Optional, Union |
|
|
| from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator |
| from skydiscover.evaluation.evaluation_result import EvaluationResult |
| from skydiscover.evaluation.evaluator import Evaluator |
| from skydiscover.evaluation.harbor_evaluator import HarborEvaluator |
| from skydiscover.evaluation.llm_judge import LLMJudge |
|
|
| __all__ = [ |
| "EvaluationResult", |
| "Evaluator", |
| "ContainerizedEvaluator", |
| "HarborEvaluator", |
| "LLMJudge", |
| "create_evaluator", |
| ] |
|
|
|
|
def _is_harbor_task(path: str) -> bool:
    """Return True when *path* is a Harbor task directory.

    A Harbor task directory must contain all of the following as regular
    files:

    - ``instruction.md``
    - ``tests/test.sh``
    - ``environment/Dockerfile``

    Args:
        path: Candidate benchmark path. May be empty or point at a file, in
            which case the result is False.

    Returns:
        True only if *path* is a directory and every marker file is present.
    """
    # Keep the explicit directory guard: with an empty path,
    # os.path.join("", name) would resolve the markers against the current
    # working directory instead of rejecting the input.
    if not os.path.isdir(path):
        return False

    # isfile() rather than exists(): a directory that merely shares a marker's
    # name must not make the path look like a Harbor task.  isfile() on a
    # nested path also implies its parent directory exists, so no separate
    # isdir() check on "environment" is needed.
    marker_files = (
        ("instruction.md",),
        ("tests", "test.sh"),
        ("environment", "Dockerfile"),
    )
    return all(os.path.isfile(os.path.join(path, *parts)) for parts in marker_files)
|
|
|
|
def _is_containerized(path: str) -> bool:
    """Return True when *path* is a standard containerized benchmark.

    A containerized benchmark directory contains both ``Dockerfile`` and
    ``evaluate.sh`` as regular files.

    Args:
        path: Candidate benchmark path. May be empty or point at a file, in
            which case the result is False.

    Returns:
        True only if *path* is a directory holding both marker files.
    """
    # Keep the explicit directory guard: with an empty path,
    # os.path.join("", name) would resolve the markers against the current
    # working directory instead of rejecting the input.
    if not os.path.isdir(path):
        return False

    # isfile() rather than exists(): a sub-directory that happens to be named
    # "Dockerfile" or "evaluate.sh" must not count as a marker.
    return all(
        os.path.isfile(os.path.join(path, name))
        for name in ("Dockerfile", "evaluate.sh")
    )
|
|
|
|
def create_evaluator(
    config,
    llm_judge: Optional[LLMJudge] = None,
    max_concurrent: int = 4,
    env_vars: Optional[Dict[str, str]] = None,
) -> Union[Evaluator, ContainerizedEvaluator, HarborEvaluator]:
    """Build the evaluator that matches the benchmark named by *config*.

    ``config.evaluation_file`` is probed from most to least specific:

    1. Harbor task   -> ``HarborEvaluator``
    2. Containerized -> ``ContainerizedEvaluator``
    3. anything else -> Python ``Evaluator`` (fallback)

    Args:
        config: Object exposing ``evaluation_file``; also passed through to
            the chosen evaluator.
        llm_judge: Optional judge. Forwarded only to the Python ``Evaluator``;
            the container-based evaluators are constructed without it.
        max_concurrent: Forwarded to the chosen evaluator.
        env_vars: Forwarded to the chosen evaluator.

    Returns:
        The constructed evaluator instance.
    """
    # A missing/empty evaluation_file becomes "", which neither probe accepts,
    # so the call falls through to the Python evaluator.
    benchmark_path = config.evaluation_file or ""

    # Options every evaluator flavour receives.
    shared_options = {"max_concurrent": max_concurrent, "env_vars": env_vars}

    if _is_harbor_task(benchmark_path):
        return HarborEvaluator(benchmark_path, config, **shared_options)

    if _is_containerized(benchmark_path):
        return ContainerizedEvaluator(benchmark_path, config, **shared_options)

    return Evaluator(config, llm_judge=llm_judge, **shared_options)
|
|