sky2 / skydiscover /evaluation /__init__.py
JustinTX's picture
Add files using upload-large-folder tool
7f611c5 verified
"""Program evaluation framework.
Evaluator hierarchy (pick one per benchmark)::
Evaluator Python function: evaluate(program_path) -> dict
ContainerizedEvaluator Docker: Dockerfile + evaluate.sh -> JSON on stdout
└── HarborEvaluator Harbor protocol: instruction.md + tests/test.sh + environment/
``create_evaluator()`` auto-detects which one to use based on the benchmark
directory contents. Detection order: Harbor > Containerized > Python.
Supporting modules:
- ``evaluation_result`` — ``EvaluationResult`` dataclass (metrics + artifacts)
- ``llm_judge`` — optional LLM-as-a-judge scorer (adds ``llm_*`` metrics)
- ``wrapper`` — CLI bridge so old ``evaluate(path)->dict`` functions
  can emit container-protocol JSON on stdout
"""
import os
from typing import Dict, Optional, Union
from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
from skydiscover.evaluation.evaluation_result import EvaluationResult
from skydiscover.evaluation.evaluator import Evaluator
from skydiscover.evaluation.harbor_evaluator import HarborEvaluator
from skydiscover.evaluation.llm_judge import LLMJudge
__all__ = [
"EvaluationResult",
"Evaluator",
"ContainerizedEvaluator",
"HarborEvaluator",
"LLMJudge",
"create_evaluator",
]
def _is_harbor_task(path: str) -> bool:
    """Report whether *path* is laid out as a Harbor task directory.

    A match requires *path* to be a directory that contains
    ``instruction.md``, ``tests/test.sh``, and an ``environment``
    directory holding a ``Dockerfile``.
    """
    if not os.path.isdir(path):
        return False

    # Entries that must exist directly under the task root.
    for parts in (("instruction.md",), ("tests", "test.sh")):
        if not os.path.exists(os.path.join(path, *parts)):
            return False

    # The build context must be a real directory, not just a path prefix.
    env_dir = os.path.join(path, "environment")
    if not os.path.isdir(env_dir):
        return False

    return os.path.exists(os.path.join(env_dir, "Dockerfile"))
def _is_containerized(path: str) -> bool:
    """Report whether *path* is a standard containerized benchmark directory.

    A match requires *path* to be a directory containing both a
    ``Dockerfile`` and an ``evaluate.sh`` entry.
    """
    if not os.path.isdir(path):
        return False

    required_entries = ("Dockerfile", "evaluate.sh")
    return all(os.path.exists(os.path.join(path, entry)) for entry in required_entries)
def create_evaluator(
    config,
    llm_judge: Optional[LLMJudge] = None,
    max_concurrent: int = 4,
    env_vars: Optional[Dict[str, str]] = None,
) -> Union[Evaluator, ContainerizedEvaluator, HarborEvaluator]:
    """Build the evaluator that matches the benchmark referenced by *config*.

    ``config.evaluation_file`` (treated as ``""`` when falsy) is probed,
    most specific layout first:

    1. Harbor task — ``_is_harbor_task`` matches -> ``HarborEvaluator``
    2. Containerized — ``_is_containerized`` matches -> ``ContainerizedEvaluator``
    3. Python evaluator — fallback -> ``Evaluator``

    ``llm_judge`` is forwarded only to the fallback ``Evaluator``; the two
    container-based evaluators are constructed without it.
    """
    target = config.evaluation_file or ""

    # Keyword arguments every evaluator flavour receives.
    shared_kwargs = {"max_concurrent": max_concurrent, "env_vars": env_vars}

    if _is_harbor_task(target):
        return HarborEvaluator(target, config, **shared_kwargs)

    if _is_containerized(target):
        return ContainerizedEvaluator(target, config, **shared_kwargs)

    return Evaluator(config, llm_judge=llm_judge, **shared_kwargs)