File size: 3,029 Bytes

7f611c5

"""Program evaluation framework.

Evaluator hierarchy (pick one per benchmark)::

    Evaluator                  Python function: evaluate(program_path) -> dict
    ContainerizedEvaluator     Docker: Dockerfile + evaluate.sh  ->  JSON on stdout
    └── HarborEvaluator        Harbor protocol: instruction.md + tests/test.sh + environment/

``create_evaluator()`` auto-detects which one to use based on the benchmark
directory contents.  Detection order: Harbor > Containerized > Python.

Supporting modules:

- ``evaluation_result``  — ``EvaluationResult`` dataclass (metrics + artifacts)
- ``llm_judge``          — optional LLM-as-a-judge scorer (adds ``llm_*`` metrics)
- ``wrapper``            — CLI bridge so old ``evaluate(path)->dict`` functions
                           can emit container-protocol JSON on stdout
"""

import os
from typing import Dict, Optional, Union

from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
from skydiscover.evaluation.evaluation_result import EvaluationResult
from skydiscover.evaluation.evaluator import Evaluator
from skydiscover.evaluation.harbor_evaluator import HarborEvaluator
from skydiscover.evaluation.llm_judge import LLMJudge

# Public names of this module: the result type, the three evaluator classes,
# the optional LLM judge, and the ``create_evaluator`` factory defined below.
# This list is what ``from <this package> import *`` exposes.
__all__ = [
    "EvaluationResult",
    "Evaluator",
    "ContainerizedEvaluator",
    "HarborEvaluator",
    "LLMJudge",
    "create_evaluator",
]


def _is_harbor_task(path: str) -> bool:
    """Detect a Harbor task directory (instruction.md + tests/test.sh + environment/Dockerfile)."""
    return (
        os.path.isdir(path)
        and os.path.exists(os.path.join(path, "instruction.md"))
        and os.path.exists(os.path.join(path, "tests", "test.sh"))
        and os.path.isdir(os.path.join(path, "environment"))
        and os.path.exists(os.path.join(path, "environment", "Dockerfile"))
    )


def _is_containerized(path: str) -> bool:
    """Detect a standard containerized benchmark (Dockerfile + evaluate.sh)."""
    return (
        os.path.isdir(path)
        and os.path.exists(os.path.join(path, "Dockerfile"))
        and os.path.exists(os.path.join(path, "evaluate.sh"))
    )


def create_evaluator(
    config,
    llm_judge: Optional[LLMJudge] = None,
    max_concurrent: int = 4,
    env_vars: Optional[Dict[str, str]] = None,
) -> Union[Evaluator, ContainerizedEvaluator, HarborEvaluator]:
    """Build the evaluator that matches ``config.evaluation_file``.

    The path is probed from the most specific layout to the least:

      1. Harbor task    — ``_is_harbor_task`` (instruction.md +
         tests/test.sh + environment/Dockerfile) -> ``HarborEvaluator``
      2. Containerized  — ``_is_containerized`` (Dockerfile + evaluate.sh)
         -> ``ContainerizedEvaluator``
      3. Anything else  — plain Python ``Evaluator``

    Args:
        config: Configuration object; only ``config.evaluation_file`` is read
            here. A falsy value is replaced by ``""`` before probing. The
            object itself is handed on to the chosen evaluator.
        llm_judge: Optional judge. It is passed to the Python ``Evaluator``
            only; the Harbor and containerized branches do not receive it.
        max_concurrent: Forwarded unchanged to whichever evaluator is built.
        env_vars: Forwarded unchanged to whichever evaluator is built.

    Returns:
        A ``HarborEvaluator``, ``ContainerizedEvaluator`` or ``Evaluator``.
    """
    target = config.evaluation_file or ""
    # Options every evaluator flavour accepts.
    shared_kwargs = {"max_concurrent": max_concurrent, "env_vars": env_vars}

    if _is_harbor_task(target):
        container_cls = HarborEvaluator
    elif _is_containerized(target):
        container_cls = ContainerizedEvaluator
    else:
        # Fallback: the Python evaluator takes no path argument, but it is
        # the only one that is given the LLM judge.
        return Evaluator(config, llm_judge=llm_judge, **shared_kwargs)

    return container_cls(target, config, **shared_kwargs)