File size: 3,029 Bytes
7f611c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Program evaluation framework.

Evaluator hierarchy (pick one per benchmark)::

    Evaluator                  Python function: evaluate(program_path) -> dict
    ContainerizedEvaluator     Docker: Dockerfile + evaluate.sh  ->  JSON on stdout
    └── HarborEvaluator        Harbor protocol: instruction.md + tests/test.sh + environment/

``create_evaluator()`` auto-detects which one to use based on the benchmark
directory contents.  Detection order: Harbor > Containerized > Python.

Supporting modules:

- ``evaluation_result``  — ``EvaluationResult`` dataclass (metrics + artifacts)
- ``llm_judge``          — optional LLM-as-a-judge scorer (adds ``llm_*`` metrics)
- ``wrapper``            — CLI bridge so old ``evaluate(path)->dict`` functions
                           can emit container-protocol JSON on stdout
"""

import os
from typing import Dict, Optional, Union

from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
from skydiscover.evaluation.evaluation_result import EvaluationResult
from skydiscover.evaluation.evaluator import Evaluator
from skydiscover.evaluation.harbor_evaluator import HarborEvaluator
from skydiscover.evaluation.llm_judge import LLMJudge

__all__ = [
    "EvaluationResult",
    "Evaluator",
    "ContainerizedEvaluator",
    "HarborEvaluator",
    "LLMJudge",
    "create_evaluator",
]


def _is_harbor_task(path: str) -> bool:
    """Tell whether ``path`` is laid out like a Harbor task directory.

    A Harbor task is a directory that contains ``instruction.md``,
    ``tests/test.sh`` and an ``environment/`` sub-directory holding a
    ``Dockerfile``.

    Args:
        path: Filesystem path to probe.

    Returns:
        ``True`` only when every required entry is present, else ``False``.
    """
    if not os.path.isdir(path):
        return False

    environment_dir = os.path.join(path, "environment")
    if not os.path.isdir(environment_dir):
        return False

    required_entries = (
        os.path.join(path, "instruction.md"),
        os.path.join(path, "tests", "test.sh"),
        os.path.join(environment_dir, "Dockerfile"),
    )
    return all(os.path.exists(entry) for entry in required_entries)


def _is_containerized(path: str) -> bool:
    """Tell whether ``path`` is a standard containerized benchmark directory.

    Such a directory holds both a ``Dockerfile`` and an ``evaluate.sh`` at
    its top level.

    Args:
        path: Filesystem path to probe.

    Returns:
        ``True`` when ``path`` is a directory containing both marker files,
        else ``False``.
    """
    if not os.path.isdir(path):
        return False
    return all(
        os.path.exists(os.path.join(path, marker))
        for marker in ("Dockerfile", "evaluate.sh")
    )


def create_evaluator(
    config,
    llm_judge: Optional[LLMJudge] = None,
    max_concurrent: int = 4,
    env_vars: Optional[Dict[str, str]] = None,
) -> Union[Evaluator, ContainerizedEvaluator, HarborEvaluator]:
    """Build the evaluator that matches the benchmark referenced by ``config``.

    ``config.evaluation_file`` (treated as an empty string when falsy) is
    probed on disk and the first matching layout wins:

      1. Harbor task — instruction.md + tests/test.sh + environment/Dockerfile
      2. Containerized — Dockerfile + evaluate.sh
      3. Python evaluator — fallback for everything else

    Args:
        config: Configuration object; its ``evaluation_file`` attribute is
            read here and the object itself is handed to the chosen evaluator.
        llm_judge: Optional judge. It is forwarded only to the Python
            ``Evaluator``; the Harbor and containerized branches do not
            receive it.
        max_concurrent: Forwarded to the evaluator as ``max_concurrent``.
        env_vars: Forwarded to the evaluator as ``env_vars``.

    Returns:
        A ``HarborEvaluator``, ``ContainerizedEvaluator`` or ``Evaluator``
        instance, depending on which layout was detected.
    """
    benchmark_path = config.evaluation_file or ""
    # Keyword arguments every evaluator flavour receives.
    shared_options = {"max_concurrent": max_concurrent, "env_vars": env_vars}

    # Most specific layout first: a Harbor task is checked before the generic
    # containerized layout.
    if _is_harbor_task(benchmark_path):
        return HarborEvaluator(benchmark_path, config, **shared_options)

    if _is_containerized(benchmark_path):
        return ContainerizedEvaluator(benchmark_path, config, **shared_options)

    return Evaluator(config, llm_judge=llm_judge, **shared_options)