JustinTX
/

sky2

Model card Files Files and versions

sky2 / skydiscover /evaluation /__init__.py

JustinTX's picture

Add files using upload-large-folder tool

7f611c5 verified about 1 month ago

history blame contribute delete

3.03 kB

	"""Program evaluation framework.

	Evaluator hierarchy (pick one per benchmark)::

	    Evaluator                  Python function: evaluate(program_path) -> dict
	    ContainerizedEvaluator     Docker: Dockerfile + evaluate.sh -> JSON on stdout
	    └── HarborEvaluator        Harbor protocol: instruction.md + tests/test.sh + environment/

	``create_evaluator()`` auto-detects which one to use based on the benchmark
	directory contents. Detection order: Harbor > Containerized > Python.

	Supporting modules:

	- ``evaluation_result`` — ``EvaluationResult`` dataclass (metrics + artifacts)
	- ``llm_judge`` — optional LLM-as-a-judge scorer (adds ``llm_*`` metrics)
	- ``wrapper`` — CLI bridge so old ``evaluate(path)->dict`` functions
	  can emit container-protocol JSON on stdout
	"""

	import os
	from typing import Dict, Optional, Union

	from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
	from skydiscover.evaluation.evaluation_result import EvaluationResult
	from skydiscover.evaluation.evaluator import Evaluator
	from skydiscover.evaluation.harbor_evaluator import HarborEvaluator
	from skydiscover.evaluation.llm_judge import LLMJudge

	# Names exported by ``from skydiscover.evaluation import *``: the classes
	# imported above plus the ``create_evaluator`` factory defined in this module.
	__all__ = [
	"EvaluationResult",
	"Evaluator",
	"ContainerizedEvaluator",
	"HarborEvaluator",
	"LLMJudge",
	"create_evaluator",
	]


	def _is_harbor_task(path: str) -> bool:
	    """Detect a Harbor task directory.

	    A directory qualifies when it contains all of these regular files:
	    ``instruction.md``, ``tests/test.sh`` and ``environment/Dockerfile``.

	    Args:
	        path: Candidate benchmark path. An empty string, or anything that is
	            not a directory, never matches.

	    Returns:
	        ``True`` if every marker file is present, ``False`` otherwise.
	    """
	    if not os.path.isdir(path):
	        return False

	    marker_files = (
	        ("instruction.md",),
	        ("tests", "test.sh"),
	        ("environment", "Dockerfile"),
	    )
	    # ``isfile`` rather than ``exists``: a directory that merely shares a
	    # marker's name (e.g. ``environment/Dockerfile/``) must not count. It also
	    # implies that the parent ``tests``/``environment`` directories exist.
	    return all(
	        os.path.isfile(os.path.join(path, *parts)) for parts in marker_files
	    )


	def _is_containerized(path: str) -> bool:
	    """Detect a standard containerized benchmark directory.

	    A directory qualifies when it contains both a ``Dockerfile`` and an
	    ``evaluate.sh`` as regular files.

	    Args:
	        path: Candidate benchmark path. An empty string, or anything that is
	            not a directory, never matches.

	    Returns:
	        ``True`` if both marker files are present, ``False`` otherwise.
	    """
	    # ``isfile`` rather than ``exists``: a sub-directory that happens to be
	    # called ``Dockerfile`` or ``evaluate.sh`` must not trigger detection.
	    return os.path.isdir(path) and all(
	        os.path.isfile(os.path.join(path, marker))
	        for marker in ("Dockerfile", "evaluate.sh")
	    )


	def create_evaluator(
	    config,
	    llm_judge: Optional[LLMJudge] = None,
	    max_concurrent: int = 4,
	    env_vars: Optional[Dict[str, str]] = None,
	) -> Union[Evaluator, ContainerizedEvaluator, HarborEvaluator]:
	    """Build the evaluator that matches the benchmark named by ``config``.

	    ``config.evaluation_file`` (treated as ``""`` when falsy) is probed by the
	    detectors below in order, and the first match wins:

	    1. Harbor task — instruction.md + tests/test.sh + environment/Dockerfile
	       -> ``HarborEvaluator``
	    2. Containerized — Dockerfile + evaluate.sh -> ``ContainerizedEvaluator``
	    3. Anything else — Python ``Evaluator`` (fallback)

	    ``max_concurrent`` and ``env_vars`` are forwarded to whichever evaluator is
	    built. ``llm_judge`` is passed only to the Python ``Evaluator``; the two
	    container-based evaluators are constructed without it.
	    """
	    benchmark_path = config.evaluation_file or ""
	    shared_options = {"max_concurrent": max_concurrent, "env_vars": env_vars}

	    # Ordered most specific first: a Harbor task must be tried before the
	    # generic containerized layout.
	    container_backends = (
	        (_is_harbor_task, HarborEvaluator),
	        (_is_containerized, ContainerizedEvaluator),
	    )
	    for matches, evaluator_cls in container_backends:
	        if matches(benchmark_path):
	            return evaluator_cls(benchmark_path, config, **shared_options)

	    return Evaluator(config, llm_judge=llm_judge, **shared_options)