sky2 / tests /evaluation /test_harbor_evaluator.py

Add files using upload-large-folder tool

af83196 verified about 1 month ago

15.1 kB

	"""Tests for HarborEvaluator — solution path extraction, task.toml parsing, reward reading, and detection."""

	import json
	import textwrap
	from unittest.mock import patch, MagicMock

	import pytest

	from skydiscover.config import EvaluatorConfig
	from skydiscover.evaluation import _is_harbor_task, _is_containerized
	from skydiscover.evaluation.harbor_evaluator import HarborEvaluator, _DEFAULT_SOLUTION_PATH


	def _make_evaluator(task_dir: str) -> HarborEvaluator:
	    """Build a HarborEvaluator for ``task_dir`` without running ``__init__``.

	    The instance is allocated with ``object.__new__`` so the constructor
	    (and, per the original note, any Docker start-up) is skipped. Only the
	    ``task_dir`` attribute is populated.
	    """
	    evaluator = object.__new__(HarborEvaluator)
	    evaluator.task_dir = task_dir
	    return evaluator


	# ------------------------------------------------------------------
	# task.toml timeout parsing
	# ------------------------------------------------------------------


	class TestTaskTomlTimeout:
	    """Tests for ``HarborEvaluator._apply_task_toml_timeout``.

	    Each test optionally writes a ``task.toml`` into ``tmp_path`` and then
	    checks the ``timeout`` left on a fresh ``EvaluatorConfig``.
	    """

	    @staticmethod
	    def _apply(tmp_path):
	        """Run the timeout hook on a fresh config.

	        Returns:
	            ``(default_timeout, resulting_timeout)``. The default is read
	            from the config *before* the hook runs, so the "keeps default"
	            tests assert that the value is untouched instead of pinning a
	            literal copied from ``EvaluatorConfig``.
	        """
	        config = EvaluatorConfig()
	        default_timeout = config.timeout
	        _make_evaluator(str(tmp_path))._apply_task_toml_timeout(config)
	        return default_timeout, config.timeout

	    def test_reads_verifier_timeout(self, tmp_path):
	        (tmp_path / "task.toml").write_text("[verifier]\ntimeout_sec = 3600\n")
	        _, timeout = self._apply(tmp_path)
	        assert timeout == 3600

	    def test_no_task_toml_keeps_default(self, tmp_path):
	        default_timeout, timeout = self._apply(tmp_path)
	        assert timeout == default_timeout

	    def test_missing_key_keeps_default(self, tmp_path):
	        (tmp_path / "task.toml").write_text("[metadata]\nname = 'test'\n")
	        default_timeout, timeout = self._apply(tmp_path)
	        assert timeout == default_timeout

	    def test_inline_timeout(self, tmp_path):
	        (tmp_path / "task.toml").write_text("timeout_sec = 1200\n")
	        _, timeout = self._apply(tmp_path)
	        assert timeout == 1200

	    def test_malformed_toml_keeps_default(self, tmp_path):
	        # The payload is not valid UTF-8, so the file cannot be read as text.
	        (tmp_path / "task.toml").write_bytes(b"\x80\x81\x82")
	        default_timeout, timeout = self._apply(tmp_path)
	        assert timeout == default_timeout


	# ------------------------------------------------------------------
	# Solution path extraction: solve.sh (tier 1)
	# ------------------------------------------------------------------


	class TestExtractPathFromSolveSh:
	    """Tier-1 extraction: the solution path is read from ``solution/solve.sh``."""

	    def _write_solve_sh(self, tmp_path, content: str):
	        """Write ``content`` to ``solution/solve.sh`` and return an evaluator for ``tmp_path``."""
	        script_dir = tmp_path / "solution"
	        script_dir.mkdir(exist_ok=True)
	        script_file = script_dir / "solve.sh"
	        script_file.write_text(content)
	        return _make_evaluator(str(tmp_path))

	    def test_absolute_cat_redirect(self, tmp_path):
	        script = "cat > /app/solver.py << 'EOF'\nprint('hi')\nEOF\n"
	        evaluator = self._write_solve_sh(tmp_path, script)
	        assert evaluator._extract_path_from_solve_sh() == "/app/solver.py"

	    def test_bare_redirect(self, tmp_path):
	        script = "> /workspace/solution.py << 'EOF'\ncode\nEOF\n"
	        evaluator = self._write_solve_sh(tmp_path, script)
	        assert evaluator._extract_path_from_solve_sh() == "/workspace/solution.py"

	    def test_rust_extension(self, tmp_path):
	        script = "cat > /app/src/main.rs << 'EOF'\nfn main(){}\nEOF\n"
	        evaluator = self._write_solve_sh(tmp_path, script)
	        assert evaluator._extract_path_from_solve_sh() == "/app/src/main.rs"

	    def test_cpp_extension(self, tmp_path):
	        script = "cat > /solution/solve.cpp << 'EOF'\nint main(){}\nEOF\n"
	        evaluator = self._write_solve_sh(tmp_path, script)
	        assert evaluator._extract_path_from_solve_sh() == "/solution/solve.cpp"

	    def test_relative_path_with_cd(self, tmp_path):
	        # A relative redirect target is expected to resolve against the `cd` directory.
	        script = textwrap.dedent(
	            """\
	            #!/bin/bash
	            cd "/workspace/project"
	            cat > src/interfaces/base.rs << 'EOF'
	            code
	            EOF
	            """
	        )
	        evaluator = self._write_solve_sh(tmp_path, script)
	        expected = "/workspace/project/src/interfaces/base.rs"
	        assert evaluator._extract_path_from_solve_sh() == expected

	    def test_relative_path_with_variable_assignment(self, tmp_path):
	        # A relative target is expected to resolve against the absolute path
	        # assigned to a shell variable earlier in the script.
	        script = textwrap.dedent(
	            """\
	            #!/bin/bash
	            RBENCH_DIR="/workspace/rbench_reference"
	            cat > src/main.py << 'EOF'
	            code
	            EOF
	            """
	        )
	        evaluator = self._write_solve_sh(tmp_path, script)
	        expected = "/workspace/rbench_reference/src/main.py"
	        assert evaluator._extract_path_from_solve_sh() == expected

	    def test_relative_path_with_dockerfile_workdir(self, tmp_path):
	        # With no hint inside solve.sh, the Dockerfile WORKDIR is the expected base.
	        solution_dir = tmp_path / "solution"
	        solution_dir.mkdir()
	        (solution_dir / "solve.sh").write_text("cat > solver.py << 'EOF'\ncode\nEOF\n")
	        environment_dir = tmp_path / "environment"
	        environment_dir.mkdir()
	        (environment_dir / "Dockerfile").write_text("FROM python:3.11\nWORKDIR /opt/app\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_solve_sh() == "/opt/app/solver.py"

	    def test_no_solve_sh_returns_empty(self, tmp_path):
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_solve_sh() == ""

	    def test_no_redirect_returns_empty(self, tmp_path):
	        evaluator = self._write_solve_sh(tmp_path, "#!/bin/bash\necho hello\n")
	        assert evaluator._extract_path_from_solve_sh() == ""


	# ------------------------------------------------------------------
	# Solution path extraction: instruction.md (tier 2)
	# ------------------------------------------------------------------


	class TestExtractPathFromInstruction:
	    """Tier-2 extraction: the solution path is read from ``instruction.md``."""

	    def test_backtick_path(self, tmp_path):
	        instruction = tmp_path / "instruction.md"
	        instruction.write_text("Write your solution in `/app/solver.py`.\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_instruction() == "/app/solver.py"

	    def test_quoted_path(self, tmp_path):
	        instruction = tmp_path / "instruction.md"
	        instruction.write_text('Save your code to "/workspace/solve.py".\n')
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_instruction() == "/workspace/solve.py"

	    def test_preposition_path(self, tmp_path):
	        instruction = tmp_path / "instruction.md"
	        instruction.write_text("Place your solution at /opt/solution.py and run it.\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_instruction() == "/opt/solution.py"

	    def test_no_path_returns_empty(self, tmp_path):
	        instruction = tmp_path / "instruction.md"
	        instruction.write_text("Solve this problem efficiently.\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_instruction() == ""

	    def test_no_file_returns_empty(self, tmp_path):
	        # No instruction.md is created at all.
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_path_from_instruction() == ""


	# ------------------------------------------------------------------
	# Full solution path extraction (tier priority)
	# ------------------------------------------------------------------


	class TestExtractSolutionPath:
	    """Tier priority of ``_extract_solution_path``: solve.sh, then instruction.md, then the default."""

	    def test_prefers_solve_sh_over_instruction(self, tmp_path):
	        solution_dir = tmp_path / "solution"
	        solution_dir.mkdir()
	        (solution_dir / "solve.sh").write_text("cat > /from/solve.py << 'EOF'\nEOF\n")
	        (tmp_path / "instruction.md").write_text("Write to `/from/instruction.py`.\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_solution_path() == "/from/solve.py"

	    def test_falls_back_to_instruction(self, tmp_path):
	        instruction = tmp_path / "instruction.md"
	        instruction.write_text("Write to `/from/instruction.py`.\n")
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_solution_path() == "/from/instruction.py"

	    def test_falls_back_to_default(self, tmp_path):
	        # Neither solve.sh nor instruction.md exists in the task directory.
	        evaluator = _make_evaluator(str(tmp_path))
	        assert evaluator._extract_solution_path() == _DEFAULT_SOLUTION_PATH


	# ------------------------------------------------------------------
	# _read_reward
	# ------------------------------------------------------------------


	def _mock_docker_exec(outputs: dict):
	"""Return a side_effect for subprocess.run that fakes `docker exec ... cat <path>`.

	Args:
	outputs: mapping from container path to (returncode, stdout) tuples.
	"""
	def side_effect(cmd, **kwargs):
	# Detect "docker exec <cid> cat <path>" calls.
	if cmd[:2] == ["docker", "exec"] and "cat" in cmd:
	path = cmd[-1]
	if path in outputs:
	rc, stdout = outputs[path]
	return MagicMock(returncode=rc, stdout=stdout)
	return MagicMock(returncode=1, stdout="")
	return side_effect


	class TestReadReward:
	    """Tests for ``HarborEvaluator._read_reward`` with ``subprocess.run`` patched.

	    Float metrics are compared with ``pytest.approx`` so the assertions do not
	    depend on bit-exact float equality.
	    """

	    def _make_inst(self):
	        """Return a bare evaluator (no ``__init__``) with a placeholder container id."""
	        inst = object.__new__(HarborEvaluator)
	        inst.container_id = "fake_container"
	        return inst

	    def _read(self, outputs):
	        """Call ``_read_reward`` while ``subprocess.run`` serves ``outputs``.

	        Args:
	            outputs: mapping from container path to (returncode, stdout),
	                passed straight to ``_mock_docker_exec``.
	        """
	        inst = self._make_inst()
	        with patch("subprocess.run", side_effect=_mock_docker_exec(outputs)):
	            return inst._read_reward()

	    def test_reads_reward_txt(self):
	        result = self._read({
	            "/logs/verifier/reward.json": (1, ""),
	            "/logs/verifier/reward.txt": (0, "0.75\n"),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.75)

	    def test_reads_reward_json_with_reward_key(self):
	        payload = json.dumps({"reward": 0.9, "time_ms": 123})
	        result = self._read({
	            "/logs/verifier/reward.json": (0, payload),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.9)
	        assert result.metrics["time_ms"] == pytest.approx(123.0)

	    def test_reads_reward_json_with_score_key(self):
	        payload = json.dumps({"score": 0.5})
	        result = self._read({
	            "/logs/verifier/reward.json": (0, payload),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.5)

	    def test_json_preferred_over_txt(self):
	        payload = json.dumps({"reward": 0.9})
	        result = self._read({
	            "/logs/verifier/reward.json": (0, payload),
	            "/logs/verifier/reward.txt": (0, "0.1\n"),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.9)

	    def test_missing_reward_key_defaults_to_zero(self):
	        payload = json.dumps({"time_ms": 500})
	        result = self._read({
	            "/logs/verifier/reward.json": (0, payload),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.0)

	    def test_no_reward_files_returns_zero(self):
	        result = self._read({
	            "/logs/verifier/reward.json": (1, ""),
	            "/logs/verifier/reward.txt": (1, ""),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.0)
	        assert "error" in result.artifacts

	    def test_malformed_json_falls_back_to_txt(self):
	        result = self._read({
	            "/logs/verifier/reward.json": (0, "{bad json"),
	            "/logs/verifier/reward.txt": (0, "0.42\n"),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.42)

	    def test_non_numeric_txt_falls_through(self):
	        result = self._read({
	            "/logs/verifier/reward.json": (1, ""),
	            "/logs/verifier/reward.txt": (0, "not a number"),
	        })
	        assert result.metrics["combined_score"] == pytest.approx(0.0)
	        assert "error" in result.artifacts


	# ------------------------------------------------------------------
	# Harbor task detection
	# ------------------------------------------------------------------


	def _make_harbor_dir(tmp_path):
	    """Populate ``tmp_path`` with the minimal Harbor task layout and return it as a string.

	    Creates ``instruction.md``, ``tests/test.sh`` and ``environment/Dockerfile``.
	    """
	    (tmp_path / "instruction.md").write_text("problem")

	    tests_dir = tmp_path / "tests"
	    tests_dir.mkdir()
	    (tests_dir / "test.sh").write_text("#!/bin/bash\n")

	    environment_dir = tmp_path / "environment"
	    environment_dir.mkdir()
	    (environment_dir / "Dockerfile").write_text("FROM python:3.11\n")

	    return str(tmp_path)


	class TestHarborTaskDetection:
	    """``_is_harbor_task`` should accept only a complete Harbor layout.

	    Each negative test builds the layout with exactly one piece left out.
	    """

	    def test_valid_harbor_task(self, tmp_path):
	        task_dir = _make_harbor_dir(tmp_path)
	        assert _is_harbor_task(task_dir) is True

	    def test_missing_instruction_md(self, tmp_path):
	        (tmp_path / "tests").mkdir()
	        environment_dir = tmp_path / "environment"
	        environment_dir.mkdir()
	        (environment_dir / "Dockerfile").write_text("FROM python:3.11\n")
	        assert _is_harbor_task(str(tmp_path)) is False

	    def test_missing_tests_dir(self, tmp_path):
	        (tmp_path / "instruction.md").write_text("problem")
	        environment_dir = tmp_path / "environment"
	        environment_dir.mkdir()
	        (environment_dir / "Dockerfile").write_text("FROM python:3.11\n")
	        assert _is_harbor_task(str(tmp_path)) is False

	    def test_missing_test_sh(self, tmp_path):
	        # The tests/ directory exists but holds no test.sh.
	        (tmp_path / "instruction.md").write_text("problem")
	        (tmp_path / "tests").mkdir()
	        environment_dir = tmp_path / "environment"
	        environment_dir.mkdir()
	        (environment_dir / "Dockerfile").write_text("FROM python:3.11\n")
	        assert _is_harbor_task(str(tmp_path)) is False

	    def test_missing_dockerfile(self, tmp_path):
	        # The environment/ directory exists but holds no Dockerfile.
	        (tmp_path / "instruction.md").write_text("problem")
	        tests_dir = tmp_path / "tests"
	        tests_dir.mkdir()
	        (tests_dir / "test.sh").write_text("#!/bin/bash\n")
	        (tmp_path / "environment").mkdir()
	        assert _is_harbor_task(str(tmp_path)) is False

	    def test_not_a_directory(self, tmp_path):
	        plain_file = tmp_path / "not_a_dir"
	        plain_file.write_text("hi")
	        assert _is_harbor_task(str(plain_file)) is False


	class TestContainerizedDetection:
	    """``_is_containerized`` should require both ``Dockerfile`` and ``evaluate.sh`` at the root."""

	    def test_valid_containerized(self, tmp_path):
	        dockerfile = tmp_path / "Dockerfile"
	        entrypoint = tmp_path / "evaluate.sh"
	        dockerfile.write_text("FROM python:3.11\n")
	        entrypoint.write_text("#!/bin/bash\n")
	        assert _is_containerized(str(tmp_path)) is True

	    def test_missing_evaluate_sh(self, tmp_path):
	        dockerfile = tmp_path / "Dockerfile"
	        dockerfile.write_text("FROM python:3.11\n")
	        assert _is_containerized(str(tmp_path)) is False

	    def test_missing_dockerfile(self, tmp_path):
	        entrypoint = tmp_path / "evaluate.sh"
	        entrypoint.write_text("#!/bin/bash\n")
	        assert _is_containerized(str(tmp_path)) is False


	class TestDetectionPriority:
	    """A dir that matches both Harbor and containerized should be detected as Harbor."""

	    def test_harbor_wins_over_containerized(self, tmp_path):
	        # Harbor layout first, then the containerized markers at the root.
	        _make_harbor_dir(tmp_path)
	        for marker, text in (
	            ("Dockerfile", "FROM python:3.11\n"),
	            ("evaluate.sh", "#!/bin/bash\n"),
	        ):
	            (tmp_path / marker).write_text(text)

	        task_dir = str(tmp_path)
	        assert _is_harbor_task(task_dir) is True
	        assert _is_containerized(task_dir) is True
	        # create_evaluator checks harbor first — this test only confirms that
	        # both detection functions match the same directory, which is why the
	        # ordering inside create_evaluator matters.