# tests/evaluation/test_container_evaluator.py
"""Tests for ContainerizedEvaluator — pure logic that doesn't need Docker."""
import json
from unittest.mock import patch
import pytest
from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
@pytest.fixture
def parse_output():
"""Return a bound _parse_output method without starting a real container."""
inst = object.__new__(ContainerizedEvaluator)
return inst._parse_output
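

# ------------------------------------------------------------------
# Reference sketch (not used by the tests)
# ------------------------------------------------------------------
# A minimal sketch of the parsing contract the tests below pin down. The real
# implementation is ContainerizedEvaluator._parse_output; `_SketchResult` and
# `_sketch_parse_output` are hypothetical names introduced here purely for
# orientation, inferred from the assertions in this file.
from dataclasses import dataclass, field


@dataclass
class _SketchResult:
    metrics: dict = field(default_factory=dict)
    artifacts: dict = field(default_factory=dict)


def _sketch_parse_output(stdout: str) -> _SketchResult:
    try:
        data = json.loads(stdout.strip())
    except json.JSONDecodeError:
        # Fail soft: undecodable stdout becomes an error metric plus the raw text.
        return _SketchResult(metrics={"error": 0.0}, artifacts={"raw_output": stdout})
    # Keep numeric metric values only, coercing ints to float.
    metrics = {k: float(v) for k, v in data.get("metrics", {}).items()
               if isinstance(v, (int, float)) and not isinstance(v, bool)}
    # combined_score is always promoted into metrics, defaulting to 0.0.
    metrics["combined_score"] = float(data.get("combined_score", 0.0))
    artifacts = dict(data.get("artifacts", {}))
    # Anything but an explicit "success" (including a missing status) is surfaced.
    status = data.get("status", "error")
    if status != "success":
        artifacts["status"] = status
    return _SketchResult(metrics=metrics, artifacts=artifacts)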


# ------------------------------------------------------------------
# _parse_output: success cases
# ------------------------------------------------------------------
class TestParseOutputSuccess:

    def test_full_valid_response(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.85,
            "metrics": {"combined_score": 0.85, "accuracy": 0.9, "speed": 0.8},
            "artifacts": {"feedback": "good job"},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.85
        assert result.metrics["accuracy"] == 0.9
        assert result.metrics["speed"] == 0.8
        assert result.artifacts["feedback"] == "good job"
        assert "status" not in result.artifacts

    def test_combined_score_promoted_to_metrics(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.5,
            "metrics": {"accuracy": 0.5},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.5
        assert result.metrics["accuracy"] == 0.5

    def test_no_artifacts(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 1.0,
            "metrics": {"combined_score": 1.0},
        })
        assert parse_output(stdout).artifacts == {}

    def test_integer_metrics_converted_to_float(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 1,
            "metrics": {"n_correct": 5, "n_total": 5},
        })
        result = parse_output(stdout)
        assert result.metrics["n_correct"] == 5.0
        assert isinstance(result.metrics["n_correct"], float)

    def test_non_numeric_metrics_filtered(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.5,
            "metrics": {"combined_score": 0.5, "label": "fast", "count": 3},
        })
        result = parse_output(stdout)
        assert "label" not in result.metrics
        assert result.metrics["count"] == 3.0

    def test_trailing_whitespace_stripped(self, parse_output):
        stdout = json.dumps({"status": "success", "combined_score": 0.7, "metrics": {}}) + "\n\n"
        assert parse_output(stdout).metrics["combined_score"] == 0.7


# ------------------------------------------------------------------
# _parse_output: error / edge cases
# ------------------------------------------------------------------
class TestParseOutputErrors:

    def test_malformed_json(self, parse_output):
        result = parse_output("not json at all")
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts

    def test_empty_string(self, parse_output):
        result = parse_output("")
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts

    def test_error_status_surfaces_in_artifacts(self, parse_output):
        stdout = json.dumps({
            "status": "error",
            "combined_score": 0.0,
            "metrics": {"combined_score": 0.0},
            "artifacts": {"error": "segfault"},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.0
        assert result.artifacts["status"] == "error"
        assert result.artifacts["error"] == "segfault"

    def test_timeout_status(self, parse_output):
        stdout = json.dumps({"status": "timeout", "combined_score": 0.0, "metrics": {}})
        assert parse_output(stdout).artifacts["status"] == "timeout"

    def test_missing_status_defaults_to_error(self, parse_output):
        stdout = json.dumps({"combined_score": 0.5, "metrics": {"combined_score": 0.5}})
        assert parse_output(stdout).artifacts["status"] == "error"

    def test_missing_combined_score_defaults_to_zero(self, parse_output):
        stdout = json.dumps({"status": "success", "metrics": {}})
        assert parse_output(stdout).metrics["combined_score"] == 0.0

    def test_missing_metrics_dict(self, parse_output):
        stdout = json.dumps({"status": "success", "combined_score": 0.3})
        assert parse_output(stdout).metrics["combined_score"] == 0.3

    def test_partial_json_truncated(self, parse_output):
        result = parse_output('{"status": "suc')
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts


# ------------------------------------------------------------------
# llm_judge attribute
# ------------------------------------------------------------------
class TestLlmJudgeAttribute:

    def test_init_sets_llm_judge_to_none(self):
        """ContainerizedEvaluator.__init__ must set self.llm_judge before Docker calls."""
        with patch.object(ContainerizedEvaluator, "_build_image", return_value="fake:latest"), \
             patch.object(ContainerizedEvaluator, "_start_container", return_value="abc123"):
            from skydiscover.config import EvaluatorConfig
            inst = ContainerizedEvaluator.__new__(ContainerizedEvaluator)
            ContainerizedEvaluator.__init__(inst, "/tmp/fake", EvaluatorConfig())
            assert inst.llm_judge is None
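

if __name__ == "__main__":
    # Convenience entry point so the file can be executed directly; equivalent
    # to running `pytest tests/evaluation/test_container_evaluator.py`.
    raise SystemExit(pytest.main([__file__]))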