"""Tests for ContainerizedEvaluator — pure logic that doesn't need Docker."""

import json
from unittest.mock import patch

import pytest

from skydiscover.evaluation.container_evaluator import ContainerizedEvaluator
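
# ------------------------------------------------------------------
# Expected stdout protocol (inferred from the assertions in this
# module; treat this as a sketch of the contract, not its source of
# truth):
#
#   {
#       "status": "success" | "error" | "timeout",
#       "combined_score": <number>,           # promoted into metrics
#       "metrics": {name: <number>, ...},     # non-numeric values dropped
#       "artifacts": {name: <value>, ...},    # optional
#   }
#
# Output that is not valid JSON yields metrics == {"error": 0.0} with
# the raw stdout preserved under artifacts["raw_output"].
# ------------------------------------------------------------------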


@pytest.fixture
def parse_output():
    """Return a bound _parse_output method without starting a real container."""
    inst = object.__new__(ContainerizedEvaluator)
    return inst._parse_output


# ------------------------------------------------------------------
# _parse_output: success cases
# ------------------------------------------------------------------


class TestParseOutputSuccess:
    def test_full_valid_response(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.85,
            "metrics": {"combined_score": 0.85, "accuracy": 0.9, "speed": 0.8},
            "artifacts": {"feedback": "good job"},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.85
        assert result.metrics["accuracy"] == 0.9
        assert result.metrics["speed"] == 0.8
        assert result.artifacts["feedback"] == "good job"
        assert "status" not in result.artifacts

    def test_combined_score_promoted_to_metrics(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.5,
            "metrics": {"accuracy": 0.5},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.5
        assert result.metrics["accuracy"] == 0.5

    def test_no_artifacts(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 1.0,
            "metrics": {"combined_score": 1.0},
        })
        assert parse_output(stdout).artifacts == {}

    def test_integer_metrics_converted_to_float(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 1,
            "metrics": {"n_correct": 5, "n_total": 5},
        })
        result = parse_output(stdout)
        assert result.metrics["n_correct"] == 5.0
        assert isinstance(result.metrics["n_correct"], float)

    def test_non_numeric_metrics_filtered(self, parse_output):
        stdout = json.dumps({
            "status": "success",
            "combined_score": 0.5,
            "metrics": {"combined_score": 0.5, "label": "fast", "count": 3},
        })
        result = parse_output(stdout)
        assert "label" not in result.metrics
        assert result.metrics["count"] == 3.0

    def test_trailing_whitespace_stripped(self, parse_output):
        stdout = json.dumps({"status": "success", "combined_score": 0.7, "metrics": {}}) + "\n\n"
        assert parse_output(stdout).metrics["combined_score"] == 0.7


# ------------------------------------------------------------------
# _parse_output: error / edge cases
# ------------------------------------------------------------------


class TestParseOutputErrors:
    def test_malformed_json(self, parse_output):
        result = parse_output("not json at all")
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts

    def test_empty_string(self, parse_output):
        result = parse_output("")
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts

    def test_error_status_surfaces_in_artifacts(self, parse_output):
        stdout = json.dumps({
            "status": "error",
            "combined_score": 0.0,
            "metrics": {"combined_score": 0.0},
            "artifacts": {"error": "segfault"},
        })
        result = parse_output(stdout)
        assert result.metrics["combined_score"] == 0.0
        assert result.artifacts["status"] == "error"
        assert result.artifacts["error"] == "segfault"

    def test_timeout_status(self, parse_output):
        stdout = json.dumps({"status": "timeout", "combined_score": 0.0, "metrics": {}})
        assert parse_output(stdout).artifacts["status"] == "timeout"

    def test_missing_status_defaults_to_error(self, parse_output):
        stdout = json.dumps({"combined_score": 0.5, "metrics": {"combined_score": 0.5}})
        assert parse_output(stdout).artifacts["status"] == "error"

    def test_missing_combined_score_defaults_to_zero(self, parse_output):
        stdout = json.dumps({"status": "success", "metrics": {}})
        assert parse_output(stdout).metrics["combined_score"] == 0.0

    def test_missing_metrics_dict(self, parse_output):
        stdout = json.dumps({"status": "success", "combined_score": 0.3})
        assert parse_output(stdout).metrics["combined_score"] == 0.3

    def test_partial_json_truncated(self, parse_output):
        result = parse_output('{"status": "suc')
        assert result.metrics["error"] == 0.0
        assert "raw_output" in result.artifacts


# ------------------------------------------------------------------
# llm_judge attribute
# ------------------------------------------------------------------


class TestLlmJudgeAttribute:
    def test_init_sets_llm_judge_to_none(self):
        """ContainerizedEvaluator.__init__ must set self.llm_judge before Docker calls."""
        from skydiscover.config import EvaluatorConfig

        with patch.object(ContainerizedEvaluator, "_build_image", return_value="fake:latest"), \
             patch.object(ContainerizedEvaluator, "_start_container", return_value="abc123"):
            # Bypass normal construction, then invoke __init__ explicitly so
            # the patched Docker hooks are the only external calls exercised.
            inst = ContainerizedEvaluator.__new__(ContainerizedEvaluator)
            ContainerizedEvaluator.__init__(inst, "/tmp/fake", EvaluatorConfig())
            assert inst.llm_judge is None