File size: 15,120 Bytes
af83196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
"""Tests for HarborEvaluator — solution path extraction, task.toml parsing, reward reading, and detection."""

import json
import textwrap
from unittest.mock import patch, MagicMock

import pytest

from skydiscover.config import EvaluatorConfig
from skydiscover.evaluation import _is_harbor_task, _is_containerized
from skydiscover.evaluation.harbor_evaluator import HarborEvaluator, _DEFAULT_SOLUTION_PATH


def _make_evaluator(task_dir: str) -> HarborEvaluator:
    """Build a bare HarborEvaluator pointing at *task_dir*, without starting Docker.

    The instance is allocated with ``object.__new__`` so ``__init__`` never
    runs; ``task_dir`` is the only attribute that gets populated.
    """
    evaluator = object.__new__(HarborEvaluator)
    evaluator.task_dir = task_dir
    return evaluator


# ------------------------------------------------------------------
# task.toml timeout parsing
# ------------------------------------------------------------------


class TestTaskTomlTimeout:
    """Checks for ``HarborEvaluator._apply_task_toml_timeout``.

    Each case optionally writes a ``task.toml`` into ``tmp_path``, applies it
    to a fresh ``EvaluatorConfig`` and inspects the resulting ``timeout``.
    """

    @staticmethod
    def _timeout_after_apply(task_dir):
        """Apply the task.toml lookup for *task_dir* to a new config and return its timeout."""
        cfg = EvaluatorConfig()
        _make_evaluator(str(task_dir))._apply_task_toml_timeout(cfg)
        return cfg.timeout

    def test_reads_verifier_timeout(self, tmp_path):
        """``timeout_sec`` under ``[verifier]`` ends up as the config timeout."""
        toml_file = tmp_path / "task.toml"
        toml_file.write_text("[verifier]\ntimeout_sec = 3600\n")
        assert self._timeout_after_apply(tmp_path) == 3600

    def test_no_task_toml_keeps_default(self, tmp_path):
        """With no task.toml present the timeout is expected to remain 360."""
        assert self._timeout_after_apply(tmp_path) == 360

    def test_missing_key_keeps_default(self, tmp_path):
        """A task.toml that has no ``timeout_sec`` key leaves the timeout at 360."""
        toml_file = tmp_path / "task.toml"
        toml_file.write_text("[metadata]\nname = 'test'\n")
        assert self._timeout_after_apply(tmp_path) == 360

    def test_inline_timeout(self, tmp_path):
        """A top-level ``timeout_sec`` (outside any table) is picked up as well."""
        toml_file = tmp_path / "task.toml"
        toml_file.write_text("timeout_sec = 1200\n")
        assert self._timeout_after_apply(tmp_path) == 1200

    def test_malformed_toml_keeps_default(self, tmp_path):
        """A task.toml holding raw non-UTF-8 bytes leaves the timeout at 360."""
        toml_file = tmp_path / "task.toml"
        toml_file.write_bytes(b"\x80\x81\x82")
        assert self._timeout_after_apply(tmp_path) == 360


# ------------------------------------------------------------------
# Solution path extraction: solve.sh (tier 1)
# ------------------------------------------------------------------


class TestExtractPathFromSolveSh:
    """Checks for ``HarborEvaluator._extract_path_from_solve_sh`` (tier 1).

    Each case writes a ``solution/solve.sh`` script below ``tmp_path`` and
    compares the extracted path with the expected string.
    """

    def _write_solve_sh(self, tmp_path, content: str):
        """Write *content* to ``solution/solve.sh`` and return an evaluator for *tmp_path*."""
        script = tmp_path / "solution" / "solve.sh"
        script.parent.mkdir(exist_ok=True)
        script.write_text(content)
        return _make_evaluator(str(tmp_path))

    def test_absolute_cat_redirect(self, tmp_path):
        """``cat > <abs path> << 'EOF'`` yields that absolute path."""
        script = "cat > /app/solver.py << 'EOF'\nprint('hi')\nEOF\n"
        evaluator = self._write_solve_sh(tmp_path, script)
        assert evaluator._extract_path_from_solve_sh() == "/app/solver.py"

    def test_bare_redirect(self, tmp_path):
        """A redirect with no leading command is recognised too."""
        script = "> /workspace/solution.py << 'EOF'\ncode\nEOF\n"
        evaluator = self._write_solve_sh(tmp_path, script)
        assert evaluator._extract_path_from_solve_sh() == "/workspace/solution.py"

    def test_rust_extension(self, tmp_path):
        """A ``.rs`` target is extracted."""
        script = "cat > /app/src/main.rs << 'EOF'\nfn main(){}\nEOF\n"
        evaluator = self._write_solve_sh(tmp_path, script)
        assert evaluator._extract_path_from_solve_sh() == "/app/src/main.rs"

    def test_cpp_extension(self, tmp_path):
        """A ``.cpp`` target is extracted."""
        script = "cat > /solution/solve.cpp << 'EOF'\nint main(){}\nEOF\n"
        evaluator = self._write_solve_sh(tmp_path, script)
        assert evaluator._extract_path_from_solve_sh() == "/solution/solve.cpp"

    def test_relative_path_with_cd(self, tmp_path):
        """A relative target is expected to resolve against the preceding ``cd`` directory."""
        script = (
            "#!/bin/bash\n"
            'cd "/workspace/project"\n'
            "cat > src/interfaces/base.rs << 'EOF'\n"
            "code\n"
            "EOF\n"
        )
        evaluator = self._write_solve_sh(tmp_path, script)
        expected = "/workspace/project/src/interfaces/base.rs"
        assert evaluator._extract_path_from_solve_sh() == expected

    def test_relative_path_with_variable_assignment(self, tmp_path):
        """A relative target is expected to resolve against a directory assigned to a variable."""
        script = (
            "#!/bin/bash\n"
            'RBENCH_DIR="/workspace/rbench_reference"\n'
            "cat > src/main.py << 'EOF'\n"
            "code\n"
            "EOF\n"
        )
        evaluator = self._write_solve_sh(tmp_path, script)
        expected = "/workspace/rbench_reference/src/main.py"
        assert evaluator._extract_path_from_solve_sh() == expected

    def test_relative_path_with_dockerfile_workdir(self, tmp_path):
        """A relative target is expected to resolve against the Dockerfile ``WORKDIR``."""
        evaluator = self._write_solve_sh(tmp_path, "cat > solver.py << 'EOF'\ncode\nEOF\n")
        env_dir = tmp_path / "environment"
        env_dir.mkdir()
        (env_dir / "Dockerfile").write_text("FROM python:3.11\nWORKDIR /opt/app\n")
        assert evaluator._extract_path_from_solve_sh() == "/opt/app/solver.py"

    def test_no_solve_sh_returns_empty(self, tmp_path):
        """Without a solve.sh the result is the empty string."""
        evaluator = _make_evaluator(str(tmp_path))
        assert evaluator._extract_path_from_solve_sh() == ""

    def test_no_redirect_returns_empty(self, tmp_path):
        """A script with no redirect gives the empty string."""
        evaluator = self._write_solve_sh(tmp_path, "#!/bin/bash\necho hello\n")
        assert evaluator._extract_path_from_solve_sh() == ""


# ------------------------------------------------------------------
# Solution path extraction: instruction.md (tier 2)
# ------------------------------------------------------------------


class TestExtractPathFromInstruction:
    """Checks for ``HarborEvaluator._extract_path_from_instruction`` (tier 2)."""

    @staticmethod
    def _extracted_path(task_dir, instruction=None):
        """Optionally write ``instruction.md`` in *task_dir*, then return the extracted path."""
        if instruction is not None:
            (task_dir / "instruction.md").write_text(instruction)
        return _make_evaluator(str(task_dir))._extract_path_from_instruction()

    def test_backtick_path(self, tmp_path):
        """A path wrapped in backticks is extracted."""
        found = self._extracted_path(tmp_path, "Write your solution in `/app/solver.py`.\n")
        assert found == "/app/solver.py"

    def test_quoted_path(self, tmp_path):
        """A path wrapped in double quotes is extracted."""
        found = self._extracted_path(tmp_path, 'Save your code to "/workspace/solve.py".\n')
        assert found == "/workspace/solve.py"

    def test_preposition_path(self, tmp_path):
        """An undelimited path following "at" is extracted."""
        found = self._extracted_path(
            tmp_path, "Place your solution at /opt/solution.py and run it.\n"
        )
        assert found == "/opt/solution.py"

    def test_no_path_returns_empty(self, tmp_path):
        """Instruction text that contains no path gives the empty string."""
        found = self._extracted_path(tmp_path, "Solve this problem efficiently.\n")
        assert found == ""

    def test_no_file_returns_empty(self, tmp_path):
        """A missing instruction.md gives the empty string."""
        assert self._extracted_path(tmp_path) == ""


# ------------------------------------------------------------------
# Full solution path extraction (tier priority)
# ------------------------------------------------------------------


class TestExtractSolutionPath:
    """Checks the source priority of ``HarborEvaluator._extract_solution_path``."""

    def test_prefers_solve_sh_over_instruction(self, tmp_path):
        """When solve.sh and instruction.md both name a path, the solve.sh one is returned."""
        solution_dir = tmp_path / "solution"
        solution_dir.mkdir()
        (solution_dir / "solve.sh").write_text("cat > /from/solve.py << 'EOF'\nEOF\n")
        (tmp_path / "instruction.md").write_text("Write to `/from/instruction.py`.\n")
        evaluator = _make_evaluator(str(tmp_path))
        assert evaluator._extract_solution_path() == "/from/solve.py"

    def test_falls_back_to_instruction(self, tmp_path):
        """With no solve.sh, the path named in instruction.md is returned."""
        (tmp_path / "instruction.md").write_text("Write to `/from/instruction.py`.\n")
        evaluator = _make_evaluator(str(tmp_path))
        assert evaluator._extract_solution_path() == "/from/instruction.py"

    def test_falls_back_to_default(self, tmp_path):
        """With neither file present, ``_DEFAULT_SOLUTION_PATH`` is returned."""
        evaluator = _make_evaluator(str(tmp_path))
        assert evaluator._extract_solution_path() == _DEFAULT_SOLUTION_PATH


# ------------------------------------------------------------------
# _read_reward
# ------------------------------------------------------------------


def _mock_docker_exec(outputs: dict):
    """Return a side_effect for subprocess.run that fakes `docker exec ... cat <path>`.

    Args:
        outputs: mapping from container path to (returncode, stdout) tuples.
    """
    def side_effect(cmd, **kwargs):
        # Detect "docker exec <cid> cat <path>" calls.
        if cmd[:2] == ["docker", "exec"] and "cat" in cmd:
            path = cmd[-1]
            if path in outputs:
                rc, stdout = outputs[path]
                return MagicMock(returncode=rc, stdout=stdout)
        return MagicMock(returncode=1, stdout="")
    return side_effect


class TestReadReward:
    """Checks for ``HarborEvaluator._read_reward``.

    ``subprocess.run`` is patched with ``_mock_docker_exec`` so each case can
    declare what `cat` returns for the reward.json and reward.txt paths.
    """

    _REWARD_JSON = "/logs/verifier/reward.json"
    _REWARD_TXT = "/logs/verifier/reward.txt"

    def _make_inst(self):
        """Return a bare evaluator (no ``__init__``) with a placeholder container id."""
        evaluator = object.__new__(HarborEvaluator)
        evaluator.container_id = "fake_container"
        return evaluator

    def _read_reward_with(self, outputs):
        """Run ``_read_reward`` while ``subprocess.run`` answers from *outputs*."""
        evaluator = self._make_inst()
        with patch("subprocess.run", side_effect=_mock_docker_exec(outputs)):
            return evaluator._read_reward()

    def test_reads_reward_txt(self):
        """With reward.json unreadable, the number in reward.txt becomes the score."""
        outcome = self._read_reward_with({
            self._REWARD_JSON: (1, ""),
            self._REWARD_TXT: (0, "0.75\n"),
        })
        assert outcome.metrics["combined_score"] == 0.75

    def test_reads_reward_json_with_reward_key(self):
        """The ``reward`` key feeds ``combined_score``; ``time_ms`` is reported as a metric."""
        body = json.dumps({"reward": 0.9, "time_ms": 123})
        outcome = self._read_reward_with({self._REWARD_JSON: (0, body)})
        assert outcome.metrics["combined_score"] == 0.9
        assert outcome.metrics["time_ms"] == 123.0

    def test_reads_reward_json_with_score_key(self):
        """A ``score`` key is accepted as the combined score."""
        body = json.dumps({"score": 0.5})
        outcome = self._read_reward_with({self._REWARD_JSON: (0, body)})
        assert outcome.metrics["combined_score"] == 0.5

    def test_json_preferred_over_txt(self):
        """When both files are readable, the reward.json value is used."""
        body = json.dumps({"reward": 0.9})
        outcome = self._read_reward_with({
            self._REWARD_JSON: (0, body),
            self._REWARD_TXT: (0, "0.1\n"),
        })
        assert outcome.metrics["combined_score"] == 0.9

    def test_missing_reward_key_defaults_to_zero(self):
        """JSON without a reward or score key gives a combined score of 0.0."""
        body = json.dumps({"time_ms": 500})
        outcome = self._read_reward_with({self._REWARD_JSON: (0, body)})
        assert outcome.metrics["combined_score"] == 0.0

    def test_no_reward_files_returns_zero(self):
        """If neither file can be read, the score is 0.0 and an error artifact is set."""
        outcome = self._read_reward_with({
            self._REWARD_JSON: (1, ""),
            self._REWARD_TXT: (1, ""),
        })
        assert outcome.metrics["combined_score"] == 0.0
        assert "error" in outcome.artifacts

    def test_malformed_json_falls_back_to_txt(self):
        """Unparseable reward.json content falls back to the reward.txt value."""
        outcome = self._read_reward_with({
            self._REWARD_JSON: (0, "{bad json"),
            self._REWARD_TXT: (0, "0.42\n"),
        })
        assert outcome.metrics["combined_score"] == 0.42

    def test_non_numeric_txt_falls_through(self):
        """Non-numeric reward.txt content gives 0.0 plus an error artifact."""
        outcome = self._read_reward_with({
            self._REWARD_JSON: (1, ""),
            self._REWARD_TXT: (0, "not a number"),
        })
        assert outcome.metrics["combined_score"] == 0.0
        assert "error" in outcome.artifacts


# ------------------------------------------------------------------
# Harbor task detection
# ------------------------------------------------------------------


def _make_harbor_dir(tmp_path):
    """Create a minimal valid Harbor task directory."""
    (tmp_path / "instruction.md").write_text("problem")
    (tmp_path / "tests").mkdir()
    (tmp_path / "tests" / "test.sh").write_text("#!/bin/bash\n")
    (tmp_path / "environment").mkdir()
    (tmp_path / "environment" / "Dockerfile").write_text("FROM python:3.11\n")
    return str(tmp_path)


class TestHarborTaskDetection:
    """Checks for ``_is_harbor_task``.

    Every negative case starts from the complete layout produced by
    ``_make_harbor_dir`` and then removes exactly one required piece, so each
    test isolates the single requirement named in its title.
    """

    def test_valid_harbor_task(self, tmp_path):
        """The complete layout is detected as a Harbor task."""
        assert _is_harbor_task(_make_harbor_dir(tmp_path)) is True

    def test_missing_instruction_md(self, tmp_path):
        """Removing only instruction.md makes detection fail."""
        task_dir = _make_harbor_dir(tmp_path)
        # tests/test.sh stays in place so instruction.md is the only thing missing.
        (tmp_path / "instruction.md").unlink()
        assert _is_harbor_task(task_dir) is False

    def test_missing_tests_dir(self, tmp_path):
        """Removing the whole tests/ directory makes detection fail."""
        task_dir = _make_harbor_dir(tmp_path)
        # rmdir() needs an empty directory, so drop its only file first.
        (tmp_path / "tests" / "test.sh").unlink()
        (tmp_path / "tests").rmdir()
        assert _is_harbor_task(task_dir) is False

    def test_missing_test_sh(self, tmp_path):
        """An existing tests/ directory without test.sh makes detection fail."""
        task_dir = _make_harbor_dir(tmp_path)
        (tmp_path / "tests" / "test.sh").unlink()
        assert _is_harbor_task(task_dir) is False

    def test_missing_dockerfile(self, tmp_path):
        """An existing environment/ directory without a Dockerfile makes detection fail."""
        task_dir = _make_harbor_dir(tmp_path)
        (tmp_path / "environment" / "Dockerfile").unlink()
        assert _is_harbor_task(task_dir) is False

    def test_not_a_directory(self, tmp_path):
        """A path that points at a regular file is not a Harbor task."""
        regular_file = tmp_path / "not_a_dir"
        regular_file.write_text("hi")
        assert _is_harbor_task(str(regular_file)) is False


class TestContainerizedDetection:
    """Checks for ``_is_containerized``: a root Dockerfile plus evaluate.sh."""

    @staticmethod
    def _populate(root, *, dockerfile=False, evaluate_sh=False):
        """Create the requested marker files in *root* and return it as a string."""
        if dockerfile:
            (root / "Dockerfile").write_text("FROM python:3.11\n")
        if evaluate_sh:
            (root / "evaluate.sh").write_text("#!/bin/bash\n")
        return str(root)

    def test_valid_containerized(self, tmp_path):
        """Both marker files present: detected."""
        task_dir = self._populate(tmp_path, dockerfile=True, evaluate_sh=True)
        assert _is_containerized(task_dir) is True

    def test_missing_evaluate_sh(self, tmp_path):
        """Only the Dockerfile present: not detected."""
        task_dir = self._populate(tmp_path, dockerfile=True)
        assert _is_containerized(task_dir) is False

    def test_missing_dockerfile(self, tmp_path):
        """Only evaluate.sh present: not detected."""
        task_dir = self._populate(tmp_path, evaluate_sh=True)
        assert _is_containerized(task_dir) is False


class TestDetectionPriority:
    """A dir that matches both Harbor and containerized should be detected as Harbor.

    Only the two detection functions are exercised here; the test confirms
    that both of them match the same directory.
    """

    def test_harbor_wins_over_containerized(self, tmp_path):
        """Both detectors report True for a directory carrying both layouts."""
        # Harbor structure first, then the containerized markers at the root.
        task_dir = _make_harbor_dir(tmp_path)
        root_markers = (
            ("Dockerfile", "FROM python:3.11\n"),
            ("evaluate.sh", "#!/bin/bash\n"),
        )
        for filename, body in root_markers:
            (tmp_path / filename).write_text(body)

        # create_evaluator checks harbor first — verify the detection functions
        # agree that both match, confirming the ordering in create_evaluator matters.
        assert _is_harbor_task(task_dir) is True
        assert _is_containerized(task_dir) is True