# File size: 6,202 Bytes

# af83196

"""Smoke tests for the end-to-end discovery pipeline and unit guards for recent bug fixes."""

import os
import textwrap
from typing import Any, Dict, List
from unittest.mock import patch

import pytest

from skydiscover.api import DiscoveryResult, run_discovery
from skydiscover.config import Config, LLMModelConfig
from skydiscover.evaluation.evaluator import Evaluator, EvaluatorConfig
from skydiscover.llm.base import LLMResponse
from skydiscover.llm.llm_pool import LLMPool

# ---------------------------------------------------------------------------
# Inline evaluator source — scores programs with `def solve` higher
# ---------------------------------------------------------------------------
EVALUATOR_SOURCE = textwrap.dedent("""\
    import ast

    def evaluate(program_path: str) -> dict:
        with open(program_path, "r") as f:
            source = f.read()

        score = 0.1  # baseline for any non-empty program
        try:
            tree = ast.parse(source)
            # reward programs that define a `solve` function
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef) and node.name == "solve":
                    score = 0.8
                    break
        except SyntaxError:
            score = 0.0

        return {"combined_score": score}
    """)

# ---------------------------------------------------------------------------
# Inline seed program — intentionally does NOT define `solve` so it scores low
# ---------------------------------------------------------------------------
SEED_SOURCE = textwrap.dedent("""\
    def hello():
        return "hi"
    """)

# ---------------------------------------------------------------------------
# Mock LLM response — a full-rewrite code block containing `def solve`
# ---------------------------------------------------------------------------
MOCK_LLM_CODE = textwrap.dedent("""\
    def solve(x):
        return x ** 2 + 1
    """)

MOCK_RESPONSE_TEXT = f"```python\n{MOCK_LLM_CODE}```"


# ---------------------------------------------------------------------------
# FakeLLMPool — replaces the real LLMPool so no OpenAI client is created
# ---------------------------------------------------------------------------
class FakeLLMPool:
    """Stand-in for LLMPool whose every reply is the canned MOCK_RESPONSE_TEXT.

    The constructor only records the model configs it is given; it builds no
    client objects, so tests using it never talk to a real model endpoint.
    """

    def __init__(self, models_cfg: List[LLMModelConfig]):
        # Keep the configs around for inspection; nothing else is set up.
        self.models_cfg = models_cfg

    async def generate(
        self, system_message: str, messages: List[Dict[str, Any]], **kwargs
    ) -> LLMResponse:
        """Return the canned response; the prompt and keyword options are ignored."""
        canned = LLMResponse(text=MOCK_RESPONSE_TEXT)
        return canned

    async def generate_all(
        self, system_message: str, messages: List[Dict[str, Any]], **kwargs
    ) -> List[LLMResponse]:
        """Return a one-element list holding the same canned response as `generate`."""
        only_response = await self.generate(system_message, messages, **kwargs)
        return [only_response]


# ===========================================================================
# Smoke test: end-to-end pipeline with mocked LLM
# ===========================================================================
class TestSmokePipeline:
    """End-to-end run of `run_discovery` with the controller's LLMPool patched out."""

    def test_run_discovery_returns_result(self, tmp_path):
        """A 2-iteration run yields a DiscoveryResult whose best program defines `solve`."""

        # Materialise the inline evaluator and seed program under tmp_path.
        evaluator_path = tmp_path / "evaluator.py"
        seed_path = tmp_path / "seed.py"
        evaluator_path.write_text(EVALUATOR_SOURCE)
        seed_path.write_text(SEED_SOURCE)

        output_dir = str(tmp_path / "output")

        # Placeholder model entry; FakeLLMPool only stores it.
        fake_model = {"name": "fake-model", "api_key": "fake", "api_base": "http://localhost:1"}
        settings = {
            "max_iterations": 2,
            "diff_based_generation": False,
            "monitor": {"enabled": False},
            "search": {"type": "topk"},
            "evaluator": {"evaluation_file": str(evaluator_path)},
            "llm": {
                "models": [fake_model],
            },
        }
        config = Config.from_dict(settings)

        # Swap the LLMPool name looked up by the default discovery controller
        # for the fake, so every generation returns the canned `solve` program.
        pool_target = "skydiscover.search.default_discovery_controller.LLMPool"
        with patch(pool_target, FakeLLMPool):
            result = run_discovery(
                evaluator=str(evaluator_path),
                initial_program=str(seed_path),
                config=config,
                output_dir=output_dir,
                cleanup=False,
            )

        # The mock response defines `solve`, which the inline evaluator scores 0.8.
        assert isinstance(result, DiscoveryResult)
        assert result.best_score >= 0.8
        assert "def solve" in result.best_solution
        assert os.path.isdir(output_dir)


# ===========================================================================
# Unit guards for recent bug fixes
# ===========================================================================
class TestBugFixGuards:
    """Regression guards: LLMPool weight validation and Evaluator module naming."""

    def test_llm_pool_raises_on_zero_weights(self):
        """Constructing LLMPool from models whose weights are all 0.0 raises ValueError."""
        all_zero = (0.0, 0.0)
        cfgs = [
            LLMModelConfig(name=f"m{idx}", weight=weight, api_key="k", api_base="http://x")
            for idx, weight in enumerate(all_zero, start=1)
        ]
        with pytest.raises(ValueError, match="weights"):
            LLMPool(cfgs)

    def test_llm_pool_raises_on_negative_weight(self):
        """Constructing LLMPool with one negative weight among the models raises ValueError."""
        one_negative = (-1.0, 2.0)
        cfgs = [
            LLMModelConfig(name=f"m{idx}", weight=weight, api_key="k", api_base="http://x")
            for idx, weight in enumerate(one_negative, start=1)
        ]
        with pytest.raises(ValueError, match="weights"):
            LLMPool(cfgs)

    def test_evaluator_unique_module_names(self, tmp_path):
        """Evaluators built from one shared config must not share a `_module_name`."""
        eval_path = tmp_path / "eval.py"
        eval_path.write_text("def evaluate(program_path):\n    return {'combined_score': 1.0}\n")

        shared_cfg = EvaluatorConfig(evaluation_file=str(eval_path))
        first = Evaluator(config=shared_cfg)
        second = Evaluator(config=shared_cfg)

        assert first._module_name != second._module_name