"""vLLM-backed proposer using Qwen-2.5-Coder-Instruct.

Lazy-imports vLLM so this module stays importable on hosts without a GPU
(laptops, CI). Instantiation requires vLLM + CUDA — in practice that means
inside the CHTC container.
"""

from __future__ import annotations

from typing import Any, cast

from ..utils.code_extraction import extract_python_code

# Model identifier used when the caller does not pass `model_id`.
DEFAULT_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# System message used when the caller does not pass `system_prompt`. It asks
# for exactly one Python code block so the reply is easy to post-process.
DEFAULT_SYSTEM_PROMPT = (
    "You are an expert Python programmer. "
    "Respond with a single Python code block containing the requested "
    "function and nothing else."
)


class Proposer:
    """Candidate-solution generator for coding prompts, served by vLLM.

    Holds a single in-process vLLM engine (one GPU). Build one instance per
    job and hand entire batches to `generate`; vLLM does its own internal
    batching for throughput.
    """

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL,
        *,
        max_model_len: int = 4096,
        gpu_memory_utilization: float = 0.85,
        dtype: str = "bfloat16",
        trust_remote_code: bool = True,
        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    ) -> None:
        """Create the vLLM engine and fetch its tokenizer.

        Args:
            model_id: Passed to the vLLM `LLM` constructor as `model`; also
                exposed through the `model_id` property.
            max_model_len: Forwarded unchanged to the `LLM` constructor.
            gpu_memory_utilization: Forwarded unchanged to the `LLM`
                constructor.
            dtype: Forwarded unchanged to the `LLM` constructor.
            trust_remote_code: Forwarded unchanged to the `LLM` constructor.
            system_prompt: System message that `format_prompt` places ahead
                of every problem.
        """
        # Imported here rather than at module scope so that merely importing
        # this module never requires vLLM to be installed.
        from vllm import LLM  # type: ignore[import-not-found]

        engine_options: dict[str, Any] = {
            "model": model_id,
            "max_model_len": max_model_len,
            "gpu_memory_utilization": gpu_memory_utilization,
            "dtype": dtype,
            "trust_remote_code": trust_remote_code,
        }
        # vLLM ships no type stubs, so pyright sees the constructed engine as
        # Unknown; casting to Any erases that.
        self._llm = cast("Any", LLM(**engine_options))
        # The engine is Any after the cast, so the tokenizer is inferred Any.
        self._tokenizer = self._llm.get_tokenizer()
        self._model_id = model_id
        self._system_prompt = system_prompt

    @property
    def model_id(self) -> str:
        """Model identifier this proposer was constructed with."""
        return self._model_id

    def format_prompt(self, problem: str) -> str:
        """Render `problem` through the model's chat template.

        The conversation is the configured system prompt followed by
        `problem` as the user turn. `add_generation_prompt=True` matters:
        without it Qwen will try to continue the user turn rather than begin
        an assistant response.
        """
        system_turn = {"role": "system", "content": self._system_prompt}
        user_turn = {"role": "user", "content": problem}
        return str(
            self._tokenizer.apply_chat_template(
                [system_turn, user_turn],
                tokenize=False,
                add_generation_prompt=True,
            )
        )

    def generate(
        self,
        prompts: list[str],
        *,
        n: int = 1,
        temperature: float = 0.2,
        top_p: float = 0.95,
        max_tokens: int = 1024,
    ) -> list[list[str]]:
        """Produce candidate Python solutions for each prompt.

        Every prompt is wrapped with `format_prompt`, the whole batch is sent
        to vLLM in one call, and each completion's text is passed through
        `extract_python_code`.

        Args:
            prompts: Raw problem statements.
            n: Forwarded to `SamplingParams` as `n`.
            temperature: Forwarded to `SamplingParams`.
            top_p: Forwarded to `SamplingParams`.
            max_tokens: Forwarded to `SamplingParams`.

        Returns:
            One inner list per vLLM request output, each holding the
            extracted code of that request's completions. Intended shape:
            `len(prompts)` outer entries with `n` candidates apiece.
        """
        # Lazy import, for the same reason as in `__init__`.
        from vllm import SamplingParams  # type: ignore[import-not-found]

        chat_prompts = [self.format_prompt(problem) for problem in prompts]
        # Cast to Any for the same no-type-stubs reason as the engine.
        sampling = cast(
            "Any",
            SamplingParams(
                n=n,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
            ),
        )
        request_outputs = self._llm.generate(chat_prompts, sampling)

        return [
            [extract_python_code(completion.text) for completion in request.outputs]
            for request in request_outputs
        ]

    def close(self) -> None:
        """No-op kept for the Agent protocol; vLLM weights are freed by GC."""
        return None