"""vLLM-backed proposer using Qwen-2.5-Coder-Instruct. Lazy-imports vLLM so this module stays importable on hosts without a GPU (laptops, CI). Instantiation requires vLLM + CUDA — in practice that means inside the CHTC container. """ from __future__ import annotations from typing import Any, cast from ..utils.code_extraction import extract_python_code DEFAULT_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct" DEFAULT_SYSTEM_PROMPT = ( "You are an expert Python programmer. Respond with a single Python code " "block containing the requested function and nothing else." ) class Proposer: """Generates candidate Python solutions for coding prompts via vLLM. One GPU, one in-process model. Construct once per job and call `generate` with whole batches — vLLM batches internally for throughput. """ def __init__( self, model_id: str = DEFAULT_MODEL, *, max_model_len: int = 4096, gpu_memory_utilization: float = 0.85, dtype: str = "bfloat16", trust_remote_code: bool = True, system_prompt: str = DEFAULT_SYSTEM_PROMPT, ) -> None: # Lazy import: keeps this module importable where vLLM is not installed. from vllm import LLM # type: ignore[import-not-found] # Cast through Any: vLLM has no type stubs, so the LLM constructor's # return type is Unknown to pyright; cast erases that. self._llm = cast( "Any", LLM( model=model_id, max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, dtype=dtype, trust_remote_code=trust_remote_code, ), ) # self._llm is already Any after the cast above, so this is inferred Any. self._tokenizer = self._llm.get_tokenizer() self._system_prompt = system_prompt self._model_id = model_id @property def model_id(self) -> str: return self._model_id def format_prompt(self, problem: str) -> str: """Wrap a raw problem prompt with the model's chat template. Without `add_generation_prompt=True` Qwen will try to continue the user turn instead of starting an assistant response. """ messages = [ {"role": "system", "content": self._system_prompt}, {"role": "user", "content": problem}, ] rendered = self._tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, ) return str(rendered) def generate( self, prompts: list[str], *, n: int = 1, temperature: float = 0.2, top_p: float = 0.95, max_tokens: int = 1024, ) -> list[list[str]]: """Return `n` candidate Python solutions per prompt. Outer list has length `len(prompts)`; inner lists have length `n` and contain post-extracted Python source (no markdown fences, no surrounding commentary). """ from vllm import SamplingParams # type: ignore[import-not-found] formatted = [self.format_prompt(p) for p in prompts] params = cast( "Any", SamplingParams( n=n, temperature=temperature, top_p=top_p, max_tokens=max_tokens, ), ) outputs = self._llm.generate(formatted, params) results: list[list[str]] = [] for out in outputs: candidates = [extract_python_code(c.text) for c in out.outputs] results.append(candidates) return results def close(self) -> None: """Part of the Agent protocol; vLLM weights are freed by GC.""" pass