dmaheshwar22's picture
deploy: replace template with real demo
0dd7c80 verified
"""vLLM-backed proposer using Qwen-2.5-Coder-Instruct.
Lazy-imports vLLM so this module stays importable on hosts without a GPU
(laptops, CI). Instantiation requires vLLM + CUDA — in practice that means
inside the CHTC container.
"""
from __future__ import annotations
from typing import Any, cast
from ..utils.code_extraction import extract_python_code
# Model identifier used when the caller does not choose one.
DEFAULT_MODEL: str = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# System turn placed ahead of every problem statement; it asks for a single
# code block so the reply contains nothing but the requested function.
DEFAULT_SYSTEM_PROMPT: str = (
    "You are an expert Python programmer. "
    "Respond with a single Python code block "
    "containing the requested function and nothing else."
)
class Proposer:
    """Generate candidate Python solutions for coding prompts via vLLM.

    One GPU, one in-process model. Construct once per job and hand whole
    batches to `generate` — vLLM batches internally for throughput.
    """

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL,
        *,
        max_model_len: int = 4096,
        gpu_memory_utilization: float = 0.85,
        dtype: str = "bfloat16",
        trust_remote_code: bool = True,
        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    ) -> None:
        """Load the model into an in-process vLLM engine.

        Args:
            model_id: Forwarded to ``vllm.LLM`` as ``model``; also exposed
                through the `model_id` property.
            max_model_len: Forwarded unchanged to ``vllm.LLM``.
            gpu_memory_utilization: Forwarded unchanged to ``vllm.LLM``.
            dtype: Forwarded unchanged to ``vllm.LLM``.
            trust_remote_code: Forwarded unchanged to ``vllm.LLM``.
                NOTE(review): defaults to True, which presumably lets code
                shipped with the model repository run at load time —
                confirm that is acceptable for every model id used.
            system_prompt: Content of the system message that
                `format_prompt` puts in front of each problem.

        Raises:
            ImportError: If vLLM is not installed on this host.
        """
        # Lazy import: keeps this module importable where vLLM is not installed.
        from vllm import LLM  # type: ignore[import-not-found]

        # Cast through Any: vLLM has no type stubs, so the LLM constructor's
        # return type is Unknown to pyright; the cast erases that.
        self._llm = cast(
            "Any",
            LLM(
                model=model_id,
                max_model_len=max_model_len,
                gpu_memory_utilization=gpu_memory_utilization,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            ),
        )
        # self._llm is already Any after the cast, so this is inferred as Any.
        self._tokenizer = self._llm.get_tokenizer()
        self._system_prompt = system_prompt
        self._model_id = model_id

    @property
    def model_id(self) -> str:
        """Model identifier this proposer was constructed with."""
        return self._model_id

    def format_prompt(self, problem: str) -> str:
        """Wrap a raw problem prompt with the model's chat template.

        The conversation is the configured system prompt followed by
        `problem` as the user turn. Without `add_generation_prompt=True`
        Qwen will try to continue the user turn instead of starting an
        assistant response.

        Args:
            problem: Raw problem statement to use as the user message.

        Returns:
            The rendered (untokenized) prompt text.
        """
        messages = [
            {"role": "system", "content": self._system_prompt},
            {"role": "user", "content": problem},
        ]
        rendered = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The tokenizer is typed Any; coerce so the declared return type holds.
        return str(rendered)

    def generate(
        self,
        prompts: list[str],
        *,
        n: int = 1,
        temperature: float = 0.2,
        top_p: float = 0.95,
        max_tokens: int = 1024,
    ) -> list[list[str]]:
        """Return `n` candidate Python solutions per prompt.

        Args:
            prompts: Raw problem statements; each one is passed through
                `format_prompt` before being sent to the engine.
            n: Forwarded to ``SamplingParams`` as ``n``.
            temperature: Forwarded to ``SamplingParams``.
            top_p: Forwarded to ``SamplingParams``.
            max_tokens: Forwarded to ``SamplingParams``.

        Returns:
            Outer list has length `len(prompts)`; inner lists have length
            `n` and contain post-extracted Python source (no markdown
            fences, no surrounding commentary). An empty `prompts` yields
            an empty list.
        """
        # Nothing to sample: skip the lazy import and the engine round-trip.
        if not prompts:
            return []

        from vllm import SamplingParams  # type: ignore[import-not-found]

        rendered_prompts = [self.format_prompt(problem) for problem in prompts]
        # Cast through Any for the same reason as the LLM constructor: vLLM
        # ships no type stubs.
        sampling = cast(
            "Any",
            SamplingParams(
                n=n,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
            ),
        )
        request_outputs = self._llm.generate(rendered_prompts, sampling)
        # One inner list per request; each completion's text is reduced to
        # bare Python source by extract_python_code.
        return [
            [extract_python_code(completion.text) for completion in request.outputs]
            for request in request_outputs
        ]

    def close(self) -> None:
        """Do nothing; present to satisfy the Agent protocol.

        No explicit teardown is performed here — vLLM weights are left for
        the garbage collector to free.
        """