Spaces:
Sleeping
Sleeping
| """vLLM-backed proposer using Qwen-2.5-Coder-Instruct. | |
| Lazy-imports vLLM so this module stays importable on hosts without a GPU | |
| (laptops, CI). Instantiation requires vLLM + CUDA — in practice that means | |
| inside the CHTC container. | |
| """ | |
| from __future__ import annotations | |
| from typing import Any, cast | |
| from ..utils.code_extraction import extract_python_code | |
# Model identifier used when the caller does not pass `model_id` explicitly.
DEFAULT_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# System turn prepended to every problem; it asks for one bare code block so
# the completion can be reduced to plain Python source afterwards.
DEFAULT_SYSTEM_PROMPT = (
    "You are an expert Python programmer. "
    "Respond with a single Python code block "
    "containing the requested function and nothing else."
)
class Proposer:
    """Generate candidate Python solutions for coding prompts via vLLM.

    One GPU, one in-process model. Construct once per job and call
    `generate` with whole batches — vLLM batches internally for throughput.
    """

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL,
        *,
        max_model_len: int = 4096,
        gpu_memory_utilization: float = 0.85,
        dtype: str = "bfloat16",
        trust_remote_code: bool = True,
        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    ) -> None:
        """Build the in-process vLLM engine and fetch its tokenizer.

        Args:
            model_id: Passed to the vLLM ``LLM`` constructor as ``model``.
            max_model_len: Passed through to ``LLM`` unchanged.
            gpu_memory_utilization: Passed through to ``LLM`` unchanged.
            dtype: Passed through to ``LLM`` unchanged.
            trust_remote_code: Passed through to ``LLM`` unchanged.
            system_prompt: Content of the system message that
                `format_prompt` places before every problem.

        Raises:
            ImportError: If vLLM is not installed on this host.
        """
        # Lazy import: keeps this module importable where vLLM is not installed.
        from vllm import LLM  # type: ignore[import-not-found]

        # NOTE(review): trust_remote_code defaults to True, which presumably
        # allows code shipped in the model repository to run at load time.
        # Confirm the default model actually needs it before relying on it.
        engine = LLM(
            model=model_id,
            max_model_len=max_model_len,
            gpu_memory_utilization=gpu_memory_utilization,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
        # Cast through Any: vLLM has no type stubs, so the LLM constructor's
        # return type is Unknown to pyright; cast erases that.
        self._llm = cast("Any", engine)
        # self._llm is already Any after the cast above, so this is inferred Any.
        self._tokenizer = self._llm.get_tokenizer()
        self._system_prompt = system_prompt
        self._model_id = model_id

    def model_id(self) -> str:
        """Return the model identifier this proposer was constructed with."""
        return self._model_id

    def format_prompt(self, problem: str) -> str:
        """Wrap a raw problem prompt with the model's chat template.

        Without `add_generation_prompt=True` Qwen will try to continue the
        user turn instead of starting an assistant response.

        Args:
            problem: Raw problem text; becomes the user message.

        Returns:
            The rendered (untokenized) chat prompt as a string.
        """
        messages = [
            {"role": "system", "content": self._system_prompt},
            {"role": "user", "content": problem},
        ]
        rendered = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The tokenizer is untyped (Any); coerce so the declared return
        # type holds.
        return str(rendered)

    def generate(
        self,
        prompts: list[str],
        *,
        n: int = 1,
        temperature: float = 0.2,
        top_p: float = 0.95,
        max_tokens: int = 1024,
    ) -> list[list[str]]:
        """Return `n` candidate Python solutions per prompt.

        Args:
            prompts: Raw problem texts; each is passed through
                `format_prompt` before being sent to the engine.
            n: Passed to ``SamplingParams`` (completions per prompt).
            temperature: Passed to ``SamplingParams`` unchanged.
            top_p: Passed to ``SamplingParams`` unchanged.
            max_tokens: Passed to ``SamplingParams`` unchanged.

        Returns:
            One inner list per engine output; each inner list holds every
            completion's text after `extract_python_code` has been applied.
            The intended shape is ``len(prompts)`` outer entries of ``n``
            candidates each. An empty `prompts` yields ``[]`` without
            touching the engine.
        """
        # Nothing to sample for an empty batch: skip the vLLM import,
        # parameter construction and engine call entirely.
        if not prompts:
            return []

        from vllm import SamplingParams  # type: ignore[import-not-found]

        formatted = [self.format_prompt(p) for p in prompts]
        # Cast through Any for the same reason as the engine: no type stubs.
        params = cast(
            "Any",
            SamplingParams(
                n=n,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
            ),
        )
        outputs = self._llm.generate(formatted, params)
        return [
            [extract_python_code(c.text) for c in out.outputs]
            for out in outputs
        ]

    def close(self) -> None:
        """Part of the Agent protocol; vLLM weights are freed by GC."""