Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

App Files Files Community

verifiable-rl-coder / src /verifiable_rl_coder /agents /proposer.py

dmaheshwar22

deploy: replace template with real demo

0dd7c80 verified about 1 month ago

raw

history blame contribute delete

3.82 kB

	"""vLLM-backed proposer using Qwen-2.5-Coder-Instruct.

	Lazy-imports vLLM so this module stays importable on hosts without a GPU
	(laptops, CI). Instantiation requires vLLM + CUDA — in practice that means
	inside the CHTC container.
	"""

	from __future__ import annotations

	from typing import Any, cast

	from ..utils.code_extraction import extract_python_code

	# Hugging Face model id loaded when the caller does not pass `model_id`.
	DEFAULT_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

	# System turn placed ahead of every problem; asks the model to reply with
	# exactly one Python code block.
	DEFAULT_SYSTEM_PROMPT = (
	    "You are an expert Python programmer. Respond with a single Python code "
	    "block containing the requested function and nothing else."
	)


	class Proposer:
	    """Generate candidate Python solutions for coding prompts via vLLM.

	    One GPU, one in-process model. Construct once per job and call
	    `generate` with whole batches — vLLM batches internally for throughput.
	    Call `close` when finished so the engine and tokenizer references are
	    dropped; a closed instance cannot be used again.
	    """

	    def __init__(
	        self,
	        model_id: str = DEFAULT_MODEL,
	        *,
	        max_model_len: int = 4096,
	        gpu_memory_utilization: float = 0.85,
	        dtype: str = "bfloat16",
	        trust_remote_code: bool = True,
	        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
	    ) -> None:
	        """Load the model into an in-process vLLM engine.

	        Args:
	            model_id: Model identifier handed to vLLM as `model`.
	            max_model_len: Forwarded to vLLM's `max_model_len`.
	            gpu_memory_utilization: Forwarded to vLLM's
	                `gpu_memory_utilization`.
	            dtype: Forwarded to vLLM's `dtype`.
	            trust_remote_code: Forwarded to vLLM's `trust_remote_code`.
	                NOTE(review): the default of True allows code shipped in the
	                model repository to run at load time; pass False for
	                checkpoints that do not need it.
	            system_prompt: System message placed before every problem.

	        Raises:
	            ImportError: If vLLM is not installed on this host.
	        """
	        # Lazy import: keeps this module importable where vLLM is not installed.
	        from vllm import LLM  # type: ignore[import-not-found]

	        # Cast through Any: vLLM has no type stubs, so the LLM constructor's
	        # return type is Unknown to pyright; cast erases that.
	        self._llm = cast(
	            "Any",
	            LLM(
	                model=model_id,
	                max_model_len=max_model_len,
	                gpu_memory_utilization=gpu_memory_utilization,
	                dtype=dtype,
	                trust_remote_code=trust_remote_code,
	            ),
	        )
	        # self._llm is already Any after the cast above, so this is inferred Any.
	        self._tokenizer = self._llm.get_tokenizer()
	        self._system_prompt = system_prompt
	        self._model_id = model_id

	    @property
	    def model_id(self) -> str:
	        """Model identifier this proposer was constructed with."""
	        return self._model_id

	    def _ensure_open(self) -> None:
	        """Raise a clear error if `close` has already released the engine."""
	        if self._llm is None or self._tokenizer is None:
	            raise RuntimeError(
	                "Proposer is closed; construct a new instance to generate again."
	            )

	    def format_prompt(self, problem: str) -> str:
	        """Wrap a raw problem prompt with the model's chat template.

	        Without `add_generation_prompt=True` Qwen will try to continue the
	        user turn instead of starting an assistant response.

	        Args:
	            problem: Raw problem text, used as the user turn.

	        Returns:
	            The rendered chat prompt as a string (not token ids).

	        Raises:
	            RuntimeError: If the proposer has been closed.
	        """
	        self._ensure_open()
	        messages = [
	            {"role": "system", "content": self._system_prompt},
	            {"role": "user", "content": problem},
	        ]
	        rendered = self._tokenizer.apply_chat_template(
	            messages,
	            tokenize=False,
	            add_generation_prompt=True,
	        )
	        # The tokenizer is untyped (Any); coerce so the declared return holds.
	        return str(rendered)

	    def generate(
	        self,
	        prompts: list[str],
	        *,
	        n: int = 1,
	        temperature: float = 0.2,
	        top_p: float = 0.95,
	        max_tokens: int = 1024,
	    ) -> list[list[str]]:
	        """Return `n` candidate Python solutions per prompt.

	        Outer list has length `len(prompts)`; inner lists have length `n`
	        and contain post-extracted Python source (no markdown fences, no
	        surrounding commentary).

	        Args:
	            prompts: Raw problem statements; each is chat-formatted first.
	            n: Number of samples requested per prompt.
	            temperature: Sampling temperature forwarded to vLLM.
	            top_p: Nucleus-sampling threshold forwarded to vLLM.
	            max_tokens: Per-sample generation cap forwarded to vLLM.

	        Returns:
	            One list of extracted code strings per prompt; `[]` when
	            `prompts` is empty.

	        Raises:
	            RuntimeError: If the proposer has been closed.
	        """
	        # An empty batch needs neither the vLLM import nor an engine call.
	        if not prompts:
	            return []
	        self._ensure_open()

	        from vllm import SamplingParams  # type: ignore[import-not-found]

	        formatted = [self.format_prompt(p) for p in prompts]
	        params = cast(
	            "Any",
	            SamplingParams(
	                n=n,
	                temperature=temperature,
	                top_p=top_p,
	                max_tokens=max_tokens,
	            ),
	        )
	        outputs = self._llm.generate(formatted, params)

	        # One inner list per request output, one entry per sampled completion.
	        return [
	            [extract_python_code(c.text) for c in out.outputs]
	            for out in outputs
	        ]

	    def close(self) -> None:
	        """Part of the Agent protocol; release the engine and tokenizer.

	        The weights can only be garbage-collected once this object stops
	        referencing the engine, so the references are dropped here and a
	        collection is triggered to break any reference cycles. Safe to call
	        more than once.
	        """
	        import gc

	        self._llm = None
	        self._tokenizer = None
	        gc.collect()