Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

App Files Files Community

gdscript-assistant / generate.py

vivekchakraverty

Restore max_new_tokens to 512 (4-bit gen is fast: ~25 tok/s on GPU)

6246295 1 day ago

raw

history blame contribute delete

4.35 kB

	"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.

	Canonical ZeroGPU pattern: the main process stays light (no model in it) and
	the model is loaded on the GPU inside the ``@spaces.GPU`` function, where
	the GPU actually exists. ``device_map={"": 0}`` (accelerate) puts every shard on
	the allocated GPU. An ``lru_cache`` keeps it resident for the life of each GPU
	worker.

	Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
	and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
	into the main process makes that fork heavy and tangled with gradio's asyncio
	loop, and the GPU task never runs (it just times out -> "GPU task aborted").
	Keeping the main process model-free is what makes the GPU call actually execute.

	Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
	the model download.
	"""
	from __future__ import annotations

	import os
	from functools import lru_cache

	# Model id handed to ``from_pretrained``; the GDRAG_LLM environment variable
	# overrides the default.
	MODEL_ID = os.getenv("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
	# Stub mode is on only when GDRAG_STUB_LLM is exactly "1"; ``generate`` then
	# returns a canned answer instead of loading the model.
	STUB = os.getenv("GDRAG_STUB_LLM") == "1"

	# Import spaces BEFORE torch so ZeroGPU can patch CUDA. Degrade to a no-op
	# decorator (CPU) when running locally without the package.
	try:
	    import spaces
	    GPU = spaces.GPU
	    # True whenever the ``spaces`` package imports successfully.
	    ON_ZERO = True
	except Exception:  # not on a Space
	    # Deliberately broad: any failure to import ``spaces`` means "run locally".
	    ON_ZERO = False

	    def GPU(*dargs, **dkwargs):
	        """No-op stand-in for ``spaces.GPU`` when the package is unavailable.

	        Accepts both decorator spellings used with the real decorator:

	        * ``@GPU`` -- called with the function itself; returns it unchanged.
	        * ``@GPU(duration=180)`` -- called with options only; returns a
	          decorator that hands the function back unchanged.

	        All options are accepted and ignored.
	        """
	        # Bare ``@GPU`` form: the first positional argument is the function.
	        if dargs and callable(dargs[0]):
	            return dargs[0]

	        # Parameterised form: hand back an identity decorator.
	        def deco(fn):
	            return fn

	        return deco


	@lru_cache(maxsize=1)
	def _model_and_tokenizer():
	    """Return the ``(model, tokenizer)`` pair for ``MODEL_ID``, loading it once.

	    ``lru_cache(maxsize=1)`` memoises the result, so later calls in the same
	    process reuse the loaded objects. The function is called from inside
	    ``generate``, so on ZeroGPU the load happens in the GPU worker, where
	    bitsandbytes can quantize onto the allocated GPU (a 4-bit load requires a
	    GPU). 4-bit shrinks the 7B model from ~15 GB to ~5 GB, so it loads and
	    generates fast enough to fit the GPU budget.

	    When ``ON_ZERO`` is false the model is loaded unquantized in bfloat16
	    with no ``device_map``.
	    """
	    # Heavy imports stay local so importing this module does not pull them in.
	    import torch
	    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

	    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

	    if not ON_ZERO:
	        load_kwargs = {"torch_dtype": torch.bfloat16}
	    else:
	        load_kwargs = {
	            "quantization_config": BitsAndBytesConfig(
	                load_in_4bit=True,
	                bnb_4bit_quant_type="nf4",
	                bnb_4bit_compute_dtype=torch.bfloat16,
	                bnb_4bit_use_double_quant=True,
	            ),
	            # Place the whole model on GPU index 0.
	            "device_map": {"": 0},
	        }

	    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
	    llm.eval()
	    return llm, tokenizer


	def _render(messages, tok) -> str:
	    """Turn chat ``messages`` into prompt text via the tokenizer's chat template.

	    The template is asked for text rather than token ids (``tokenize=False``)
	    and with ``add_generation_prompt=True``.
	    """
	    template_options = {"tokenize": False, "add_generation_prompt": True}
	    return tok.apply_chat_template(messages, **template_options)


	@GPU(duration=180)
	def generate(messages: list[dict], max_new_tokens: int = 512,
	             temperature: float = 0.2) -> str:
	    """Produce the assistant's reply to chat-format ``messages``.

	    Args:
	        messages: Chat messages passed to the tokenizer's chat template.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        temperature: Sampling temperature. Sampling is enabled only when it
	            is greater than 0; the value forwarded to the model is clamped
	            to at least ``1e-4``.

	    Returns:
	        The decoded completion only (prompt tokens removed), with special
	        tokens skipped and surrounding whitespace stripped. In stub mode
	        (``STUB`` true) a fixed canned answer is returned instead.
	    """
	    if STUB:
	        # Canned reply: no torch import, no model load.
	        return (
	            "Here is a Godot 4 movement script:\n\n```gdscript\n"
	            "extends CharacterBody2D\n\n@export var speed: float = 200.0\n\n"
	            "func _physics_process(delta: float) -> void:\n"
	            "\tvar dir := Input.get_vector(\"ui_left\", \"ui_right\", "
	            "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
	            "\tmove_and_slide()\n```\n"
	        )

	    # Imported lazily so the stub path never needs torch.
	    import torch
	    import time

	    model, tok = _model_and_tokenizer()
	    device = "cuda" if ON_ZERO else "cpu"

	    prompt = _render(messages, tok)
	    encoded = tok([prompt], return_tensors="pt").to(device)
	    prompt_len = encoded["input_ids"].shape[1]

	    print(f"[gen] dev={device} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
	    started = time.time()

	    generation_options = {
	        "max_new_tokens": max_new_tokens,
	        "do_sample": temperature > 0,
	        "temperature": max(temperature, 1e-4),
	        "top_p": 0.95,
	        "pad_token_id": tok.eos_token_id,
	    }
	    with torch.no_grad():
	        output_ids = model.generate(**encoded, **generation_options)

	    n_new = int(output_ids.shape[-1] - prompt_len)
	    print(f"[gen] done {n_new} tokens in {time.time()-started:.1f}s on {device}", flush=True)

	    # Drop the prompt tokens so only the newly generated tail is decoded.
	    completion_ids = output_ids[0][prompt_len:]
	    return tok.decode(completion_ids, skip_special_tokens=True).strip()


	def warmup() -> None:
	    """Pre-load the model when that is possible in this process.

	    Does nothing on ZeroGPU, where the model can only be loaded inside
	    ``@spaces.GPU`` (the GPU does not exist in the main process), and does
	    nothing in stub mode. Otherwise it calls ``_model_and_tokenizer()`` so
	    its cache is populated ahead of the first ``generate`` call.
	    """
	    if ON_ZERO or STUB:
	        return
	    _model_and_tokenizer()