"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU. Canonical ZeroGPU pattern: the main process stays light (no model in it) and the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where the GPU actually exists. ``device_map="cuda"`` (accelerate) puts every shard on the allocated GPU. An ``lru_cache`` keeps it resident for the life of each GPU worker. Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``, and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model into the main process makes that fork heavy and tangled with gradio's asyncio loop, and the GPU task never runs (it just times out -> "GPU task aborted"). Keeping the main process model-free is what makes the GPU call actually execute. Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or the model download. """ from __future__ import annotations import os from functools import lru_cache MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct") STUB = os.environ.get("GDRAG_STUB_LLM") == "1" # Import spaces BEFORE torch so ZeroGPU can patch CUDA. Degrade to a no-op # decorator (CPU) when running locally without the package. try: import spaces GPU = spaces.GPU ON_ZERO = True except Exception: # not on a Space ON_ZERO = False def GPU(*dargs, **dkwargs): def deco(fn): return fn if dargs and callable(dargs[0]): return dargs[0] return deco @lru_cache(maxsize=1) def _model_and_tokenizer(): """Load tokenizer + 4-bit model. Called from inside ``generate`` so on ZeroGPU it runs in the GPU worker where bitsandbytes can quantize onto the allocated GPU (4-bit load requires a GPU). 4-bit shrinks the 7B from ~15 GB to ~5 GB, so it loads and generates fast enough to fit the GPU budget.""" import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig tok = AutoTokenizer.from_pretrained(MODEL_ID) if ON_ZERO: quant = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, ) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, quantization_config=quant, device_map={"": 0}, ) else: model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16) model.eval() return model, tok def _render(messages, tok) -> str: return tok.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) @GPU(duration=180) def generate(messages: list[dict], max_new_tokens: int = 512, temperature: float = 0.2) -> str: """Generate an assistant reply for chat-format ``messages``.""" if STUB: return ( "Here is a Godot 4 movement script:\n\n```gdscript\n" "extends CharacterBody2D\n\n@export var speed: float = 200.0\n\n" "func _physics_process(delta: float) -> void:\n" "\tvar dir := Input.get_vector(\"ui_left\", \"ui_right\", " "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n" "\tmove_and_slide()\n```\n" ) import torch, time model, tok = _model_and_tokenizer() dev = "cuda" if ON_ZERO else "cpu" text = _render(messages, tok) inputs = tok([text], return_tensors="pt").to(dev) print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True) t0 = time.time() with torch.no_grad(): out = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=temperature > 0, temperature=max(temperature, 1e-4), top_p=0.95, pad_token_id=tok.eos_token_id, ) n_new = int(out.shape[-1] - inputs["input_ids"].shape[1]) print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True) gen = out[0][inputs["input_ids"].shape[1]:] return tok.decode(gen, skip_special_tokens=True).strip() def warmup() -> None: """No-op on ZeroGPU: the model can only be loaded inside @spaces.GPU (the GPU does not exist in the main process).""" if not ON_ZERO and not STUB: _model_and_tokenizer()