Spaces:
Running on Zero
Running on Zero
"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.

Canonical ZeroGPU pattern: the main process stays light (no model in it) and
the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where
the GPU actually exists. ``device_map={"": 0}`` (accelerate) puts every shard
on the allocated GPU. The loader is meant to be cached (``lru_cache``) so the
model stays resident for the life of each GPU worker.

Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
into the main process makes that fork heavy and tangled with gradio's asyncio
loop, and the GPU task never runs (it just times out -> "GPU task aborted").
Keeping the main process model-free is what makes the GPU call actually execute.

Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
the model download.
"""
| from __future__ import annotations | |
| import os | |
| from functools import lru_cache | |
# Hub id of the checkpoint to load; override with the GDRAG_LLM env var.
MODEL_ID = os.getenv("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
# True only when GDRAG_STUB_LLM is exactly "1": generation then returns a
# canned answer instead of loading the model.
STUB = os.getenv("GDRAG_STUB_LLM") == "1"
# ``spaces`` has to be imported BEFORE torch so ZeroGPU can patch CUDA. When
# the package cannot be imported (local run, not on a Space) fall back to a
# decorator that leaves the wrapped function untouched, i.e. plain CPU.
try:
    import spaces
    GPU = spaces.GPU
    ON_ZERO = True
except Exception:  # not on a Space
    ON_ZERO = False

    def GPU(*dargs, **dkwargs):
        """Stand-in for ``spaces.GPU`` that returns the function unchanged.

        Works both bare (``@GPU``) and parameterised (``@GPU(...)``); any
        decorator arguments are accepted and ignored.
        """
        used_bare = bool(dargs) and callable(dargs[0])
        if used_bare:
            # ``@GPU`` directly on a function: hand the function back.
            return dargs[0]
        # ``@GPU(...)``: hand back an identity decorator.
        return lambda fn: fn
@lru_cache(maxsize=1)
def _model_and_tokenizer():
    """Load the tokenizer and model once per process; return ``(model, tok)``.

    The result is memoized with ``lru_cache`` so that repeated ``generate``
    calls, and a preceding ``warmup``, in the same process reuse the loaded
    weights instead of reloading the checkpoint each time. ``lru_cache`` does
    not cache exceptions, so a failed load is retried on the next call.

    Called from inside ``generate`` so on ZeroGPU it runs in the GPU worker
    where bitsandbytes can quantize onto the allocated GPU (4-bit load
    requires a GPU). 4-bit shrinks the 7B from ~15 GB to ~5 GB, so it loads
    and generates fast enough to fit the GPU budget.

    Returns:
        A ``(model, tokenizer)`` tuple with the model switched to eval mode.
    """
    # Imported lazily so the module can be imported without torch/transformers
    # being loaded (and so ``spaces`` is always imported first).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    if ON_ZERO:
        # NF4 4-bit weights with bf16 compute and double quantization.
        quant = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        # ``{"": 0}`` maps the whole model onto device 0.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, quantization_config=quant, device_map={"": 0},
        )
    else:
        # Off-Space: unquantized bf16 load with default device placement.
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
    model.eval()
    return model, tok
def _render(messages, tok) -> str:
    """Turn chat ``messages`` into a prompt string using ``tok``'s chat template.

    The template is applied without tokenizing and with the generation prompt
    appended.
    """
    template_options = {"tokenize": False, "add_generation_prompt": True}
    return tok.apply_chat_template(messages, **template_options)
def generate(messages: list[dict], max_new_tokens: int = 512,
             temperature: float = 0.2) -> str:
    """Produce the assistant's reply for chat-format ``messages``.

    Args:
        messages: Chat turns, passed unchanged to the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` disable sampling;
            the value forwarded to the model is clamped to at least ``1e-4``.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped. When ``STUB`` is set, a fixed GDScript example is
        returned and no model is loaded.

    NOTE(review): no ``@GPU`` decorator is applied to this function in the
    visible code; presumably the caller wraps it -- confirm.
    """
    if STUB:
        return (
            "Here is a Godot 4 movement script:\n\n"
            "```gdscript\n"
            "extends CharacterBody2D\n\n"
            "@export var speed: float = 200.0\n\n"
            "func _physics_process(delta: float) -> void:\n"
            '\tvar dir := Input.get_vector("ui_left", "ui_right", "ui_up", "ui_down")\n'
            "\tvelocity = dir * speed\n"
            "\tmove_and_slide()\n"
            "```\n"
        )

    import time
    import torch

    model, tok = _model_and_tokenizer()
    dev = "cuda" if ON_ZERO else "cpu"

    prompt = _render(messages, tok)
    encoded = tok([prompt], return_tensors="pt").to(dev)
    prompt_len = encoded["input_ids"].shape[1]

    print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
    t0 = time.time()

    generation_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": temperature > 0,
        # Clamp so a zero/negative temperature is never handed to the model.
        "temperature": max(temperature, 1e-4),
        "top_p": 0.95,
        "pad_token_id": tok.eos_token_id,
    }
    with torch.no_grad():
        output_ids = model.generate(**encoded, **generation_options)

    n_new = int(output_ids.shape[-1] - prompt_len)
    print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)

    # Drop the prompt tokens; keep only the newly generated tail.
    reply_ids = output_ids[0][prompt_len:]
    return tok.decode(reply_ids, skip_special_tokens=True).strip()
def warmup() -> None:
    """Pre-load the model when running off ZeroGPU and not in stub mode.

    Does nothing on ZeroGPU: the model can only be loaded inside @spaces.GPU
    (the GPU does not exist in the main process). Also does nothing when
    ``STUB`` is set.
    """
    if ON_ZERO or STUB:
        return
    _model_and_tokenizer()