"""In-process GPU inference for Hugging Face Spaces ZeroGPU.

Mirrors modal_app.py, but instead of calling Modal over HTTP the models run
locally inside the Space behind @spaces.GPU (which allocates a ZeroGPU for the
duration of each call). Selected by TINYWORLD_INFER=local.

Imported LAZILY (only when the local backend is active) because it pulls in
torch / transformers / voxcpm / whisper — heavy deps the offline test suite and
the Modal-backed path never need.

Models are loaded on CPU once and moved to CUDA inside the GPU-decorated
functions (CUDA is only available there on ZeroGPU).
"""

import os
import re
import tempfile

import spaces
import torch

# Model identifiers; each can be overridden through the environment.
LLM_MODEL_ID = os.environ.get("TINYWORLD_LLM", "nvidia/Nemotron-Mini-4B-Instruct")
VOICE_MODEL_ID = os.environ.get("TINYWORLD_VOICE_MODEL", "openbmb/VoxCPM2")
WHISPER_SIZE = os.environ.get("TINYWORLD_WHISPER", "base")

# Process-wide lazy singletons, populated by the _load_* helpers below.
_llm = None
_tok = None
_voice = None
_whisper = None

# Reasoning-block markers removed from raw completions by _strip_think().
_THINK_CLOSE = "</think>"
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", flags=re.DOTALL)


# --------------------------------------------------------------------------- LLM
def _load_llm():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The model is created with ``torch.bfloat16`` weights and is not moved to
    any device here; ``generate_batch`` does that inside its GPU allocation.
    """
    global _llm, _tok
    if _llm is None:
        # Deferred import: transformers is only needed once the LLM is used.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"[inference] loading {LLM_MODEL_ID} …")
        _tok = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
        _llm = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_ID,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
    return _llm, _tok


def warmup():
    """Download + load the LLM into CPU RAM ahead of time (no GPU needed) so
    the first @spaces.GPU call only moves to CUDA and generates — avoids the
    cold 8GB load racing the ZeroGPU duration limit.

    Safe to call from a background thread.
    """
    try:
        _load_llm()
        print("[inference] LLM warmed (CPU RAM)")
    except Exception as e:
        # Deliberately best-effort: a failed warmup is only reported here.
        print(f"[inference] warmup failed: {e}")


def _strip_think(text: str) -> str:
    """Remove ``<think>…</think>`` reasoning from a raw completion.

    Every complete ``<think>…</think>`` span is deleted (non-greedy, spanning
    newlines). If a closing tag is still present afterwards, i.e. one with no
    matching opening tag, only the text after the last closing tag is kept.

    Returns the remaining text with surrounding whitespace stripped.
    """
    text = _THINK_BLOCK_RE.sub("", text).strip()
    if _THINK_CLOSE in text:
        # An unmatched close tag survives the regex; drop everything up to
        # and including the last one.
        text = text.rpartition(_THINK_CLOSE)[2]
    return text.strip()


@spaces.GPU(duration=120)
def generate_batch(prompts):
    """One raw completion per prompt, all in a single GPU allocation.

    Each prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template and sampled independently (up to 400 new
    tokens, temperature 0.8, top-p 0.9).

    Returns a list of raw strings aligned with ``prompts`` (the reaction
    engine parses them).
    """
    mdl, tok = _load_llm()
    mdl.to("cuda")
    outputs = []
    for prompt in prompts:
        messages = [{"role": "user", "content": prompt}]
        input_text = tok.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )
        inputs = tok(input_text, return_tensors="pt").to("cuda")
        with torch.no_grad():
            out = mdl.generate(
                **inputs,
                max_new_tokens=400,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
            )
        # generate() returns prompt + continuation; keep the continuation only.
        prompt_len = inputs["input_ids"].shape[1]
        new = out[0][prompt_len:]
        text = tok.decode(new, skip_special_tokens=True).strip()
        outputs.append(_strip_think(text))
    return outputs


# --------------------------------------------------------------------------- TTS
def _load_voice():
    """Return the cached VoxCPM voice model, loading it on first use."""
    global _voice
    if _voice is None:
        # Deferred import: voxcpm is only needed once speech is synthesized.
        from voxcpm import VoxCPM

        print(f"[inference] loading {VOICE_MODEL_ID} …")
        _voice = VoxCPM.from_pretrained(VOICE_MODEL_ID)
    return _voice


@spaces.GPU(duration=60)
def synthesize_voice(text, voice_desc):
    """Returns a path to a WAV file (matches voice.generate_voice's contract).

    ``voice_desc`` is prepended verbatim to ``text`` to form the model input.
    """
    import soundfile as sf

    model = _load_voice()
    wav = model.generate(
        text=f"{voice_desc}{text}", cfg_value=2.0, inference_timesteps=10
    )
    # NOTE(review): the file name depends only on the PID, so successive calls
    # made from the same process write to the same path — confirm that callers
    # consume the file before the next synthesis.
    path = os.path.join(tempfile.gettempdir(), f"tinyworld_voice_{os.getpid()}.wav")
    sf.write(path, wav, model.tts_model.sample_rate)
    return path


# --------------------------------------------------------------------------- ASR
def _load_whisper():
    """Return the cached Whisper model, loading it on first use."""
    global _whisper
    if _whisper is None:
        # Deferred import: whisper is only needed once audio is transcribed.
        import whisper

        print(f"[inference] loading Whisper {WHISPER_SIZE} …")
        _whisper = whisper.load_model(WHISPER_SIZE)
    return _whisper


@spaces.GPU(duration=60)
def transcribe_audio(audio_path):
    """Transcribe the audio at ``audio_path`` with Whisper (fp16).

    Returns the stripped transcript, or an empty string when the result has
    no text.
    """
    model = _load_whisper()
    result = model.transcribe(audio_path, fp16=True)
    return (result.get("text") or "").strip()