Spaces:
Running on Zero
Running on Zero
| """Nova Streaming Being · ZeroGPU Space | |
| Real Qwen3-TTS-12Hz-0.6B-Base inference + honest avatar substrate verdict. | |
| Phase 2 of NOVA mission · 2026-05-14: | |
| - Replaced _tone() placeholder with real Qwen3-TTS inference (fits zero-a10g). | |
| - Live-Avatar (Quark-Vision) requires 5×H800 / 80GB VRAM per its README; refuses | |
| cleanly on this substrate. | |
| - frontier_turn() still produces an audio+motion preview using real TTS audio. | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import subprocess | |
| import time | |
| from pathlib import Path | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
try:
    import spaces
except Exception:  # kept broad: any import-time failure falls back to the shim
    class _Spaces:
        """No-op stand-in used when the `spaces` package cannot be imported."""

        def GPU(self, *args, **kwargs):
            """Mimic `spaces.GPU` as a pass-through decorator.

            Supports both spellings so callers do not have to care which
            implementation is active:

            * bare:   ``@spaces.GPU``              -> returns the function itself
            * called: ``@spaces.GPU(duration=60)`` -> returns a decorator
            """
            # Bare usage passes the decorated function as the only argument.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def deco(fn):
                return fn

            return deco

    spaces = _Spaces()
# Scratch directory that receives every generated artefact (wav / jpg / mp4).
ROOT = Path("/tmp/nova-streaming-being")
ROOT.mkdir(exist_ok=True, parents=True)

# Static description of the model stack and hardware, shown verbatim in the UI.
# Key order is preserved because it is visible in the serialised JSON.
STACK = dict(
    tts_target="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
    tts_status="real-inference-on-zero-a10g",
    avatar_target="Quark-Vision/Live-Avatar",
    avatar_status="substrate-rejected · 5xH800 req per upstream README",
    avatar_proposed_alt="Sonic / Hallo / Real3DPortrait — pending owner pick",
    substrate="HF Space hardware: zero-a10g (24GB)",
)
# ─────────────────────────────────────────────────────────────────────────────
# Qwen3-TTS lazy load (cold-load on first call · GPU only via @spaces.GPU)
# ─────────────────────────────────────────────────────────────────────────────
_TTS_MODEL = None  # process-wide cache, filled by the first _load_qwen_tts() call


def _load_qwen_tts():
    """Return the cached Qwen3-TTS model, loading it on first use.

    The heavy imports (`torch`, `qwen_tts`) happen inside this function so
    that importing the module itself does not require them.

    Raises:
        RuntimeError: if the `qwen_tts` package cannot be imported.
    """
    global _TTS_MODEL

    if _TTS_MODEL is None:
        import torch

        try:
            from qwen_tts import Qwen3TTSModel
        except ImportError as exc:
            raise RuntimeError(
                f"qwen-tts package missing: {exc}. Add 'qwen-tts' to requirements.txt."
            ) from exc

        load_kwargs = {
            "device_map": "cuda:0",
            "dtype": torch.bfloat16,
            "attn_implementation": "sdpa",
        }
        print("[qwen_tts] loading Qwen/Qwen3-TTS-12Hz-0.6B-Base on cuda...", flush=True)
        _TTS_MODEL = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-0.6B-Base", **load_kwargs
        )
        print("[qwen_tts] ready", flush=True)

    return _TTS_MODEL
# Keys probed, in order, when the model hands back a dict.
_AUDIO_KEYS = ("audio", "waveform", "samples")


def _as_mono_float32(audio):
    """Flatten a tensor / array / sequence of those into a 1-D float32 ndarray."""
    if audio is None:
        raise ValueError("TTS model returned no audio")
    # A sequence of per-utterance waveforms: flatten each one and join them.
    if isinstance(audio, (list, tuple)) and audio and not isinstance(audio[0], (int, float)):
        return np.concatenate([_as_mono_float32(part) for part in audio])
    if hasattr(audio, "detach"):
        # Tensor-like: NumPy has no bfloat16, so upcast before converting.
        audio = audio.detach().float().cpu().numpy()
    elif hasattr(audio, "cpu"):
        audio = audio.cpu().numpy()
    flat = np.asarray(audio, dtype=np.float32).reshape(-1)
    if flat.size == 0:
        raise ValueError("TTS model returned empty audio")
    return flat


def _unpack_tts_result(result, default_rate: int):
    """Normalise a model return value into ``(float32 waveform, sample_rate)``."""
    rate = default_rate
    if isinstance(result, dict):
        rate = result.get("sample_rate", default_rate)
        # Explicit `is not None` probe: `a or b` on an array raises
        # "truth value of an array is ambiguous".
        result = next(
            (result[key] for key in _AUDIO_KEYS if result.get(key) is not None),
            None,
        )
    elif (
        isinstance(result, (tuple, list))
        and len(result) == 2
        and isinstance(result[1], (int, np.integer))
        and not isinstance(result[0], (int, float))
    ):
        # NOTE(review): tolerates a `(wavs, sample_rate)` pair as well —
        # confirm against the installed qwen-tts version which shape it returns.
        result, rate = result
    return _as_mono_float32(result), int(rate)


def _placeholder_tone(text: str, rate: int):
    """Build a short faded 185 Hz sine used when real synthesis fails."""
    seconds = min(4.0, max(1.2, len(text) / 28))
    t = np.linspace(0, seconds, int(rate * seconds), endpoint=False)
    envelope = np.minimum(1, t * 8) * np.minimum(1, (seconds - t) * 8)
    return 0.13 * np.sin(2 * np.pi * 185 * t) * envelope


# The called form works with both the real `spaces` package and the local
# no-op shim. NOTE(review): 120 s is a guess meant to cover a cold model load;
# tune to observed latency.
@spaces.GPU(duration=120)
def qwen3_tts_stream(text: str, voice_ref=None) -> tuple[str, str]:
    """Synthesise *text* to a WAV file and report how it was produced.

    Args:
        text: Text to speak. ``None`` is treated as an empty string.
        voice_ref: Optional reference clip; when truthy it is forwarded to the
            model as ``ref_audio``.

    Returns:
        ``(wav_path, meta_json)``. ``wav_path`` always points at a written
        file: real model output on success, otherwise a short placeholder
        tone. ``meta_json`` is an indented JSON string whose ``status`` is
        ``"real"`` or ``"fallback_tone (err: ...)"``.
    """
    started = time.time()
    text = text or ""
    # Include the voice reference in the name so a cloned and an un-cloned
    # render of the same text do not overwrite each other.
    digest = hashlib.sha256(
        f"{text}\0{voice_ref or ''}".encode("utf-8")
    ).hexdigest()[:10]
    out = ROOT / f"qwen3-{digest}.wav"
    err = None
    sample_rate = 24000
    try:
        model = _load_qwen_tts()
        if voice_ref:
            raw = model.generate(text=text, ref_audio=voice_ref)
        else:
            raw = model.generate(text=text)
        waveform, sample_rate = _unpack_tts_result(raw, sample_rate)
        sf.write(out, waveform, sample_rate)
    except Exception as exc:
        err = f"{type(exc).__name__}: {exc}"
        # Fall back to a brief placeholder so the Gradio contract still
        # returns a valid file path. Reset the rate so the metadata describes
        # the file that is actually written.
        sample_rate = 24000
        sf.write(out, _placeholder_tone(text, sample_rate), sample_rate)
    meta = {
        "adapter": "qwen3_tts_stream",
        "target_model": STACK["tts_target"],
        "status": "real" if err is None else f"fallback_tone (err: {err})",
        # Only claim cloning when the cloned render actually succeeded.
        "voice_cloned": bool(voice_ref) and err is None,
        "sample_rate": sample_rate,
        "wall_ms": int((time.time() - started) * 1000),
    }
    return str(out), json.dumps(meta, indent=2)
def liveavatar_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Refuse the LiveAvatar render and explain why as JSON.

    All three inputs are accepted for UI wiring but ignored. The result is
    always ``(None, verdict_json)`` — no video is ever produced here.
    """
    alternatives = [
        "Sonic (Tencent) — single-GPU near-realtime audio→head",
        "Hallo / Hallo3 (fudan-generative-ai) — single-GPU, ~5–10s per second of video (NOT realtime)",
        "Real3DPortrait (CMU) — fast, lower fidelity",
    ]
    verdict = {
        "adapter": "liveavatar_render",
        "target_model": STACK["avatar_target"],
        "status": "substrate-rejected",
        "reason": "Quark-Vision/Live-Avatar README: 14B-param diffusion model · 20fps requires 5×H800 GPUs · 80GB VRAM minimum. zero-a10g (24GB) cannot honestly run it.",
        "alternatives_for_a10g": alternatives,
        "owner_decision_needed": "pick alternative or accept Phase 7 async path with bigger GPU",
    }
    verdict_json = json.dumps(verdict, indent=2)
    return None, verdict_json
def avatarforcing_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Report that AvatarForcing cannot run yet.

    The inputs are ignored; the result is always ``(None, status_json)``.
    """
    status = {}
    status["adapter"] = "avatarforcing_render"
    status["target_model"] = "AvatarForcing"
    status["status"] = "waiting-for-public-weights"
    return None, json.dumps(status, indent=2)
def _motion_video(reference_image, audio_path: str, text: str) -> tuple[str | None, dict]:
    """Mux a captioned still image with *audio_path* into an MP4 (no lipsync).

    Args:
        reference_image: Path of an image to show; when falsy a plain dark
            720x1280 frame is used instead.
        audio_path: Audio file passed to ffmpeg as the sound track.
        text: Caption drawn at the bottom of the frame (first 74 characters).

    Returns:
        ``(mp4_path, meta)`` on success, ``(None, meta)`` on any failure, with
        ``meta["status"]`` set to ``"generated"`` or ``"failed"``.
    """
    still = None
    try:
        from PIL import Image, ImageDraw, ImageEnhance

        if reference_image:
            img = Image.open(reference_image).convert("RGB")
        else:
            img = Image.new("RGB", (720, 1280), "#03050b")
        img.thumbnail((720, 1280))
        canvas = Image.new("RGB", (720, 1280), "#03050b")
        canvas.paste(img, ((720 - img.width) // 2, 0))
        draw = ImageDraw.Draw(canvas)
        # Title band at the top, caption band at the bottom.
        draw.rectangle((0, 0, 720, 130), fill=(3, 5, 11))
        draw.text((36, 42), "NOVA STREAMING BEING", fill=(220, 255, 232))
        draw.rectangle((0, 1120, 720, 1280), fill=(3, 5, 11))
        draw.text((36, 1160), text[:74], fill=(247, 251, 255))
        # The timestamp in the hash gives every call its own frame file.
        still = ROOT / f"frame-{hashlib.sha256((text + str(time.time())).encode()).hexdigest()[:10]}.jpg"
        ImageEnhance.Contrast(canvas).enhance(1.08).save(still, quality=92)
        out = ROOT / f"frontier-{hashlib.sha256((text + audio_path).encode()).hexdigest()[:10]}.mp4"
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-loop", "1", "-framerate", "30", "-i", str(still),
                "-i", audio_path,
                "-vf", "scale=720:1280,format=yuv420p",
                "-c:v", "libx264", "-preset", "veryfast", "-tune", "stillimage",
                "-c:a", "aac", "-b:a", "128k", "-shortest", str(out),
            ],
            check=True,
        )
        return str(out), {"engine": "motion-presence", "frame_ms": 33.33, "status": "generated"}
    except Exception as exc:
        return None, {"engine": "motion-presence", "status": "failed", "error": f"{type(exc).__name__}: {exc}"}
    finally:
        # The still frame is only an ffmpeg input; remove it so each call does
        # not leave a JPEG behind. Best-effort: cleanup must not mask the result.
        if still is not None:
            try:
                still.unlink(missing_ok=True)
            except OSError:
                pass
def frontier_turn(text: str, reference_image=None) -> tuple[str, str | None, str]:
    """Run one full turn: speech synthesis, then the still-image video.

    Returns:
        ``(audio_path, video_path_or_None, report_json)`` where the report
        nests the TTS metadata and the video metadata.
    """
    t0 = time.time()

    audio_path, tts_json = qwen3_tts_stream(text)
    video_path, video_meta = _motion_video(reference_image, audio_path, text)

    report = {
        "adapter": "frontier_turn",
        "tts": json.loads(tts_json),
        "video": video_meta,
        "liveavatar_verdict": {
            "can_claim_realtime_lipsync": False,
            "reason": "see liveavatar_render adapter for substrate verdict",
        },
    }
    # Timed last so it covers both stages and the metadata parsing.
    report["wall_ms"] = int((time.time() - t0) * 1000)

    return audio_path, video_path, json.dumps(report, indent=2)
def public_gpu_status() -> str:
    """Serialise the module-level STACK descriptor as indented JSON."""
    snapshot = json.dumps(STACK, indent=2)
    return snapshot
# Gradio UI: a header with the static stack descriptor, then one tab per adapter.
# Components are created in display order, so statement order here is the layout.
with gr.Blocks(title="Nova Streaming Being") as demo:
    gr.Markdown("# Nova Streaming Being · ZeroGPU")
    gr.Markdown("Backs the `Talk to Nova` CVI loop on `cloud.novaregistrar.com/realestate`.")
    # Evaluated once while the layout is built, not per request.
    gr.Code(value=public_gpu_status(), label="Runtime substrate", language="json")
    # Tab 1: text (+ optional image) -> speech audio, still-image video, JSON report.
    with gr.Tab("Frontier Turn"):
        ft = gr.Textbox(label="Nova text", value="Eric, I am online on the GPU renderer path.")
        fref = gr.Image(label="Reference image", type="filepath")
        faudio = gr.Audio(label="Qwen3-TTS speech", type="filepath")
        fvideo = gr.Video(label="Avatar response (motion-presence)")
        fmeta = gr.Code(label="Renderer status", language="json")
        gr.Button("Run frontier turn").click(frontier_turn, [ft, fref], [faudio, fvideo, fmeta])
    # Tab 2: speech only, with an optional reference clip passed as `voice_ref`.
    with gr.Tab("Speech (Qwen3-TTS · real)"):
        t = gr.Textbox(label="Nova text", value="Eric, I am online and ready to run the business loop.")
        voice = gr.Audio(label="Optional voice reference (3s clip)", type="filepath")
        audio = gr.Audio(label="Speech", type="filepath")
        meta = gr.Code(label="Adapter status", language="json")
        gr.Button("Render speech").click(qwen3_tts_stream, [t, voice], [audio, meta])
    # Tab 3: both avatar adapters return (None, status JSON); the video stays empty.
    with gr.Tab("Avatar (substrate-honest)"):
        ref = gr.Image(label="Reference image")
        aud = gr.Audio(label="Speech audio", type="filepath")
        prompt = gr.Textbox(label="Scene prompt", value="Full-size redheaded CEO operator speaking directly to Eric.")
        video = gr.Video(label="Avatar stream segment")
        vmeta = gr.Code(label="Adapter status", language="json")
        gr.Button("Render LiveAvatar segment").click(liveavatar_render, [ref, aud, prompt], [video, vmeta])
        gr.Button("Render AvatarForcing segment").click(avatarforcing_render, [ref, aud, prompt], [video, vmeta])

# Launch only when run as a script; the queue's default concurrency limit is 2.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=2).launch(ssr_mode=False)