"""Nova Streaming Being · ZeroGPU Space Real Qwen3-TTS-12Hz-0.6B-Base inference + honest avatar substrate verdict. Phase 2 of NOVA mission · 2026-05-14: - Replaced _tone() placeholder with real Qwen3-TTS inference (fits zero-a10g). - Live-Avatar (Quark-Vision) requires 5×H800 / 80GB VRAM per its README; refuses cleanly on this substrate. - frontier_turn() still produces an audio+motion preview using real TTS audio. """ from __future__ import annotations import hashlib import json import subprocess import time from pathlib import Path import gradio as gr import numpy as np import soundfile as sf try: import spaces except Exception: class _Spaces: def GPU(self, *args, **kwargs): def deco(fn): return fn return deco spaces = _Spaces() ROOT = Path("/tmp/nova-streaming-being") ROOT.mkdir(parents=True, exist_ok=True) STACK = { "tts_target": "Qwen/Qwen3-TTS-12Hz-0.6B-Base", "tts_status": "real-inference-on-zero-a10g", "avatar_target": "Quark-Vision/Live-Avatar", "avatar_status": "substrate-rejected · 5xH800 req per upstream README", "avatar_proposed_alt": "Sonic / Hallo / Real3DPortrait — pending owner pick", "substrate": "HF Space hardware: zero-a10g (24GB)", } # ───────────────────────────────────────────────────────────────────────────── # Qwen3-TTS lazy load (cold-load on first call · GPU only via @spaces.GPU) # ───────────────────────────────────────────────────────────────────────────── _TTS_MODEL = None def _load_qwen_tts(): """Load Qwen3-TTS-12Hz-0.6B-Base. Imports inside fn so the Space can boot on CPU before the first GPU call.""" global _TTS_MODEL if _TTS_MODEL is not None: return _TTS_MODEL import torch try: from qwen_tts import Qwen3TTSModel except ImportError as exc: raise RuntimeError( f"qwen-tts package missing: {exc}. Add 'qwen-tts' to requirements.txt." ) from exc print("[qwen_tts] loading Qwen/Qwen3-TTS-12Hz-0.6B-Base on cuda...", flush=True) _TTS_MODEL = Qwen3TTSModel.from_pretrained( "Qwen/Qwen3-TTS-12Hz-0.6B-Base", device_map="cuda:0", dtype=torch.bfloat16, attn_implementation="sdpa", ) print("[qwen_tts] ready", flush=True) return _TTS_MODEL @spaces.GPU(duration=60) def qwen3_tts_stream(text: str, voice_ref=None) -> tuple[str, str]: started = time.time() digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:10] out = ROOT / f"qwen3-{digest}.wav" err = None sample_rate = 24000 used_voice_clone = bool(voice_ref) try: model = _load_qwen_tts() # Default ref clip — Nova's existing voice profile if owner mounts one # Voice cloning from user-supplied ref if provided if voice_ref: audio_arr = model.generate(text=text, ref_audio=voice_ref) else: audio_arr = model.generate(text=text) if isinstance(audio_arr, dict): sample_rate = audio_arr.get("sample_rate", sample_rate) audio_arr = audio_arr.get("audio") or audio_arr.get("waveform") or audio_arr.get("samples") if hasattr(audio_arr, "cpu"): audio_arr = audio_arr.cpu().numpy() audio_arr = np.asarray(audio_arr, dtype=np.float32).reshape(-1) sf.write(out, audio_arr, sample_rate) except Exception as exc: err = f"{type(exc).__name__}: {exc}" # Fall back to brief placeholder so the Gradio contract still returns # a valid file path. sr = 24000 seconds = min(4.0, max(1.2, len(text) / 28)) t = np.linspace(0, seconds, int(sr * seconds), endpoint=False) envelope = np.minimum(1, t * 8) * np.minimum(1, (seconds - t) * 8) audio = 0.13 * np.sin(2 * np.pi * 185 * t) * envelope sf.write(out, audio, sr) meta = { "adapter": "qwen3_tts_stream", "target_model": STACK["tts_target"], "status": "real" if err is None else f"fallback_tone (err: {err})", "voice_cloned": used_voice_clone, "sample_rate": sample_rate, "wall_ms": int((time.time() - started) * 1000), } return str(out), json.dumps(meta, indent=2) @spaces.GPU(duration=10) def liveavatar_render(reference_image, audio_file, prompt: str) -> tuple[None, str]: """Substrate-reject LiveAvatar. Real model needs 5x H800.""" return None, json.dumps({ "adapter": "liveavatar_render", "target_model": STACK["avatar_target"], "status": "substrate-rejected", "reason": "Quark-Vision/Live-Avatar README: 14B-param diffusion model · 20fps requires 5×H800 GPUs · 80GB VRAM minimum. zero-a10g (24GB) cannot honestly run it.", "alternatives_for_a10g": [ "Sonic (Tencent) — single-GPU near-realtime audio→head", "Hallo / Hallo3 (fudan-generative-ai) — single-GPU, ~5–10s per second of video (NOT realtime)", "Real3DPortrait (CMU) — fast, lower fidelity", ], "owner_decision_needed": "pick alternative or accept Phase 7 async path with bigger GPU", }, indent=2) @spaces.GPU(duration=30) def avatarforcing_render(reference_image, audio_file, prompt: str) -> tuple[None, str]: return None, json.dumps({ "adapter": "avatarforcing_render", "target_model": "AvatarForcing", "status": "waiting-for-public-weights", }, indent=2) def _motion_video(reference_image, audio_path: str, text: str) -> tuple[str | None, dict]: """Audio + still-image motion-presence fallback (no lipsync).""" try: from PIL import Image, ImageDraw, ImageEnhance if reference_image: img = Image.open(reference_image).convert("RGB") else: img = Image.new("RGB", (720, 1280), "#03050b") img.thumbnail((720, 1280)) canvas = Image.new("RGB", (720, 1280), "#03050b") canvas.paste(img, ((720 - img.width) // 2, 0)) draw = ImageDraw.Draw(canvas) draw.rectangle((0, 0, 720, 130), fill=(3, 5, 11)) draw.text((36, 42), "NOVA STREAMING BEING", fill=(220, 255, 232)) draw.rectangle((0, 1120, 720, 1280), fill=(3, 5, 11)) draw.text((36, 1160), text[:74], fill=(247, 251, 255)) still = ROOT / f"frame-{hashlib.sha256((text + str(time.time())).encode()).hexdigest()[:10]}.jpg" ImageEnhance.Contrast(canvas).enhance(1.08).save(still, quality=92) out = ROOT / f"frontier-{hashlib.sha256((text + audio_path).encode()).hexdigest()[:10]}.mp4" subprocess.run( [ "ffmpeg", "-y", "-loglevel", "error", "-loop", "1", "-framerate", "30", "-i", str(still), "-i", audio_path, "-vf", "scale=720:1280,format=yuv420p", "-c:v", "libx264", "-preset", "veryfast", "-tune", "stillimage", "-c:a", "aac", "-b:a", "128k", "-shortest", str(out), ], check=True, ) return str(out), {"engine": "motion-presence", "frame_ms": 33.33, "status": "generated"} except Exception as exc: return None, {"engine": "motion-presence", "status": "failed", "error": f"{type(exc).__name__}: {exc}"} @spaces.GPU(duration=90) def frontier_turn(text: str, reference_image=None) -> tuple[str, str | None, str]: started = time.time() audio_path, tts_json = qwen3_tts_stream(text) video_path, video_meta = _motion_video(reference_image, audio_path, text) meta = { "adapter": "frontier_turn", "tts": json.loads(tts_json), "video": video_meta, "liveavatar_verdict": { "can_claim_realtime_lipsync": False, "reason": "see liveavatar_render adapter for substrate verdict", }, "wall_ms": int((time.time() - started) * 1000), } return audio_path, video_path, json.dumps(meta, indent=2) def public_gpu_status() -> str: return json.dumps(STACK, indent=2) with gr.Blocks(title="Nova Streaming Being") as demo: gr.Markdown("# Nova Streaming Being · ZeroGPU") gr.Markdown("Backs the `Talk to Nova` CVI loop on `cloud.novaregistrar.com/realestate`.") gr.Code(value=public_gpu_status(), label="Runtime substrate", language="json") with gr.Tab("Frontier Turn"): ft = gr.Textbox(label="Nova text", value="Eric, I am online on the GPU renderer path.") fref = gr.Image(label="Reference image", type="filepath") faudio = gr.Audio(label="Qwen3-TTS speech", type="filepath") fvideo = gr.Video(label="Avatar response (motion-presence)") fmeta = gr.Code(label="Renderer status", language="json") gr.Button("Run frontier turn").click(frontier_turn, [ft, fref], [faudio, fvideo, fmeta]) with gr.Tab("Speech (Qwen3-TTS · real)"): t = gr.Textbox(label="Nova text", value="Eric, I am online and ready to run the business loop.") voice = gr.Audio(label="Optional voice reference (3s clip)", type="filepath") audio = gr.Audio(label="Speech", type="filepath") meta = gr.Code(label="Adapter status", language="json") gr.Button("Render speech").click(qwen3_tts_stream, [t, voice], [audio, meta]) with gr.Tab("Avatar (substrate-honest)"): ref = gr.Image(label="Reference image") aud = gr.Audio(label="Speech audio", type="filepath") prompt = gr.Textbox(label="Scene prompt", value="Full-size redheaded CEO operator speaking directly to Eric.") video = gr.Video(label="Avatar stream segment") vmeta = gr.Code(label="Adapter status", language="json") gr.Button("Render LiveAvatar segment").click(liveavatar_render, [ref, aud, prompt], [video, vmeta]) gr.Button("Render AvatarForcing segment").click(avatarforcing_render, [ref, aud, prompt], [video, vmeta]) if __name__ == "__main__": demo.queue(default_concurrency_limit=2).launch(ssr_mode=False)