Novastar1's picture
phase 2 · real Qwen3-TTS-12Hz inference + honest LiveAvatar substrate verdict
31a69f7 verified
"""Nova Streaming Being · ZeroGPU Space
Real Qwen3-TTS-12Hz-0.6B-Base inference + honest avatar substrate verdict.
Phase 2 of NOVA mission · 2026-05-14:
- Replaced _tone() placeholder with real Qwen3-TTS inference (fits zero-a10g).
- Live-Avatar (Quark-Vision) requires 5×H800 / 80GB VRAM per its README; refuses
cleanly on this substrate.
- frontier_turn() still produces an audio+motion preview using real TTS audio.
"""
from __future__ import annotations
import hashlib
import json
import subprocess
import time
from pathlib import Path
import gradio as gr
import numpy as np
import soundfile as sf
try:
    import spaces
except Exception:
    # Deliberately broad: any failure while importing `spaces` falls back to
    # the no-op shim below so the decorated functions remain plain callables.

    class _Spaces:
        """Stand-in for the `spaces` module that exposes a no-op `GPU` decorator."""

        def GPU(self, *args, **kwargs):
            """Return the decorated function unchanged.

            Works both as ``@spaces.GPU(duration=...)`` (decorator factory) and
            as bare ``@spaces.GPU`` (direct decorator). All options are ignored.
            """
            # Bare usage: the function being decorated arrives as the only
            # positional argument, so hand it straight back.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def deco(fn):
                return fn

            return deco

    spaces = _Spaces()
# Scratch directory that receives every generated wav / jpg / mp4 file.
# It is created at import time so the adapters can write without checking.
ROOT = Path("/tmp/nova-streaming-being")
ROOT.mkdir(parents=True, exist_ok=True)
# Static description of the model targets and the hardware they run on.
# public_gpu_status() dumps it verbatim as JSON, and the adapters read the
# "tts_target" / "avatar_target" entries for their metadata.
STACK = {
"tts_target": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
"tts_status": "real-inference-on-zero-a10g",
"avatar_target": "Quark-Vision/Live-Avatar",
"avatar_status": "substrate-rejected · 5xH800 req per upstream README",
"avatar_proposed_alt": "Sonic / Hallo / Real3DPortrait — pending owner pick",
"substrate": "HF Space hardware: zero-a10g (24GB)",
}
# ─────────────────────────────────────────────────────────────────────────────
# Qwen3-TTS lazy load (cold-load on first call · GPU only via @spaces.GPU)
# ─────────────────────────────────────────────────────────────────────────────
# Process-wide cache for the loaded TTS model; filled by _load_qwen_tts().
_TTS_MODEL = None


def _load_qwen_tts():
    """Load the TTS model named by STACK["tts_target"] once and cache it.

    torch and qwen_tts are imported inside the function so that importing this
    module does not require them.

    Returns:
        The cached model object returned by Qwen3TTSModel.from_pretrained.

    Raises:
        RuntimeError: if the qwen_tts package cannot be imported.
    """
    # NOTE(review): the cache lives in this process only; confirm whether it
    # survives between ZeroGPU calls or whether every call reloads the model.
    global _TTS_MODEL
    if _TTS_MODEL is not None:
        return _TTS_MODEL
    import torch

    try:
        from qwen_tts import Qwen3TTSModel
    except ImportError as exc:
        raise RuntimeError(
            f"qwen-tts package missing: {exc}. Add 'qwen-tts' to requirements.txt."
        ) from exc

    # Single source of truth: load exactly the model that the adapters report
    # as "target_model" instead of repeating the id as a literal.
    model_id = STACK["tts_target"]
    print(f"[qwen_tts] loading {model_id} on cuda...", flush=True)
    _TTS_MODEL = Qwen3TTSModel.from_pretrained(
        model_id,
        device_map="cuda:0",
        dtype=torch.bfloat16,
        attn_implementation="sdpa",
    )
    print("[qwen_tts] ready", flush=True)
    return _TTS_MODEL
def _extract_audio(result, default_rate: int) -> tuple[np.ndarray, int]:
    """Normalize a model.generate() result to (flat float32 samples, sample rate).

    Accepts either a bare array/tensor or a dict carrying the audio under
    "audio", "waveform" or "samples" plus an optional "sample_rate".

    Raises:
        ValueError: if no audio payload is present or it holds no samples.
    """
    rate = default_rate
    payload = result
    if isinstance(result, dict):
        rate = result.get("sample_rate", default_rate)
        # Take the first key that is present. Chaining the lookups with `or`
        # does not work here: the truth value of a multi-element array is
        # ambiguous and raises instead of selecting a value.
        payload = next(
            (
                result[key]
                for key in ("audio", "waveform", "samples")
                if result.get(key) is not None
            ),
            None,
        )
    if payload is None:
        # np.asarray(None, dtype=float32) would silently become a single NaN
        # sample and be reported as real audio.
        raise ValueError("model output contained no audio")
    if hasattr(payload, "cpu"):
        payload = payload.cpu().numpy()
    samples = np.asarray(payload, dtype=np.float32).reshape(-1)
    if samples.size == 0:
        raise ValueError("model output contained no audio samples")
    # int() keeps the rate JSON-serializable even if it arrived as a
    # numpy/tensor scalar.
    return samples, int(rate)


@spaces.GPU(duration=60)
def qwen3_tts_stream(text: str, voice_ref=None) -> tuple[str, str]:
    """Synthesize *text* to a wav file under ROOT and report how it went.

    Args:
        text: Text to speak. None is treated as the empty string.
        voice_ref: Optional reference clip, forwarded to the model as
            ``ref_audio``. Any falsy value disables voice cloning.

    Returns:
        ``(wav_path, meta_json)``. If anything in the synthesis path raises, a
        short 185 Hz placeholder tone is written instead, so a valid file path
        is still returned, and the metadata "status" carries the error.
    """
    text = text or ""
    started = time.time()
    # The reference clip is part of the file key so that cloned and uncloned
    # renders of the same text do not overwrite each other. Without a clip
    # the key is the text alone.
    cache_key = f"{text}\x00{voice_ref}" if voice_ref else text
    digest = hashlib.sha256(cache_key.encode("utf-8")).hexdigest()[:10]
    out = ROOT / f"qwen3-{digest}.wav"
    err = None
    sample_rate = 24000
    used_voice_clone = bool(voice_ref)
    try:
        model = _load_qwen_tts()
        # NOTE(review): confirm that `generate(text=..., ref_audio=...)` is the
        # entry point exposed by the installed qwen-tts package; a wrong call
        # silently ends up in the placeholder-tone fallback below.
        if voice_ref:
            result = model.generate(text=text, ref_audio=voice_ref)
        else:
            result = model.generate(text=text)
        samples, sample_rate = _extract_audio(result, sample_rate)
        sf.write(out, samples, sample_rate)
    except Exception as exc:
        err = f"{type(exc).__name__}: {exc}"
        # Fall back to a brief placeholder so the Gradio contract still
        # returns a valid file path. Reset the rate so the metadata describes
        # the file that is actually written.
        sample_rate = 24000
        seconds = min(4.0, max(1.2, len(text) / 28))
        t = np.linspace(0, seconds, int(sample_rate * seconds), endpoint=False)
        envelope = np.minimum(1, t * 8) * np.minimum(1, (seconds - t) * 8)
        audio = 0.13 * np.sin(2 * np.pi * 185 * t) * envelope
        sf.write(out, audio, sample_rate)
    meta = {
        "adapter": "qwen3_tts_stream",
        "target_model": STACK["tts_target"],
        "status": "real" if err is None else f"fallback_tone (err: {err})",
        "voice_cloned": used_voice_clone,
        "sample_rate": sample_rate,
        "wall_ms": int((time.time() - started) * 1000),
    }
    return str(out), json.dumps(meta, indent=2)
def liveavatar_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Substrate-reject LiveAvatar. Real model needs 5x H800.

    The three arguments exist only to match the UI wiring and are ignored.
    Intentionally not wrapped in @spaces.GPU: the function returns a static
    verdict and does no GPU work, so it should not request a GPU allocation.

    Returns:
        ``(None, verdict_json)``: no video, plus a JSON document explaining
        the rejection and listing alternatives.
    """
    verdict = {
        "adapter": "liveavatar_render",
        "target_model": STACK["avatar_target"],
        "status": "substrate-rejected",
        "reason": "Quark-Vision/Live-Avatar README: 14B-param diffusion model · 20fps requires 5×H800 GPUs · 80GB VRAM minimum. zero-a10g (24GB) cannot honestly run it.",
        "alternatives_for_a10g": [
            "Sonic (Tencent) — single-GPU near-realtime audio→head",
            "Hallo / Hallo3 (fudan-generative-ai) — single-GPU, ~5–10s per second of video (NOT realtime)",
            "Real3DPortrait (CMU) — fast, lower fidelity",
        ],
        "owner_decision_needed": "pick alternative or accept Phase 7 async path with bigger GPU",
    }
    return None, json.dumps(verdict, indent=2)
def avatarforcing_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Report that the AvatarForcing adapter is waiting for public weights.

    The three arguments exist only to match the UI wiring and are ignored.
    Intentionally not wrapped in @spaces.GPU: the function returns a static
    status and does no GPU work, so it should not request a GPU allocation.

    Returns:
        ``(None, status_json)``: no video, plus the adapter status as JSON.
    """
    status = {
        "adapter": "avatarforcing_render",
        "target_model": "AvatarForcing",
        "status": "waiting-for-public-weights",
    }
    return None, json.dumps(status, indent=2)
def _motion_video(reference_image, audio_path: str, text: str) -> tuple[str | None, dict]:
    """Audio + still-image motion-presence fallback (no lipsync).

    Composes one 720x1280 frame (reference image or a dark blank, with a title
    bar and a caption taken from the first 74 characters of *text*) and has
    ffmpeg loop it over the audio track.

    Args:
        reference_image: Path of the image to show, or a falsy value for a
            blank background.
        audio_path: Path of the audio file to mux in.
        text: Caption source; also part of the output file name hash.

    Returns:
        ``(mp4_path, meta)`` on success, ``(None, meta)`` with
        ``meta["status"] == "failed"`` on any error. Never raises.
    """
    still = None
    try:
        from PIL import Image, ImageDraw, ImageEnhance

        if reference_image:
            # The context manager releases the file handle as soon as the
            # pixels have been copied into the converted image.
            with Image.open(reference_image) as src:
                img = src.convert("RGB")
        else:
            img = Image.new("RGB", (720, 1280), "#03050b")
        img.thumbnail((720, 1280))
        canvas = Image.new("RGB", (720, 1280), "#03050b")
        # Centered horizontally, pinned to the top edge.
        canvas.paste(img, ((720 - img.width) // 2, 0))
        draw = ImageDraw.Draw(canvas)
        draw.rectangle((0, 0, 720, 130), fill=(3, 5, 11))
        draw.text((36, 42), "NOVA STREAMING BEING", fill=(220, 255, 232))
        draw.rectangle((0, 1120, 720, 1280), fill=(3, 5, 11))
        draw.text((36, 1160), text[:74], fill=(247, 251, 255))
        # The timestamp makes the frame name unique per call.
        frame_key = hashlib.sha256((text + str(time.time())).encode()).hexdigest()[:10]
        still = ROOT / f"frame-{frame_key}.jpg"
        ImageEnhance.Contrast(canvas).enhance(1.08).save(still, quality=92)
        out = ROOT / f"frontier-{hashlib.sha256((text + audio_path).encode()).hexdigest()[:10]}.mp4"
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-loop", "1", "-framerate", "30", "-i", str(still),
                "-i", audio_path,
                "-vf", "scale=720:1280,format=yuv420p",
                "-c:v", "libx264", "-preset", "veryfast", "-tune", "stillimage",
                "-c:a", "aac", "-b:a", "128k", "-shortest", str(out),
            ],
            check=True,
            # Captured so a failure can report what ffmpeg actually said.
            capture_output=True,
            text=True,
        )
        return str(out), {"engine": "motion-presence", "frame_ms": 33.33, "status": "generated"}
    except Exception as exc:
        error = f"{type(exc).__name__}: {exc}"
        # CalledProcessError carries ffmpeg's stderr; other exceptions do not.
        stderr = getattr(exc, "stderr", None)
        if stderr:
            error = f"{error} | ffmpeg: {stderr.strip()}"
        return None, {"engine": "motion-presence", "status": "failed", "error": error}
    finally:
        # The jpg is only an ffmpeg input; remove it so frames do not pile up.
        if still is not None:
            still.unlink(missing_ok=True)
@spaces.GPU(duration=90)
def frontier_turn(text: str, reference_image=None) -> tuple[str, str | None, str]:
    """Run one full turn: synthesize speech, then wrap it in a preview video.

    Args:
        text: Text to speak and to caption the video with.
        reference_image: Optional image passed through to the video step.

    Returns:
        ``(audio_path, video_path_or_None, report_json)`` where the report
        nests the TTS metadata, the video metadata and a fixed lipsync verdict.
    """
    t0 = time.time()

    speech_path, speech_meta_json = qwen3_tts_stream(text)
    clip_path, clip_meta = _motion_video(reference_image, speech_path, text)

    # This path never performs lipsync, so the verdict is constant.
    verdict = {
        "can_claim_realtime_lipsync": False,
        "reason": "see liveavatar_render adapter for substrate verdict",
    }
    report = {
        "adapter": "frontier_turn",
        "tts": json.loads(speech_meta_json),
        "video": clip_meta,
        "liveavatar_verdict": verdict,
    }
    # Added last so the elapsed time covers the work above and the key keeps
    # its trailing position in the JSON output.
    report["wall_ms"] = int((time.time() - t0) * 1000)
    return speech_path, clip_path, json.dumps(report, indent=2)
def public_gpu_status() -> str:
    """Return the STACK description serialized as 2-space-indented JSON."""
    rendered = json.dumps(STACK, indent=2)
    return rendered
# Gradio UI: one tab per adapter. Inputs and outputs are wired positionally to
# the handler signatures defined in this module.
with gr.Blocks(title="Nova Streaming Being") as demo:
    gr.Markdown("# Nova Streaming Being · ZeroGPU")
    gr.Markdown("Backs the `Talk to Nova` CVI loop on `cloud.novaregistrar.com/realestate`.")
    gr.Code(value=public_gpu_status(), label="Runtime substrate", language="json")

    # Text (+ optional image) -> speech + still-frame video + combined report.
    with gr.Tab("Frontier Turn"):
        ft = gr.Textbox(label="Nova text", value="Eric, I am online on the GPU renderer path.")
        fref = gr.Image(label="Reference image", type="filepath")
        faudio = gr.Audio(label="Qwen3-TTS speech", type="filepath")
        fvideo = gr.Video(label="Avatar response (motion-presence)")
        fmeta = gr.Code(label="Renderer status", language="json")
        gr.Button("Run frontier turn").click(frontier_turn, [ft, fref], [faudio, fvideo, fmeta])

    # Text (+ optional voice reference) -> speech + adapter metadata.
    with gr.Tab("Speech (Qwen3-TTS · real)"):
        t = gr.Textbox(label="Nova text", value="Eric, I am online and ready to run the business loop.")
        voice = gr.Audio(label="Optional voice reference (3s clip)", type="filepath")
        audio = gr.Audio(label="Speech", type="filepath")
        meta = gr.Code(label="Adapter status", language="json")
        gr.Button("Render speech").click(qwen3_tts_stream, [t, voice], [audio, meta])

    # Avatar adapters: both currently return a status document and no video.
    with gr.Tab("Avatar (substrate-honest)"):
        # type="filepath" matches the Frontier tab's image input, so the
        # handlers receive a path rather than a decoded pixel array.
        ref = gr.Image(label="Reference image", type="filepath")
        aud = gr.Audio(label="Speech audio", type="filepath")
        prompt = gr.Textbox(label="Scene prompt", value="Full-size redheaded CEO operator speaking directly to Eric.")
        video = gr.Video(label="Avatar stream segment")
        vmeta = gr.Code(label="Adapter status", language="json")
        gr.Button("Render LiveAvatar segment").click(liveavatar_render, [ref, aud, prompt], [video, vmeta])
        gr.Button("Render AvatarForcing segment").click(avatarforcing_render, [ref, aud, prompt], [video, vmeta])

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=2).launch(ssr_mode=False)