"""Nova Streaming Being · ZeroGPU Space
Real Qwen3-TTS-12Hz-0.6B-Base inference + honest avatar substrate verdict.
Phase 2 of NOVA mission · 2026-05-14:
- Replaced _tone() placeholder with real Qwen3-TTS inference (fits zero-a10g).
- Live-Avatar (Quark-Vision) requires 5×H800 / 80GB VRAM per its README; refuses
cleanly on this substrate.
- frontier_turn() still produces an audio+motion preview using real TTS audio.
"""
from __future__ import annotations
import hashlib
import json
import subprocess
import time
from pathlib import Path
import gradio as gr
import numpy as np
import soundfile as sf
try:
    import spaces
except Exception:
    # Deliberate best-effort: when the `spaces` package cannot be imported
    # (e.g. running outside a ZeroGPU Space), substitute a stand-in whose GPU
    # decorator does nothing, so the rest of the module still loads.
    class _Spaces:
        """No-op replacement for the `spaces` module; exposes only `GPU`."""

        def GPU(self, *args, **kwargs):
            """Return the decorated function unchanged.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=60)``). All keyword
            arguments are accepted and ignored.
            """
            # Bare usage passes the function itself as the first positional
            # argument; hand it straight back instead of returning the
            # inner decorator in its place.
            if args and callable(args[0]):
                return args[0]

            def deco(fn):
                return fn

            return deco

    spaces = _Spaces()
# Scratch directory that the render functions in this file write their
# output files into; created eagerly at import time.
ROOT = Path("/tmp") / "nova-streaming-being"
ROOT.mkdir(parents=True, exist_ok=True)

# Static description of the model stack and hardware verdict. Serialized
# verbatim by public_gpu_status() and referenced by the adapter metadata.
STACK = dict(
    tts_target="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
    tts_status="real-inference-on-zero-a10g",
    avatar_target="Quark-Vision/Live-Avatar",
    avatar_status="substrate-rejected · 5xH800 req per upstream README",
    avatar_proposed_alt="Sonic / Hallo / Real3DPortrait — pending owner pick",
    substrate="HF Space hardware: zero-a10g (24GB)",
)
# ─────────────────────────────────────────────────────────────────────────────
# Qwen3-TTS lazy load (cold-load on first call · GPU only via @spaces.GPU)
# ─────────────────────────────────────────────────────────────────────────────
# Process-wide cache for the loaded TTS model; filled on first use.
_TTS_MODEL = None


def _load_qwen_tts():
    """Return the cached Qwen3-TTS model, loading it on the first call.

    `torch` and `qwen_tts` are imported inside the function rather than at
    module level so that importing this module does not require them.

    Raises:
        RuntimeError: if the `qwen_tts` package cannot be imported.
    """
    global _TTS_MODEL
    if _TTS_MODEL is None:
        import torch

        try:
            from qwen_tts import Qwen3TTSModel
        except ImportError as exc:
            raise RuntimeError(
                f"qwen-tts package missing: {exc}. Add 'qwen-tts' to requirements.txt."
            ) from exc

        print("[qwen_tts] loading Qwen/Qwen3-TTS-12Hz-0.6B-Base on cuda...", flush=True)
        load_options = {
            "device_map": "cuda:0",
            "dtype": torch.bfloat16,
            "attn_implementation": "sdpa",
        }
        _TTS_MODEL = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-0.6B-Base", **load_options
        )
        print("[qwen_tts] ready", flush=True)
    return _TTS_MODEL
def _unpack_tts_output(result, default_rate: int):
    """Normalize a raw TTS result into ``(flat float32 samples, sample_rate)``.

    Accepts a bare array-like, a tensor-like object exposing ``.cpu()``, or a
    dict carrying the audio under ``"audio"``, ``"waveform"`` or ``"samples"``
    (optionally with ``"sample_rate"``).

    Raises:
        ValueError: if no audio payload can be found in the result.
    """
    rate = default_rate
    # NOTE(review): an (audio, sample_rate) tuple is accepted defensively;
    # confirm the actual return shape against the qwen-tts API.
    if (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[1], (int, np.integer))
    ):
        result, rate = result[0], int(result[1])
    if isinstance(result, dict):
        rate = result.get("sample_rate", rate)
        payload = None
        # Compare against None explicitly: chaining the lookups with `or`
        # evaluates the truth value of an array/tensor, which raises for
        # anything holding more than one element.
        for key in ("audio", "waveform", "samples"):
            payload = result.get(key)
            if payload is not None:
                break
        result = payload
    if result is None:
        # Without this guard np.asarray(None) would yield a single NaN sample
        # that gets written out and reported as a successful render.
        raise ValueError("TTS model returned no audio payload")
    if hasattr(result, "cpu"):
        result = result.cpu().numpy()
    return np.asarray(result, dtype=np.float32).reshape(-1), rate


@spaces.GPU(duration=60)
def qwen3_tts_stream(text: str, voice_ref=None) -> tuple[str, str]:
    """Synthesize `text` to a WAV file under ROOT and report how it went.

    Args:
        text: The text to speak.
        voice_ref: Optional reference audio; when truthy it is forwarded to
            the model as ``ref_audio``.

    Returns:
        ``(wav_path, status_json)``. On any synthesis failure a short sine
        tone is written instead so a valid file path is still returned, and
        the status string records the error.
    """
    started = time.time()
    # Text-only calls keep the text-derived name; a voice reference is mixed
    # into the digest so cloned and uncloned renders of the same text do not
    # overwrite each other.
    hasher = hashlib.sha256(text.encode("utf-8"))
    if voice_ref:
        hasher.update(b"\0" + str(voice_ref).encode("utf-8"))
    out = ROOT / f"qwen3-{hasher.hexdigest()[:10]}.wav"

    default_rate = 24000
    sample_rate = default_rate
    used_voice_clone = bool(voice_ref)
    err = None
    try:
        model = _load_qwen_tts()
        if voice_ref:
            raw = model.generate(text=text, ref_audio=voice_ref)
        else:
            raw = model.generate(text=text)
        samples, sample_rate = _unpack_tts_output(raw, default_rate)
        sf.write(out, samples, sample_rate)
    except Exception as exc:
        err = f"{type(exc).__name__}: {exc}"
        # Fall back to a brief placeholder tone so the Gradio contract still
        # returns a valid file path. Reset the reported rate so the metadata
        # matches the file that is actually written.
        sample_rate = default_rate
        seconds = min(4.0, max(1.2, len(text) / 28))
        t = np.linspace(0, seconds, int(sample_rate * seconds), endpoint=False)
        # Linear ramps at both ends of the tone (fade in, fade out).
        envelope = np.minimum(1, t * 8) * np.minimum(1, (seconds - t) * 8)
        tone = 0.13 * np.sin(2 * np.pi * 185 * t) * envelope
        sf.write(out, tone, sample_rate)

    meta = {
        "adapter": "qwen3_tts_stream",
        "target_model": STACK["tts_target"],
        "status": "real" if err is None else f"fallback_tone (err: {err})",
        "voice_cloned": used_voice_clone,
        "sample_rate": sample_rate,
        "wall_ms": int((time.time() - started) * 1000),
    }
    return str(out), json.dumps(meta, indent=2)
def liveavatar_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Substrate-reject LiveAvatar. Real model needs 5x H800.

    All three arguments are accepted for interface compatibility and ignored.
    The function only returns a static verdict, so it is intentionally not
    wrapped in ``@spaces.GPU``: no GPU allocation is needed to produce it.

    Returns:
        ``(None, status_json)``: no video, plus a JSON string explaining the
        rejection and listing alternatives.
    """
    verdict = {
        "adapter": "liveavatar_render",
        "target_model": STACK["avatar_target"],
        "status": "substrate-rejected",
        "reason": "Quark-Vision/Live-Avatar README: 14B-param diffusion model · 20fps requires 5×H800 GPUs · 80GB VRAM minimum. zero-a10g (24GB) cannot honestly run it.",
        "alternatives_for_a10g": [
            "Sonic (Tencent) — single-GPU near-realtime audio→head",
            "Hallo / Hallo3 (fudan-generative-ai) — single-GPU, ~5–10s per second of video (NOT realtime)",
            "Real3DPortrait (CMU) — fast, lower fidelity",
        ],
        "owner_decision_needed": "pick alternative or accept Phase 7 async path with bigger GPU",
    }
    return None, json.dumps(verdict, indent=2)
def avatarforcing_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Report that the AvatarForcing adapter has nothing to run yet.

    All three arguments are accepted for interface compatibility and ignored.
    The function only returns a static status, so it is intentionally not
    wrapped in ``@spaces.GPU``: no GPU allocation is needed to produce it.

    Returns:
        ``(None, status_json)``: no video, plus a JSON status string.
    """
    status = {
        "adapter": "avatarforcing_render",
        "target_model": "AvatarForcing",
        "status": "waiting-for-public-weights",
    }
    return None, json.dumps(status, indent=2)
def _motion_video(reference_image, audio_path: str, text: str) -> tuple[str | None, dict]:
    """Audio + still-image motion-presence fallback (no lipsync).

    Composes a single 720x1280 frame (reference image or a blank background,
    with header and caption bars drawn over it) and muxes it with the audio
    into an MP4 via ffmpeg.

    Args:
        reference_image: Path to an image file, or a falsy value for a blank
            background.
        audio_path: Path to the audio file to mux in.
        text: Caption source; the first 74 characters are drawn on the frame.

    Returns:
        ``(mp4_path, meta)`` on success, or ``(None, meta)`` with
        ``meta["status"] == "failed"`` if anything raised.
    """
    still = None
    try:
        from PIL import Image, ImageDraw, ImageEnhance

        if reference_image:
            # Close the source file promptly; convert() returns an
            # independent in-memory copy.
            with Image.open(reference_image) as src:
                img = src.convert("RGB")
        else:
            img = Image.new("RGB", (720, 1280), "#03050b")
        img.thumbnail((720, 1280))

        canvas = Image.new("RGB", (720, 1280), "#03050b")
        canvas.paste(img, ((720 - img.width) // 2, 0))
        draw = ImageDraw.Draw(canvas)
        draw.rectangle((0, 0, 720, 130), fill=(3, 5, 11))
        draw.text((36, 42), "NOVA STREAMING BEING", fill=(220, 255, 232))
        draw.rectangle((0, 1120, 720, 1280), fill=(3, 5, 11))
        draw.text((36, 1160), text[:74], fill=(247, 251, 255))

        # The timestamp makes the frame name unique per call.
        frame_id = hashlib.sha256((text + str(time.time())).encode()).hexdigest()[:10]
        still = ROOT / f"frame-{frame_id}.jpg"
        ImageEnhance.Contrast(canvas).enhance(1.08).save(still, quality=92)

        clip_id = hashlib.sha256((text + audio_path).encode()).hexdigest()[:10]
        out = ROOT / f"frontier-{clip_id}.mp4"
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-loop", "1", "-framerate", "30", "-i", str(still),
                "-i", audio_path,
                "-vf", "scale=720:1280,format=yuv420p",
                "-c:v", "libx264", "-preset", "veryfast", "-tune", "stillimage",
                "-c:a", "aac", "-b:a", "128k", "-shortest", str(out),
            ],
            check=True,
        )
        return str(out), {"engine": "motion-presence", "frame_ms": 33.33, "status": "generated"}
    except Exception as exc:
        return None, {"engine": "motion-presence", "status": "failed", "error": f"{type(exc).__name__}: {exc}"}
    finally:
        # The JPEG is only an intermediate input for ffmpeg. Each call uses a
        # fresh name, so without removal the frames accumulate under ROOT.
        if still is not None:
            try:
                still.unlink(missing_ok=True)
            except OSError:
                # Cleanup is best-effort and must not mask the result.
                pass
# NOTE(review): qwen3_tts_stream is itself wrapped in @spaces.GPU; confirm
# that calling it from inside another GPU-decorated function behaves as
# intended on ZeroGPU.
@spaces.GPU(duration=90)
def frontier_turn(text: str, reference_image=None) -> tuple[str, str | None, str]:
    """Run one full turn: synthesize speech, then wrap it in a preview video.

    Args:
        text: The text to speak and caption.
        reference_image: Optional image path forwarded to `_motion_video`.

    Returns:
        ``(audio_path, video_path_or_None, status_json)`` where the status
        JSON nests the TTS metadata and the video metadata.
    """
    t0 = time.time()

    speech_path, speech_status = qwen3_tts_stream(text)
    clip_path, clip_status = _motion_video(reference_image, speech_path, text)

    verdict = {
        "can_claim_realtime_lipsync": False,
        "reason": "see liveavatar_render adapter for substrate verdict",
    }
    report = {
        "adapter": "frontier_turn",
        "tts": json.loads(speech_status),
        "video": clip_status,
        "liveavatar_verdict": verdict,
    }
    # Stamped last so it covers both the TTS and the video step.
    report["wall_ms"] = int((time.time() - t0) * 1000)

    return speech_path, clip_path, json.dumps(report, indent=2)
def public_gpu_status() -> str:
    """Return the module-level STACK mapping as a 2-space-indented JSON string."""
    snapshot = json.dumps(STACK, indent=2)
    return snapshot
# Gradio UI: a header with the static substrate report, then one tab per
# adapter. Each button wires one of the functions above to its widgets.
with gr.Blocks(title="Nova Streaming Being") as demo:
    gr.Markdown("# Nova Streaming Being · ZeroGPU")
    gr.Markdown("Backs the `Talk to Nova` CVI loop on `cloud.novaregistrar.com/realestate`.")
    gr.Code(value=public_gpu_status(), label="Runtime substrate", language="json")

    # Full turn: text (+ optional image) -> speech audio + preview video.
    with gr.Tab("Frontier Turn"):
        ft = gr.Textbox(label="Nova text", value="Eric, I am online on the GPU renderer path.")
        fref = gr.Image(label="Reference image", type="filepath")
        faudio = gr.Audio(label="Qwen3-TTS speech", type="filepath")
        fvideo = gr.Video(label="Avatar response (motion-presence)")
        fmeta = gr.Code(label="Renderer status", language="json")
        frontier_button = gr.Button("Run frontier turn")
        frontier_button.click(
            fn=frontier_turn,
            inputs=[ft, fref],
            outputs=[faudio, fvideo, fmeta],
        )

    # Speech only: text (+ optional voice reference) -> audio.
    with gr.Tab("Speech (Qwen3-TTS · real)"):
        t = gr.Textbox(label="Nova text", value="Eric, I am online and ready to run the business loop.")
        voice = gr.Audio(label="Optional voice reference (3s clip)", type="filepath")
        audio = gr.Audio(label="Speech", type="filepath")
        meta = gr.Code(label="Adapter status", language="json")
        speech_button = gr.Button("Render speech")
        speech_button.click(
            fn=qwen3_tts_stream,
            inputs=[t, voice],
            outputs=[audio, meta],
        )

    # Avatar adapters: both buttons share the same inputs and outputs.
    with gr.Tab("Avatar (substrate-honest)"):
        ref = gr.Image(label="Reference image")
        aud = gr.Audio(label="Speech audio", type="filepath")
        prompt = gr.Textbox(label="Scene prompt", value="Full-size redheaded CEO operator speaking directly to Eric.")
        video = gr.Video(label="Avatar stream segment")
        vmeta = gr.Code(label="Adapter status", language="json")
        avatar_inputs = [ref, aud, prompt]
        avatar_outputs = [video, vmeta]
        liveavatar_button = gr.Button("Render LiveAvatar segment")
        liveavatar_button.click(
            fn=liveavatar_render,
            inputs=avatar_inputs,
            outputs=avatar_outputs,
        )
        avatarforcing_button = gr.Button("Render AvatarForcing segment")
        avatarforcing_button.click(
            fn=avatarforcing_render,
            inputs=avatar_inputs,
            outputs=avatar_outputs,
        )

if __name__ == "__main__":
    queued_app = demo.queue(default_concurrency_limit=2)
    queued_app.launch(ssr_mode=False)
|