File size: 10,242 Bytes
31a69f7
 
 
 
 
 
 
 
 
bb74874
 
 
 
103c2ff
bb74874
 
 
 
 
 
 
59c2903
 
 
 
 
 
 
 
 
bb74874
 
 
 
 
 
31a69f7
bb74874
31a69f7
 
 
bb74874
 
31a69f7
 
 
 
bb74874
 
31a69f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb74874
31a69f7
 
bb74874
 
 
31a69f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb74874
 
 
31a69f7
 
 
 
bb74874
31a69f7
bb74874
 
31a69f7
103c2ff
31a69f7
 
bb74874
 
31a69f7
 
 
 
 
 
 
 
 
bb74874
 
59c2903
103c2ff
31a69f7
bb74874
31a69f7
103c2ff
31a69f7
bb74874
 
103c2ff
31a69f7
103c2ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb74874
 
31a69f7
103c2ff
77d738b
103c2ff
 
77d738b
 
103c2ff
77d738b
31a69f7
 
 
77d738b
 
 
 
 
 
103c2ff
31a69f7
103c2ff
 
bb74874
31a69f7
 
bb74874
77d738b
 
 
31a69f7
 
 
77d738b
31a69f7
bb74874
31a69f7
 
bb74874
 
31a69f7
bb74874
 
 
 
 
 
 
 
 
 
e6cd774
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""Nova Streaming Being · ZeroGPU Space
Real Qwen3-TTS-12Hz-0.6B-Base inference + honest avatar substrate verdict.

Phase 2 of NOVA mission · 2026-05-14:
- Replaced _tone() placeholder with real Qwen3-TTS inference (fits zero-a10g).
- Live-Avatar (Quark-Vision) requires 5×H800 / 80GB VRAM per its README; refuses
  cleanly on this substrate.
- frontier_turn() still produces an audio+motion preview using real TTS audio.
"""
from __future__ import annotations

import hashlib
import json
import subprocess
import time
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf

try:
    import spaces
except Exception:
    class _Spaces:
        """No-op stand-in for the ``spaces`` package when it cannot be imported.

        Lets the module run where the ZeroGPU runtime is unavailable: the
        ``GPU`` decorator simply hands back the function it is given.
        """

        def GPU(self, *args, **kwargs):
            """Mimic ``spaces.GPU`` in both its bare and parameterised forms.

            ``@spaces.GPU`` passes the function as the only positional
            argument; ``@spaces.GPU(duration=60)`` must return a decorator.
            """
            # Bare usage: the decorated function arrives directly.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def deco(fn):
                return fn
            return deco
    spaces = _Spaces()

# Scratch directory for every generated wav / jpg / mp4 artifact.
ROOT = Path("/tmp") / "nova-streaming-being"
ROOT.mkdir(exist_ok=True, parents=True)

# Static description of the model stack and hardware, shown in the UI and
# referenced by the adapters' status payloads. Key order is preserved because
# the dict is serialized to JSON as-is.
STACK = dict(
    tts_target="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
    tts_status="real-inference-on-zero-a10g",
    avatar_target="Quark-Vision/Live-Avatar",
    avatar_status="substrate-rejected · 5xH800 req per upstream README",
    avatar_proposed_alt="Sonic / Hallo / Real3DPortrait — pending owner pick",
    substrate="HF Space hardware: zero-a10g (24GB)",
)

# ─────────────────────────────────────────────────────────────────────────────
# Qwen3-TTS lazy load (cold-load on first call · GPU only via @spaces.GPU)
# ─────────────────────────────────────────────────────────────────────────────
# Process-wide cache for the loaded TTS model; filled on first use.
_TTS_MODEL = None


def _load_qwen_tts():
    """Return the cached Qwen3-TTS model, loading it on the first call.

    ``torch`` and ``qwen_tts`` are imported here rather than at module level
    so the Space can boot on CPU before the first GPU call.

    Raises:
        RuntimeError: if the ``qwen_tts`` package cannot be imported.
    """
    global _TTS_MODEL
    if _TTS_MODEL is None:
        import torch
        try:
            from qwen_tts import Qwen3TTSModel
        except ImportError as exc:
            raise RuntimeError(
                f"qwen-tts package missing: {exc}. Add 'qwen-tts' to requirements.txt."
            ) from exc

        print("[qwen_tts] loading Qwen/Qwen3-TTS-12Hz-0.6B-Base on cuda...", flush=True)
        load_options = {
            "device_map": "cuda:0",
            "dtype": torch.bfloat16,
            "attn_implementation": "sdpa",
        }
        _TTS_MODEL = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
            **load_options,
        )
        print("[qwen_tts] ready", flush=True)
    return _TTS_MODEL


def _first_present(mapping, keys):
    """Return the first value stored under *keys* in *mapping* that is not None.

    An explicit ``is not None`` test is used instead of chaining ``or``:
    truth-testing a multi-element array or tensor raises ``ValueError``.
    """
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


@spaces.GPU(duration=60)
def qwen3_tts_stream(text: str, voice_ref=None) -> tuple[str, str]:
    """Synthesize *text* to a WAV file and report how it went.

    Args:
        text: Text to speak. Its SHA-256 prefix names the output file.
        voice_ref: Optional reference audio; when truthy it is passed to the
            model as ``ref_audio``.

    Returns:
        ``(wav_path, meta_json)``. ``meta_json`` is an indented JSON string
        with the adapter name, target model, status, whether a voice
        reference was used, the sample rate of the written file, and wall
        time in milliseconds.

    Any exception during inference is caught: a short 185 Hz tone is written
    instead so a valid file path is still returned, and the error is recorded
    in the ``status`` field.
    """
    started = time.time()
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:10]
    out = ROOT / f"qwen3-{digest}.wav"
    err = None
    sample_rate = 24000
    used_voice_clone = bool(voice_ref)
    try:
        model = _load_qwen_tts()
        # NOTE(review): confirm ``generate`` and its keyword names against the
        # installed qwen-tts release; a mismatch lands in the fallback below.
        if voice_ref:
            audio_arr = model.generate(text=text, ref_audio=voice_ref)
        else:
            audio_arr = model.generate(text=text)
        if isinstance(audio_arr, dict):
            sample_rate = audio_arr.get("sample_rate", sample_rate)
            audio_arr = _first_present(audio_arr, ("audio", "waveform", "samples"))
        if audio_arr is None:
            raise ValueError("TTS model returned no audio payload")
        if hasattr(audio_arr, "cpu"):
            # The model is loaded in bfloat16, which NumPy cannot represent;
            # upcast before leaving torch.
            if hasattr(audio_arr, "float"):
                audio_arr = audio_arr.float()
            audio_arr = audio_arr.cpu().numpy()
        audio_arr = np.asarray(audio_arr, dtype=np.float32).reshape(-1)
        if audio_arr.size == 0:
            raise ValueError("TTS model returned empty audio")
        sf.write(out, audio_arr, sample_rate)
    except Exception as exc:
        err = f"{type(exc).__name__}: {exc}"
        # Fall back to a brief placeholder tone so the Gradio contract still
        # returns a valid file path. Reset the rate so the metadata describes
        # the file that was actually written.
        sample_rate = 24000
        seconds = min(4.0, max(1.2, len(text) / 28))
        t = np.linspace(0, seconds, int(sample_rate * seconds), endpoint=False)
        # Linear fade-in / fade-out to avoid clicks at the edges.
        envelope = np.minimum(1, t * 8) * np.minimum(1, (seconds - t) * 8)
        audio = 0.13 * np.sin(2 * np.pi * 185 * t) * envelope
        sf.write(out, audio, sample_rate)
    meta = {
        "adapter": "qwen3_tts_stream",
        "target_model": STACK["tts_target"],
        "status": "real" if err is None else f"fallback_tone (err: {err})",
        "voice_cloned": used_voice_clone,
        "sample_rate": sample_rate,
        "wall_ms": int((time.time() - started) * 1000),
    }
    return str(out), json.dumps(meta, indent=2)


def liveavatar_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Substrate-reject LiveAvatar. Real model needs 5x H800.

    All three arguments are accepted for interface parity with a real
    renderer but are ignored. No video is produced.

    Returns:
        ``(None, status_json)`` where ``status_json`` is an indented JSON
        string giving the rejection reason and single-GPU alternatives.

    This function is deliberately not wrapped in ``@spaces.GPU``: it returns
    a constant payload, so reserving a GPU slot would only add queueing and
    spend quota. Re-add the decorator when real inference lands here.
    """
    return None, json.dumps({
        "adapter": "liveavatar_render",
        "target_model": STACK["avatar_target"],
        "status": "substrate-rejected",
        "reason": "Quark-Vision/Live-Avatar README: 14B-param diffusion model · 20fps requires 5×H800 GPUs · 80GB VRAM minimum. zero-a10g (24GB) cannot honestly run it.",
        "alternatives_for_a10g": [
            "Sonic (Tencent) — single-GPU near-realtime audio→head",
            "Hallo / Hallo3 (fudan-generative-ai) — single-GPU, ~5–10s per second of video (NOT realtime)",
            "Real3DPortrait (CMU) — fast, lower fidelity",
        ],
        "owner_decision_needed": "pick alternative or accept Phase 7 async path with bigger GPU",
    }, indent=2)


def avatarforcing_render(reference_image, audio_file, prompt: str) -> tuple[None, str]:
    """Report that the AvatarForcing adapter is not available yet.

    All three arguments are accepted for interface parity with a real
    renderer but are ignored. No video is produced.

    Returns:
        ``(None, status_json)`` where ``status_json`` is an indented JSON
        string with status ``waiting-for-public-weights``.

    This function is deliberately not wrapped in ``@spaces.GPU``: it returns
    a constant payload, so reserving a GPU slot would only add queueing and
    spend quota. Re-add the decorator when real inference lands here.
    """
    return None, json.dumps({
        "adapter": "avatarforcing_render",
        "target_model": "AvatarForcing",
        "status": "waiting-for-public-weights",
    }, indent=2)


def _motion_video(reference_image, audio_path: str, text: str) -> tuple[str | None, dict]:
    """Audio + still-image motion-presence fallback (no lipsync).

    Composes a 720x1280 frame (the reference image, or a dark blank when none
    is given, with a title bar and the first 74 characters of *text* as a
    caption) and muxes it with *audio_path* into an H.264/AAC MP4 via ffmpeg.

    Args:
        reference_image: Path to an image file, or a falsy value for a blank.
        audio_path: Path to the audio track to mux in.
        text: Caption text; also part of the output file name.

    Returns:
        ``(mp4_path, meta)`` on success, ``(None, meta)`` on any failure.
        ``meta`` always carries ``engine`` and ``status``; failures add an
        ``error`` string. This function never raises.
    """
    still = None  # set once the frame path is known so ``finally`` can remove it
    try:
        from PIL import Image, ImageDraw, ImageEnhance

        if reference_image:
            with Image.open(reference_image) as source:
                img = source.convert("RGB")
        else:
            img = Image.new("RGB", (720, 1280), "#03050b")
        img.thumbnail((720, 1280))
        canvas = Image.new("RGB", (720, 1280), "#03050b")
        # Centre horizontally, pin to the top.
        canvas.paste(img, ((720 - img.width) // 2, 0))
        draw = ImageDraw.Draw(canvas)
        draw.rectangle((0, 0, 720, 130), fill=(3, 5, 11))
        draw.text((36, 42), "NOVA STREAMING BEING", fill=(220, 255, 232))
        draw.rectangle((0, 1120, 720, 1280), fill=(3, 5, 11))
        draw.text((36, 1160), text[:74], fill=(247, 251, 255))
        still = ROOT / f"frame-{hashlib.sha256((text + str(time.time())).encode()).hexdigest()[:10]}.jpg"
        ImageEnhance.Contrast(canvas).enhance(1.08).save(still, quality=92)
        # Include the reference image in the key: the audio path is derived
        # from the text alone, so without it two turns with the same text but
        # different images would write to the same mp4.
        out_key = f"{text}|{audio_path}|{reference_image}"
        out = ROOT / f"frontier-{hashlib.sha256(out_key.encode()).hexdigest()[:10]}.mp4"
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-loop", "1", "-framerate", "30", "-i", str(still),
                "-i", audio_path,
                "-vf", "scale=720:1280,format=yuv420p",
                "-c:v", "libx264", "-preset", "veryfast", "-tune", "stillimage",
                "-c:a", "aac", "-b:a", "128k", "-shortest", str(out),
            ],
            check=True,
        )
        return str(out), {"engine": "motion-presence", "frame_ms": 33.33, "status": "generated"}
    except Exception as exc:
        return None, {"engine": "motion-presence", "status": "failed", "error": f"{type(exc).__name__}: {exc}"}
    finally:
        # The JPEG is only an ffmpeg input; remove it so frames do not pile
        # up in ROOT. Cleanup is best-effort and must not mask the result.
        if still is not None:
            try:
                still.unlink(missing_ok=True)
            except OSError:
                pass


@spaces.GPU(duration=90)
def frontier_turn(text: str, reference_image=None) -> tuple[str, str | None, str]:
    """Run one full turn: synthesize speech, then wrap it in a still-frame video.

    Args:
        text: Text to speak and caption.
        reference_image: Optional image forwarded to the video step.

    Returns:
        ``(audio_path, video_path_or_None, meta_json)``. ``meta_json`` nests
        the TTS metadata and the video metadata, and always states that
        realtime lipsync cannot be claimed.
    """
    t0 = time.time()

    speech_path, speech_meta_json = qwen3_tts_stream(text)
    clip_path, clip_meta = _motion_video(reference_image, speech_path, text)

    lipsync_verdict = {
        "can_claim_realtime_lipsync": False,
        "reason": "see liveavatar_render adapter for substrate verdict",
    }
    report = {
        "adapter": "frontier_turn",
        "tts": json.loads(speech_meta_json),
        "video": clip_meta,
        "liveavatar_verdict": lipsync_verdict,
        "wall_ms": int((time.time() - t0) * 1000),
    }
    return speech_path, clip_path, json.dumps(report, indent=2)


def public_gpu_status() -> str:
    """Return the ``STACK`` description serialized as indented JSON."""
    snapshot = json.dumps(STACK, indent=2)
    return snapshot


with gr.Blocks(title="Nova Streaming Being") as demo:
    # Page header plus a static dump of the STACK description.
    gr.Markdown("# Nova Streaming Being · ZeroGPU")
    gr.Markdown("Backs the `Talk to Nova` CVI loop on `cloud.novaregistrar.com/realestate`.")
    gr.Code(value=public_gpu_status(), label="Runtime substrate", language="json")

    # Tab 1: text -> speech -> still-frame video in one click.
    with gr.Tab("Frontier Turn"):
        turn_text = gr.Textbox(label="Nova text", value="Eric, I am online on the GPU renderer path.")
        turn_image = gr.Image(label="Reference image", type="filepath")
        turn_audio = gr.Audio(label="Qwen3-TTS speech", type="filepath")
        turn_video = gr.Video(label="Avatar response (motion-presence)")
        turn_status = gr.Code(label="Renderer status", language="json")
        turn_button = gr.Button("Run frontier turn")
        turn_button.click(
            fn=frontier_turn,
            inputs=[turn_text, turn_image],
            outputs=[turn_audio, turn_video, turn_status],
        )

    # Tab 2: speech synthesis only, with an optional voice reference clip.
    with gr.Tab("Speech (Qwen3-TTS · real)"):
        speech_text = gr.Textbox(label="Nova text", value="Eric, I am online and ready to run the business loop.")
        speech_voice = gr.Audio(label="Optional voice reference (3s clip)", type="filepath")
        speech_audio = gr.Audio(label="Speech", type="filepath")
        speech_status = gr.Code(label="Adapter status", language="json")
        speech_button = gr.Button("Render speech")
        speech_button.click(
            fn=qwen3_tts_stream,
            inputs=[speech_text, speech_voice],
            outputs=[speech_audio, speech_status],
        )

    # Tab 3: avatar adapters; both buttons share the same inputs and outputs.
    with gr.Tab("Avatar (substrate-honest)"):
        avatar_image = gr.Image(label="Reference image")
        avatar_audio = gr.Audio(label="Speech audio", type="filepath")
        avatar_prompt = gr.Textbox(label="Scene prompt", value="Full-size redheaded CEO operator speaking directly to Eric.")
        avatar_video = gr.Video(label="Avatar stream segment")
        avatar_status = gr.Code(label="Adapter status", language="json")
        avatar_inputs = [avatar_image, avatar_audio, avatar_prompt]
        avatar_outputs = [avatar_video, avatar_status]
        liveavatar_button = gr.Button("Render LiveAvatar segment")
        liveavatar_button.click(fn=liveavatar_render, inputs=avatar_inputs, outputs=avatar_outputs)
        avatarforcing_button = gr.Button("Render AvatarForcing segment")
        avatarforcing_button.click(fn=avatarforcing_render, inputs=avatar_inputs, outputs=avatar_outputs)


if __name__ == "__main__":
    # Enable the request queue (two concurrent workers by default), then
    # launch with server-side rendering turned off.
    queued_app = demo.queue(default_concurrency_limit=2)
    queued_app.launch(ssr_mode=False)