# Source: Hugging Face Space broadfield-dev/qweb3-tts-cpu (status: Paused)
# File size: 22,397 Bytes · revision 72a6ef9

import gradio as gr
import torch
import numpy as np
import soundfile as sf
from pathlib import Path
import os
import warnings
import subprocess
import tempfile
import math

# Silence every UserWarning process-wide (applies to all imported libraries).
warnings.filterwarnings("ignore", category=UserWarning)

# ────────────────────────────────────────────────
# Optional import for TTS model (not required for video tab)
# The import is attempted eagerly at module load; on ImportError the app
# still starts and TTS_AVAILABLE records that the TTS backend is missing.
# ────────────────────────────────────────────────
try:
    from qwen_tts import Qwen3TTSModel
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False

# ────────────────────────────────────────────────
# Globals & Model Loader
# ────────────────────────────────────────────────

# Short model label -> repository id handed to Qwen3TTSModel.from_pretrained.
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base":        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base":        "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of loaded models keyed by "<model label>_<dtype string>".
# Entries are never evicted, so every loaded variant stays in memory.
loaded_models = {}

def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return the TTS model for ``model_key``, loading and caching it on first use.

    Args:
        model_key: A key of ``MODELS`` (e.g. ``"0.6B-Base"``).
        dtype_str: ``"float32"`` selects ``torch.float32``; any other value
            selects ``torch.float16``.
        progress: Gradio progress tracker used to report loading status.

    Returns:
        The loaded model. It is stored in ``loaded_models`` under
        ``"<model_key>_<dtype_str>"`` and reused by later calls.

    Raises:
        gr.Error: If ``qwen_tts`` is not installed, ``model_key`` is unknown,
            or loading the weights fails (the original exception is chained).
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")

    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]

    # Validate the label up front so a typo surfaces as a readable UI error
    # rather than a bare KeyError.
    try:
        repo_id = MODELS[model_key]
    except KeyError:
        known = ", ".join(MODELS)
        raise gr.Error(f"Unknown model '{model_key}'. Available: {known}") from None

    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # Both `dtype` and `torch_dtype` are passed, as in the original call;
        # presumably for compatibility across library versions — TODO confirm.
        model = Qwen3TTSModel.from_pretrained(
            repo_id, device_map="cpu", dtype=dtype,
            torch_dtype=dtype, low_cpu_mem_usage=True,
        )
    except Exception as e:
        # Chain the cause so the server log keeps the full traceback.
        raise gr.Error(
            f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant."
        ) from e

    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model


# ────────────────────────────────────────────────
# TTS Inference (unchanged)
# ────────────────────────────────────────────────

def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize ``text`` with a preset speaker and optional style instruction.

    Returns ``(wav_path, markdown_info)`` on success. If ``text`` is blank, or
    generation/writing raises, returns ``(None, message)`` instead. Errors
    raised by ``get_model`` are not caught here.
    """
    if text.strip() == "":
        return None, "Please enter some text."

    tts = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        # "Auto" means: let the model pick the language.
        language = None if lang == "Auto" else lang
        # A blank instruction is passed to the model as None.
        style_hint = instruct.strip()
        if not style_hint:
            style_hint = None
        audio, sample_rate = tts.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
            instruct=style_hint,
            max_new_tokens=1500,
        )
        # When a list comes back, only its first element is written out.
        if isinstance(audio, list):
            audio = audio[0]
        out_file = "/tmp/output_custom.wav"
        sf.write(out_file, audio, sample_rate)
        summary = f"**Generated with {model_key}**  \nlang: {lang}  \nspeaker: {speaker}  \ninstruct: {instruct or '(none)'}"
        return out_file, summary
    except Exception as exc:
        return None, f"**Error**: {str(exc)}"


def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize ``text`` in a voice described by the ``instruct`` prompt.

    Returns ``(wav_path, markdown_info)`` on success. If either ``text`` or
    ``instruct`` is blank, or generation/writing raises, returns
    ``(None, message)``. Errors raised by ``get_model`` are not caught here.
    """
    if not (text.strip() and instruct.strip()):
        return None, "Text and voice instruction required."

    tts = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        # "Auto" means: let the model pick the language.
        language = None if lang == "Auto" else lang
        audio, sample_rate = tts.generate_voice_design(
            text=text,
            language=language,
            instruct=instruct,
            max_new_tokens=1500,
        )
        # When a list comes back, only its first element is written out.
        if isinstance(audio, list):
            audio = audio[0]
        out_file = "/tmp/output_design.wav"
        sf.write(out_file, audio, sample_rate)
        summary = f"**Voice Design – {model_key}**  \nlang: {lang}  \ninstruct: {instruct}"
        return out_file, summary
    except Exception as exc:
        return None, f"**Error**: {str(exc)}"


def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Synthesize ``text`` in the voice of a reference audio clip.

    Returns ``(wav_path, markdown_info)`` on success. If ``text`` is blank,
    no reference audio is given, or generation/writing raises, returns
    ``(None, message)``. Errors raised by ``get_model`` are not caught here.
    """
    if text.strip() == "":
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."

    tts = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        # "Auto" means: let the model pick the language.
        language = None if lang == "Auto" else lang
        # A blank transcript is passed to the model as None.
        transcript = ref_text.strip()
        if not transcript:
            transcript = None
        audio, sample_rate = tts.generate_voice_clone(
            text=text,
            language=language,
            ref_audio=ref_audio,
            ref_text=transcript,
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,
        )
        # When a list comes back, only its first element is written out.
        if isinstance(audio, list):
            audio = audio[0]
        out_file = "/tmp/output_clone.wav"
        sf.write(out_file, audio, sample_rate)
        summary = f"**Voice Clone – {model_key}**  \nlang: {lang}  \nx-vector-only: {x_vector_only}"
        return out_file, summary
    except Exception as exc:
        return None, f"**Error**: {str(exc)}"


# ────────────────────────────────────────────────
# Video Generation Helpers
# ────────────────────────────────────────────────

# UI label -> (width, height) in pixels for the rendered video.
RESOLUTIONS = {
    "1080×1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080×1080 (Instagram Square)":  (1080, 1080),
    "1920×1080 (YouTube Landscape)": (1920, 1080),
    "1280×720 (YouTube 720p)":       (1280, 720),
}

# Style labels offered in the UI. Each label must match a key of RENDERERS,
# because audio_to_video looks the renderer up by this exact string.
VISUAL_STYLES = [
    "🎙 Solid + Waveform",
    "🌊 Animated Spectrum Bars",
    "⚡ Oscilloscope Line",
    "🌈 Gradient Pulse",
    "🔲 Minimal Dark + Title",
]


def hex_to_rgb(h: str):
    """Convert a CSS-style colour string to an ``(r, g, b)`` tuple of ints (0–255).

    Accepted forms (leading ``#`` optional for hex):
        * ``#rrggbb`` and ``#rrggbbaa`` (alpha ignored)
        * ``#rgb`` and ``#rgba`` shorthand (each digit doubled, alpha ignored)
        * ``rgb(r, g, b)`` and ``rgba(r, g, b, a)`` with numeric channels;
          channels are rounded and clamped to 0–255, alpha ignored

    Raises:
        ValueError: If ``h`` is not a string or is not one of the forms above.
    """
    if not isinstance(h, str):
        raise ValueError(f"Colour must be a string, got {type(h).__name__}")

    text = h.strip()

    # Functional notation, e.g. "rgba(124.0, 58.0, 237.0, 1)".
    if text.lower().startswith(("rgb(", "rgba(")) and text.endswith(")"):
        parts = text[text.index("(") + 1:-1].split(",")
        if len(parts) not in (3, 4):
            raise ValueError(f"Invalid colour: {h!r}")
        try:
            return tuple(
                max(0, min(255, int(round(float(part))))) for part in parts[:3]
            )
        except (ValueError, OverflowError):
            # Non-numeric, NaN or infinite channel value.
            raise ValueError(f"Invalid colour: {h!r}") from None

    digits = text.lstrip("#")
    if len(digits) in (3, 4):
        # Expand shorthand: "f0a" -> "ff00aa".
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) not in (6, 8) or any(
        ch not in "0123456789abcdefABCDEF" for ch in digits
    ):
        raise ValueError(f"Invalid colour: {h!r}")
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))


def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Render one frame: solid background with a vertically centred waveform.

    Each sample of ``audio_chunk`` becomes a bar mirrored around the
    horizontal centre line; its half-height is ``|amp| * h * 0.4``.

    Args:
        w, h: Frame size in pixels.
        audio_chunk: Samples shown in this frame (may be empty).
        bg_color: Background fill colour.
        accent_color: Bar fill colour.
        title: Optional caption drawn near the top; skipped when falsy.
        frame_idx, fps: Accepted for a uniform renderer signature; unused.

    Returns:
        A ``PIL.Image.Image`` in RGB mode. The title is drawn even when
        ``audio_chunk`` is empty.
    """
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)

    n = len(audio_chunk)
    if n > 0:
        mid_y = h // 2
        # At least one pixel wide, even when there are more samples than columns.
        bar_w = max(1, w // n)
        for i, amp in enumerate(audio_chunk):
            x = int(i * w / n)
            bar_h = int(abs(amp) * h * 0.4)
            draw.rectangle([x, mid_y - bar_h, x + bar_w - 1, mid_y + bar_h], fill=accent_color)

    if title:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            # Best effort: fall back to Pillow's default font if loading fails.
            font = None
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img


def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Render one frame of 64 bottom-anchored bars driven by an FFT of the chunk.

    The chunk is transformed with a 512-point real FFT; the first 64 magnitude
    bins are scaled by the frame's own maximum, so the tallest bar reaches
    80% of the frame height. An empty chunk yields 64 zero-height bars.
    ``frame_idx`` and ``fps`` are unused.
    """
    from PIL import Image, ImageDraw
    canvas = Image.new("RGB", (w, h), bg_color)
    pen = ImageDraw.Draw(canvas)

    bar_count = 64
    if len(audio_chunk) > 0:
        magnitudes = np.abs(np.fft.rfft(audio_chunk, n=512))[:bar_count]
        # The small epsilon avoids division by zero on a silent chunk.
        levels = magnitudes / (magnitudes.max() + 1e-9)
    else:
        levels = np.zeros(bar_count)

    slot_w = w // bar_count
    r, g, b = accent_color
    bar_fill = (r, g, b)
    for idx, level in enumerate(levels):
        left = idx * slot_w
        top = h - int(level * h * 0.8)
        # Bars stop 2 px short of their slot width.
        pen.rectangle([left, top, left + slot_w - 2, h], fill=bar_fill)

    if not title:
        return canvas
    try:
        from PIL import ImageFont
        caption_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
    except Exception:
        # Fall back to Pillow's default font.
        caption_font = None
    pen.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=caption_font)
    return canvas


def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Render one frame: the chunk traced as a single polyline on a near-black canvas.

    The background is fixed at ``(10, 10, 10)``; ``bg_color`` is ignored. The
    trace uses ``accent_color`` and is clamped to the frame vertically.
    ``frame_idx`` and ``fps`` are unused.
    """
    from PIL import Image, ImageDraw
    canvas = Image.new("RGB", (w, h), (10, 10, 10))
    pen = ImageDraw.Draw(canvas)

    sample_count = len(audio_chunk)
    mid_y = h // 2
    trace = [
        (
            int(k * w / sample_count),
            # Positive samples go upwards; keep the point inside the frame.
            max(0, min(h - 1, int(mid_y - sample * h * 0.4))),
        )
        for k, sample in enumerate(audio_chunk)
    ]
    # A line needs at least two points.
    if len(trace) > 1:
        pen.line(trace, fill=accent_color, width=max(2, h // 200))

    if not title:
        return canvas
    try:
        from PIL import ImageFont
        caption_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
    except Exception:
        # Fall back to Pillow's default font.
        caption_font = None
    pen.text((w // 2, h // 12), title, fill=(200, 255, 200), anchor="mm", font=caption_font)
    return canvas


def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Render one frame: a radial gradient whose radius follows the chunk's RMS.

    For every pixel, ``t = clamp(1 - dist / pulse, 0, 1)`` where ``dist`` is
    the distance from the frame centre divided by the centre-to-corner
    distance and ``pulse = 0.3 + rms * 2.5``. The pixel colour is the linear
    blend ``bg + t * (accent - bg)``, truncated to integers.

    The blend is computed for the whole frame at once with NumPy instead of
    one Python-level iteration per pixel.

    Args:
        w, h: Frame size in pixels.
        audio_chunk: NumPy array of samples for this frame (may be empty).
        bg_color: ``(r, g, b)`` colour at the outer edge of the gradient.
        accent_color: ``(r, g, b)`` colour at the centre.
        title: Optional caption drawn near the top; skipped when falsy.
        frame_idx, fps: Accepted for a uniform renderer signature; unused.

    Returns:
        A ``PIL.Image.Image`` in RGB mode.
    """
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0.0
    if not math.isfinite(rms):
        # NaN/inf samples would poison the whole frame; treat as silence.
        rms = 0.0

    cx, cy = w // 2, h // 2
    # `or 1.0` guards a degenerate 1x1 frame where the radius would be zero.
    max_r = math.sqrt(cx**2 + cy**2) or 1.0
    pulse = 0.3 + rms * 2.5

    # Distance grid of shape (h, w), built by broadcasting a row against a column.
    dx = np.arange(w, dtype=np.float64) - cx
    dy = np.arange(h, dtype=np.float64) - cy
    dist = np.sqrt(dx[np.newaxis, :] ** 2 + dy[:, np.newaxis] ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)

    bg = np.asarray(bg_color, dtype=np.float64)
    accent = np.asarray(accent_color, dtype=np.float64)
    rgb = bg + t[..., np.newaxis] * (accent - bg)
    # Clip before the cast so out-of-range colours cannot wrap around in uint8.
    img = Image.fromarray(np.clip(rgb, 0, 255).astype(np.uint8))

    draw = ImageDraw.Draw(img)
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            # Best effort: fall back to Pillow's default font.
            font = None
        draw.text((cx, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img


def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps, total_frames=None):
    """Render one frame: dark canvas, thin waveform strip, bottom progress bar, title.

    The background is fixed at ``(18, 18, 22)``; ``bg_color`` is ignored.

    Args:
        w, h: Frame size in pixels.
        audio_chunk: Samples shown in this frame (may be empty).
        bg_color: Unused; kept for a uniform renderer signature.
        accent_color: Colour of the waveform strip and the progress bar.
        title: Optional caption; skipped when falsy.
        frame_idx: Zero-based index of this frame.
        fps: Frames per second of the video.
        total_frames: Optional total number of frames in the clip. When given
            (and positive), the progress bar spans the whole clip. When
            omitted, the bar fills over the first second of video and then
            stays full, because the clip length is unknown to this function.

    Returns:
        A ``PIL.Image.Image`` in RGB mode.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)

    # Thin horizontal waveform strip: one 1-px column per sample.
    strip_h = max(4, h // 15)
    cy = h // 2
    n = len(audio_chunk)
    for i, sample in enumerate(audio_chunk):
        x = int(i * w / n)
        half = abs(int(sample * strip_h))
        draw.rectangle([x, cy - half, x, cy + half], fill=accent_color)

    # Bottom progress indicator, clamped so it never extends past the frame.
    if total_frames and total_frames > 0:
        prog_w = int((frame_idx + 1) * w / total_frames)
    else:
        prog_w = int(frame_idx * w / max(fps, 1))
    prog_w = max(0, min(w, prog_w))
    draw.rectangle([0, h - 4, prog_w, h], fill=accent_color)

    if title:
        try:
            from PIL import ImageFont
            font_size = max(24, h // 18)
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except Exception:
            # Best effort: fall back to Pillow's default font.
            font = None
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245), anchor="mm", font=font)
    return img


# Style label -> frame renderer. audio_to_video calls the selected renderer as
# render_fn(w, h, chunk, bg_color, accent_color, title_text, frame_index, fps),
# so every renderer must accept those eight positional arguments.
RENDERERS = {
    "🎙 Solid + Waveform":     render_frame_solid_waveform,
    "🌊 Animated Spectrum Bars": render_frame_spectrum,
    "⚡ Oscilloscope Line":    render_frame_oscilloscope,
    "🌈 Gradient Pulse":       render_frame_gradient_pulse,
    "🔲 Minimal Dark + Title": render_frame_minimal_dark,
}


def audio_to_video(
    audio_path, style, resolution_label,
    bg_hex, accent_hex, title_text, fps_str,
    progress=gr.Progress()
):
    """Render an audio file into an MP4 with an audio-reactive visual.

    Frames are rendered one by one as PNGs into a temporary directory, then
    muxed with the original audio by ``ffmpeg`` (H.264 video, AAC audio).

    Args:
        audio_path: Path of the input audio file.
        style: Key of ``RENDERERS`` selecting the visual style.
        resolution_label: Key of ``RESOLUTIONS`` selecting the frame size.
        bg_hex: Background colour string, parsed by ``hex_to_rgb``.
        accent_hex: Accent colour string, parsed by ``hex_to_rgb``.
        title_text: Optional caption passed to the renderer.
        fps_str: Frame rate as a string (e.g. ``"24"``).
        progress: Gradio progress tracker.

    Returns:
        ``(video_path, markdown_info)`` on success, or ``(None, message)``
        when the input is missing/invalid, the audio cannot be read or is
        empty, Pillow is unavailable, or ffmpeg is missing or fails.
    """
    if not audio_path:
        return None, "❌ No audio file provided. Generate or upload audio first."

    # Turn bad settings into a readable message instead of an unhandled error.
    try:
        fps = int(fps_str)
        w, h = RESOLUTIONS[resolution_label]
        bg_color = hex_to_rgb(bg_hex)
        accent_color = hex_to_rgb(accent_hex)
        render_fn = RENDERERS[style]
    except (KeyError, ValueError, TypeError, AttributeError) as e:
        return None, f"❌ Invalid render settings: {e!r}"
    if fps <= 0:
        return None, "❌ FPS must be a positive number."

    # ---- Load audio ----
    progress(0.05, desc="Reading audio …")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"❌ Could not read audio: {e}"

    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # down-mix to mono

    total_samples = len(audio_data)
    if total_samples == 0 or sr <= 0:
        return None, "❌ Audio file contains no samples."

    duration = total_samples / sr
    # Always render at least one frame so clips shorter than 1/fps seconds
    # still give ffmpeg an image sequence to encode.
    n_frames = max(1, int(duration * fps))
    samples_per_frame = max(1, total_samples // n_frames)
    progress_step = max(1, n_frames // 20)  # report roughly 20 times

    # ---- Write frames to temp dir ----
    progress(0.10, desc="Rendering frames …")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()

        try:
            for fi in range(n_frames):
                if fi % progress_step == 0:
                    progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} …")

                start = fi * samples_per_frame
                end = min(start + samples_per_frame, total_samples)
                # Renderers get a short silent chunk rather than an empty one.
                chunk = audio_data[start:end] if end > start else np.zeros(64)

                img = render_fn(w, h, chunk, bg_color, accent_color, title_text, fi, fps)
                img.save(str(frame_dir / f"frame_{fi:06d}.png"))
        except ImportError as e:
            # The renderers import Pillow lazily, so a missing install shows up here.
            return None, f"❌ Pillow is required to render frames: {e}"

        # ---- Assemble with ffmpeg ----
        progress(0.80, desc="Encoding video …")
        out_path = "/tmp/tts_video.mp4"
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264",
            "-preset", "fast",
            "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        try:
            # errors="replace": ffmpeg may print bytes that are not valid text.
            result = subprocess.run(
                ffmpeg_cmd, capture_output=True, text=True, errors="replace"
            )
        except OSError as e:
            # Typically FileNotFoundError when the ffmpeg binary is not installed.
            return None, f"❌ Could not run ffmpeg (is it installed and on PATH?): {e}"
        if result.returncode != 0:
            return None, f"❌ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"

    progress(1.0, desc="Done!")
    info = (
        f"✅ **Video ready!**  \n"
        f"Style: `{style}` · Resolution: `{w}×{h}` · FPS: `{fps}` · Duration: `{duration:.1f}s`"
    )
    return out_path, info


# ────────────────────────────────────────────────
# UI
# ────────────────────────────────────────────────

# Custom CSS: lays out rows tagged with the "radio-row" class as a wrapping
# flex row, used below for the model / precision radio groups.
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

# Build the Gradio UI: three TTS tabs plus the audio-to-video tab.
# For every .click() below, the order of `inputs` must match the positional
# parameters of the handler, and `outputs` must match its return tuple.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming")

    # ── Tab 1: Custom Voice ────────────────────────────────────────────────
    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model     = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text    = gr.Textbox(label="Text to speak", lines=4, value="这是一个测试。希望声音听起来自然一些。")
            cv_lang    = gr.Dropdown(["Auto","Chinese","English","Japanese","Korean"], value="Auto", label="Language")
            cv_speaker = gr.Dropdown(
                ["Vivian","Serena","Uncle_Fu","Dylan","Eric","Ryan","Aiden","Ono_Anna","Sohee"],
                value="Vivian", label="Speaker"
            )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="用特别愤怒的语气说")
        cv_btn   = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info  = gr.Markdown()
        cv_btn.click(infer_custom_voice,
                     inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
                     outputs=[cv_audio, cv_info])

    # ── Tab 2: Voice Design ───────────────────────────────────────────────
    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model     = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text     = gr.Textbox(label="Text to speak", lines=4, value="哥哥，你回来啦，人家等了好久，要抱抱！")
        vd_lang     = gr.Dropdown(["Auto","Chinese","English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
                                  value="体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，黏人、做作又刻意卖萌的感觉")
        vd_btn   = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info  = gr.Markdown()
        vd_btn.click(infer_voice_design,
                     inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
                     outputs=[vd_audio, vd_info])

    # ── Tab 3: Voice Clone ────────────────────────────────────────────────
    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model     = gr.Radio(["1.7B-Base","0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32","float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto","English","Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload","microphone"])
            cl_ref_text  = gr.Textbox(label="Transcript of reference (optional)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
        cl_btn   = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info  = gr.Markdown()
        cl_btn.click(infer_voice_clone,
                     inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
                     outputs=[cl_audio, cl_info])

    # ── Tab 4: Audio → Video ──────────────────────────────────────────────
    # NOTE(review): the intro text mentions pasting a path from a generated
    # clip, but the audio input below only offers upload and microphone.
    with gr.Tab("🎬 Audio → Video"):
        gr.Markdown(
            "## Audio → Social Media Video\n"
            "Upload **any WAV/MP3** (or paste the path from a generated clip above) "
            "and render it into a shareable MP4 with a visual style.\n\n"
            "> ⚠️ **Gradient Pulse** renders per-pixel and is slow for long audio — prefer other styles for > 30 s clips."
        )

        with gr.Row():
            with gr.Column(scale=3):
                vid_audio = gr.Audio(
                    label="Input audio (upload or record)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                vid_title = gr.Textbox(
                    label="Title / caption text (shown on video)",
                    placeholder="My AI Voice · Qwen3-TTS",
                    value=""
                )
            with gr.Column(scale=2):
                vid_style = gr.Radio(
                    VISUAL_STYLES,
                    value="🌊 Animated Spectrum Bars",
                    label="Visual style",
                )
                vid_res = gr.Dropdown(
                    list(RESOLUTIONS.keys()),
                    value="1080×1920 (TikTok/Reels 9:16)",
                    label="Resolution / aspect ratio",
                )
                # FPS is passed to the handler as a string and converted there.
                vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")

        with gr.Row():
            vid_bg     = gr.ColorPicker(value="#0d0d1a", label="Background colour")
            vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")

        vid_btn   = gr.Button("🎬 Render Video", variant="primary", size="lg")
        vid_out   = gr.Video(label="Output video")
        vid_info  = gr.Markdown()

        vid_btn.click(
            audio_to_video,
            inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
            outputs=[vid_out, vid_info],
        )

        gr.Markdown("""
**Style guide:**
| Style | Best for | Notes |
|---|---|---|
| 🎙 Solid + Waveform | Podcasts, quotes | Fast, clean |
| 🌊 Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| ⚡ Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| 🌈 Gradient Pulse | Ambient / ASMR | Slow render — use short clips |
| 🔲 Minimal Dark + Title | Branded content | Great with a title caption |
        """)

    # ── Footer ────────────────────────────────────────────────────────────
    gr.Markdown("""
**Notes**  
• First generation per model loads weights (may take 1–5 min).  
• Use **float32** if **float16** causes crashes (common on CPU).  
• **0.6B** models are faster / lighter on CPU.  
• Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).  
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
    """)

# Start the server only when run as a script; binds to all interfaces on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)