# Source: Hugging Face Space "ZeroPointMonkey/voice-clone-bench" (status: Paused; file size: 20,235 bytes).

"""
Voice Clone Bench — Chatterbox Multilingual zero-shot voice cloning.

Standalone prototype to A/B open-weight voice cloning against ElevenLabs:
  upload a reference voice -> type arbitrary text -> get speech in the cloned voice.

Mirrors the official ResembleAI/Chatterbox-Multilingual-TTS inference path, with:
  - a clone-first UI (reference upload is the primary input),
  - long-text sentence chunking (so JOI-length scripts work, not just 300 chars),
  - a clean programmatic endpoint (api_name="/clone") for later bot integration.
"""
import os
import random
import re
import tempfile
import threading
import uuid

import numpy as np
import soundfile as sf
import torch
import gradio as gr
import spaces

from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES

# Torch device, chosen once at import time: "cuda" when torch reports a usable
# GPU, otherwise "cpu". Model loading and RNG seeding below key off this value.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {DEVICE}")

# Process-wide model singleton; stays None until get_or_load_model() succeeds.
MODEL = None

# ── Cross-user voice-leak guard ──────────────────────────────────────────────
# The TTS model API stores speaker conditioning on the SHARED singleton model
# (`model.conds`, set by `prepare_conditionals`, read by every `generate`).
# A long script is chunked and synthesized in a loop that reuses those
# conditionals. If two callers' requests interleave on this process (Gradio
# runs sync events in a worker thread pool), caller B's `prepare_conditionals`
# overwrites `model.conds` mid-loop, so caller A's later chunks synthesize in
# B's voice — a cross-user voice PRIVACY LEAK.
#
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
# critical section so a request owns the model exclusively for its full
# synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
# lock to keep the exclusive window as short as possible.
#
# NOTE(review): a threading.Lock only serializes threads inside ONE process.
# If requests are ever served from several worker processes, each process has
# its own lock and its own model copy — confirm against the deployment model.
_MODEL_LOCK = threading.Lock()

# ── Faithful-cloning defaults ────────────────────────────────────────────────
# Tuned for SPEAKER SIMILARITY (clean identity match), not expressiveness.
# Rationale (Resemble AI Chatterbox guidance + community cloning presets):
#   - exaggeration LOW (~0.4): keeps delivery neutral/professional so the model
#     reproduces the reference identity instead of "acting" it.
#   - cfg_weight 0.5: balanced default; lower (~0.3) speeds pacing, 0.0 helps
#     cross-lingual transfer avoid inheriting the reference-language accent.
#   - temperature 0.7: slightly below the 0.8 default for steadier, more
#     consistent output across chunked long scripts (less random drift).
# These constants are shared by the clone_and_speak() signature defaults and by
# the initial slider values in the UI, so both surfaces start from one preset.
DEFAULT_EXAGGERATION = 0.4
DEFAULT_CFG_WEIGHT = 0.5
DEFAULT_TEMPERATURE = 0.7
# Sampling knobs below are deliberately left at the upstream model defaults
# (repetition penalty 2.0, min_p 0.05, top_p 1.0); they are exposed only in the
# "advanced" accordion.
DEFAULT_REPETITION_PENALTY = 2.0
DEFAULT_MIN_P = 0.05
DEFAULT_TOP_P = 1.0

# Built-in sample reference voices per language (used when no reference is uploaded).
# Each entry maps a language code to:
#   "audio": URL of a sample reference clip,
#   "text":  a sample sentence in that language used to pre-fill the text box.
# Only the languages listed here ship a sample. For any other code the
# default_*_for_ui helpers return None / "", so cloning in that language
# requires the caller to supply a reference clip.
LANGUAGE_CONFIG = {
    "en": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
        "text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
    },
    "fr": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
        "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues.",
    },
    "es": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
        "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones.",
    },
    "de": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
        "text": "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe.",
    },
}

# Per-chunk character budget. Chatterbox is most stable on short-ish segments,
# so long scripts are split at sentence boundaries and concatenated.
# Used as the default `max_chars` of split_into_chunks(); measured in characters
# of the whitespace-normalised text, not in tokens or seconds.
CHUNK_CHARS = 280


def get_or_load_model():
    """Return the shared TTS model, loading it on the first call.

    The instance is cached in the module-level ``MODEL``; later calls return
    that same object without reloading. After loading, the model is moved to
    ``DEVICE`` when it exposes ``.to`` and its reported ``device`` (as a
    string) differs from ``DEVICE``.

    Returns:
        The cached ``ChatterboxMultilingualTTS`` instance.
    """
    global MODEL
    if MODEL is not None:
        return MODEL

    print("Loading ChatterboxMultilingualTTS ...")
    MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)

    # Only touch `.device` when the object is movable at all.
    needs_move = hasattr(MODEL, "to") and str(getattr(MODEL, "device", "")) != DEVICE
    if needs_move:
        MODEL.to(DEVICE)

    print(f"Model loaded. Internal device: {getattr(MODEL, 'device', 'N/A')}")
    return MODEL


# Warm the weights at startup (download + CPU/meta init); GPU attaches inside @spaces.GPU.
# A failure here is deliberately non-fatal: it is only logged, so the app still
# starts. If from_pretrained() raised, MODEL is still None and the next
# get_or_load_model() call (made per request) retries the load.
try:
    get_or_load_model()
except Exception as e:  # noqa: BLE001
    print(f"WARNING: model failed to load at startup: {e}")


def set_seed(seed: int):
    """Seed every random number generator this module relies on.

    Seeds torch (plus the CUDA generators when ``DEVICE`` is "cuda"), then
    Python's ``random`` and NumPy's global generator, all with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.manual_seed(seed)

    if DEVICE == "cuda":
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)

    for host_seeder in (random.seed, np.random.seed):
        host_seeder(seed)


# ── Audio cleanup (background-audio removal) ─────────────────────────────────
# Optional preprocessing: isolate the spoken voice from a noisy/musical
# reference clip BEFORE cloning, so the speaker conditionals are built from
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
#
# STOPGAP — bound CPU separation time. demucs-onnx runtime scales ~linearly with
# clip length; on long references it ran ~180s and blew the bot's voice timeout.
# Speaker conditioning only needs a few seconds of clean speech, so references
# longer than _CLEAN_TRIM_SECONDS are cut down to a single window of that
# length BEFORE separation. isolate_voice() picks the highest-energy window
# rather than blindly taking the first seconds, so a quiet lead-in does not
# starve the clone. This caps CPU work to ~30-40s regardless of input length.
#
# _SEPARATOR_READY is a tri-state cache managed by _ensure_separator():
#   None     -> import not attempted yet
#   False    -> import was attempted and failed (do not retry)
#   callable -> demucs_onnx.separate_stem, ready to use
_SEPARATOR_READY = None
# Maximum reference length, in seconds, that is handed to the separator.
_CLEAN_TRIM_SECONDS = 10.0


def _ensure_separator():
    """Return demucs-onnx's ``separate_stem``, or None when it is unavailable.

    The import is attempted at most once per process. The outcome is cached in
    the module-level ``_SEPARATOR_READY`` (the callable on success, ``False``
    on failure), so a failed import is not retried on every request.
    """
    global _SEPARATOR_READY

    # Already resolved (either way): answer from the cache.
    if _SEPARATOR_READY is not None:
        return _SEPARATOR_READY or None

    try:
        from demucs_onnx import separate_stem as loaded  # noqa: PLC0415
    except Exception as e:  # noqa: BLE001
        print(f"WARNING: demucs-onnx unavailable, voice isolation disabled: {e}")
        loaded = False

    _SEPARATOR_READY = loaded
    return _SEPARATOR_READY or None


def isolate_voice(audio_path: str) -> str:
    """Return a path to a cleaned WAV with background music/noise removed.

    The "vocals" stem of the reference is extracted with demucs-onnx on CPU,
    downmixed to mono, rescaled only if it would clip (peak > 1.0), and written
    to a fresh uniquely named file in the system temp directory. References
    longer than ``_CLEAN_TRIM_SECONDS`` are first cut down to their
    highest-energy window of that length so separation time stays bounded.

    Args:
        audio_path: Path of the reference clip. A falsy value is returned
            unchanged without doing any work.

    Returns:
        Path of the newly written mono WAV. The file is NOT deleted here; the
        caller owns it.

    Raises:
        gr.Error: If demucs-onnx could not be imported.
        Exception: Errors from ``separate_stem`` or from writing the output
            propagate. This function does not itself fall back to the raw
            clip on a separation failure; only a failure in the optional
            trimming step is tolerated (the full clip is separated instead).
    """
    if not audio_path:
        return audio_path
    separate_stem = _ensure_separator()
    if separate_stem is None:
        raise gr.Error("Voice isolation is unavailable (demucs-onnx not installed).")

    # Sample rate used to stamp the OUTPUT file. Falls back to 44.1 kHz when
    # the input header cannot be read.
    # NOTE(review): this assumes separate_stem returns audio at the input
    # file's sample rate. If demucs-onnx resamples to a fixed model rate, the
    # written file would play back at the wrong speed/pitch — confirm.
    try:
        sr = sf.info(audio_path).samplerate
    except Exception:  # noqa: BLE001
        sr = 44100

    # STOPGAP: trim long references to a short window so CPU separation time
    # is bounded (Demucs runtime ~linear in clip length). The speaker
    # conditioner only needs a few seconds of clean speech. We separate the
    # trimmed window; if anything in the trim path fails we fall back to the
    # full clip so a trimming problem never blocks cleaning.
    sep_input = audio_path
    trim_path = None
    try:
        info = sf.info(audio_path)
        max_frames = int(_CLEAN_TRIM_SECONDS * info.samplerate)
        if info.frames > max_frames:
            # ENERGY-AWARE WINDOW: don't blindly take the FIRST _CLEAN_TRIM_SECONDS.
            # Real uploads often open with a quiet lead-in (silence, breath, a
            # greeting), so a fixed leading slice can hand the speaker
            # conditioner a near-silent window and starve the clone. Read the
            # whole clip, find the highest-RMS contiguous window of the trim
            # length, and separate THAT. If the decoded clip turns out not to
            # be longer than the window, best_start stays 0 (leading slice).
            full, file_sr = sf.read(audio_path, dtype="float32")
            # Energy is measured on a mono mix; the slice written below keeps
            # the original channel layout.
            mono = full.mean(axis=1) if full.ndim == 2 else full
            win = int(_CLEAN_TRIM_SECONDS * file_sr)
            best_start = 0
            if mono.size > win:
                step = max(1, int(0.25 * file_sr))  # 0.25s hop is plenty
                power = mono.astype(np.float64) ** 2
                # Prefix sums make each window's energy an O(1) difference.
                csum = np.concatenate([[0.0], np.cumsum(power)])
                best_energy = -1.0
                for start in range(0, mono.size - win + 1, step):
                    energy = csum[start + win] - csum[start]
                    # Strict '>' keeps the EARLIEST window among ties.
                    if energy > best_energy:
                        best_energy = energy
                        best_start = start
            data = full[best_start:best_start + win]
            trim_path = os.path.join(tempfile.gettempdir(), f"cleantrim_{uuid.uuid4().hex}.wav")
            sf.write(trim_path, data, file_sr)
            sep_input = trim_path
            print(
                f"Trimmed reference for cleaning: {info.frames/info.samplerate:.1f}s "
                f"-> {_CLEAN_TRIM_SECONDS:.1f}s (energy window @ {best_start/file_sr:.1f}s)"
            )
    except Exception as e:  # noqa: BLE001
        print(f"WARNING: reference trim failed, separating full clip: {e}")
        sep_input = audio_path

    # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
    # The finally clause removes the intermediate trimmed file whether or not
    # separation succeeds; a failed removal is ignored (best-effort cleanup).
    try:
        vocals = separate_stem(sep_input, "vocals", providers="cpu")  # (channels, samples)
    finally:
        if trim_path and os.path.exists(trim_path):
            try:
                os.remove(trim_path)
            except OSError:
                pass
    vocals = np.asarray(vocals, dtype=np.float32)
    if vocals.ndim == 2:
        vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
    # Only scale DOWN when the stem would clip; quiet stems are left as-is.
    peak = float(np.max(np.abs(vocals))) if vocals.size else 0.0
    if peak > 1.0:
        vocals = vocals / peak

    # Unique per call: `random` may be seeded deterministically elsewhere, so two
    # callers could otherwise derive the same filename and clobber each other's
    # cleaned reference. uuid4 is independent of the seeded RNG.
    out_path = os.path.join(tempfile.gettempdir(), f"isolated_{uuid.uuid4().hex}.wav")
    sf.write(out_path, vocals, sr)
    print(f"Isolated voice -> {out_path} ({len(vocals)/sr:.1f}s @ {sr}Hz)")
    return out_path


def isolate_voice_ui(audio_path: str):
    """Preview handler for the cleaned reference (exposed as api_name=/isolate_voice).

    Args:
        audio_path: Path of the uploaded/recorded reference clip.

    Returns:
        Whatever ``isolate_voice`` returns for the clip (path of the cleaned WAV).

    Raises:
        gr.Error: If no clip was supplied, or if ``isolate_voice`` raises one.
    """
    if audio_path:
        return isolate_voice(audio_path)
    raise gr.Error("Upload a reference clip first.")


def default_audio_for_ui(lang: str):
    """Return the built-in sample clip URL for `lang`, or None if there is none."""
    if lang not in LANGUAGE_CONFIG:
        return None
    return LANGUAGE_CONFIG[lang].get("audio")


def default_text_for_ui(lang: str) -> str:
    """Return the built-in sample sentence for `lang`, or "" if there is none."""
    if lang not in LANGUAGE_CONFIG:
        return ""
    return LANGUAGE_CONFIG[lang].get("text", "")


def split_into_chunks(text: str, max_chars: int = None):
    """Split text into <= max_chars chunks, preferring sentence boundaries.

    Whitespace is collapsed to single spaces first. Sentences (split after
    ``. ! ? 。 ！ ？`` followed by whitespace) are packed greedily into chunks.
    A single sentence longer than the budget is hard-split on spaces, or
    mid-word when it contains no space inside the budget.

    Chunks are always emitted in reading order: any sentence still pending is
    flushed BEFORE an over-long sentence is hard-split, so joining the result
    with single spaces reproduces the normalised input whenever every cut
    landed on a space.

    Args:
        text: Script to split. ``None`` or whitespace-only text yields ``[]``.
        max_chars: Per-chunk character budget. ``None`` (the default) means
            the module-level ``CHUNK_CHARS``, looked up at call time.

    Returns:
        List of non-empty chunk strings, each at most ``max_chars`` long.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1 (a zero budget would
            otherwise never make progress).
    """
    if max_chars is None:
        max_chars = CHUNK_CHARS
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    text = " ".join((text or "").split())
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    # NOTE(review): the boundary requires whitespace after the punctuation, so
    # unspaced CJK text is not split at "。" and falls through to the hard split.
    sentences = re.split(r"(?<=[\.\!\?。！？])\s+", text)
    chunks, cur = [], ""
    for sent in sentences:
        if len(sent) > max_chars:
            # Flush what is pending first; otherwise the pieces cut from this
            # sentence would land in the output AHEAD of the earlier sentence.
            if cur:
                chunks.append(cur.strip())
                cur = ""
            # A single sentence longer than the budget: hard-split on spaces.
            while len(sent) > max_chars:
                window = sent[:max_chars]
                head = window.rsplit(" ", 1)[0] or window
                chunks.append(head.strip())
                sent = sent[len(head):].strip()
        if not cur:
            cur = sent
        elif len(cur) + 1 + len(sent) <= max_chars:
            cur = f"{cur} {sent}"
        else:
            chunks.append(cur.strip())
            cur = sent
    if cur.strip():
        chunks.append(cur.strip())
    return [c for c in chunks if c]


def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
    """Return `ref` with background audio stripped when cleaning is requested.

    Cleaning is skipped when `clean_reference` is falsy or `ref` is empty.
    Any failure inside ``isolate_voice`` is downgraded to a UI warning and
    the untouched reference is returned, so cloning can still proceed.
    """
    if not clean_reference or not ref:
        return ref

    try:
        cleaned = isolate_voice(ref)
    except Exception as e:  # noqa: BLE001
        gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
        cleaned = ref
    return cleaned


@spaces.GPU(duration=120)
def clone_and_speak(
    text: str,
    language_id: str = "en",
    audio_prompt_path: str = None,
    exaggeration: float = DEFAULT_EXAGGERATION,
    cfg_weight: float = DEFAULT_CFG_WEIGHT,
    temperature: float = DEFAULT_TEMPERATURE,
    seed: int = 0,
    clean_reference: bool = False,
    repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
    min_p: float = DEFAULT_MIN_P,
    top_p: float = DEFAULT_TOP_P,
):
    """
    Clone the voice in `audio_prompt_path` and speak `text` in language `language_id`.

    Args:
        text: text to synthesize (long scripts are auto-chunked).
        language_id: language code (en, fr, de, es, it, pt, hi, ja, zh, ...).
            Matched case-insensitively; a missing value is treated as "en".
        audio_prompt_path: path/URL to a reference voice clip. If omitted, a
            built-in sample voice for the language is used.
        exaggeration: expressiveness (0.25-2.0; ~0.4 = neutral/faithful clone).
        cfg_weight: CFG / pacing (0.0-1.0; lower ~0.3 = faster pace, 0.0 for
            cross-lingual transfer; 0.5 = balanced default).
        temperature: sampling randomness (0.05-2.0; lower = more consistent).
        seed: 0 for random, otherwise reproducible.
        clean_reference: if True, isolate the voice (remove background music/
            noise) from the uploaded reference before cloning.
        repetition_penalty: discourages repeated tokens (model default 2.0).
        min_p: min-p nucleus floor (model default 0.05).
        top_p: top-p nucleus threshold (model default 1.0).

    Returns:
        (sample_rate, waveform) tuple consumable by gr.Audio.

    Raises:
        RuntimeError: if the TTS model could not be loaded.
        gr.Error: if `text` is empty, or no reference clip is available for
            the requested language.
    """
    model = get_or_load_model()
    if model is None:
        raise RuntimeError("TTS model is not loaded.")

    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # Normalise the language code ONCE, before any lookup, so the built-in
    # sample lookup and the model call agree ("EN"/None resolve to "en" for both).
    lang = (language_id or "en").lower()

    ref = audio_prompt_path or default_audio_for_ui(lang)
    if not ref:
        raise gr.Error("Upload a reference audio clip to clone (or pick a language with a built-in sample).")

    # Optional preprocessing: clean the reference so conditionals are built from
    # isolated speech (only applies to a user-uploaded clip, not built-in samples).
    # NOTE(review): this CPU step runs inside the @spaces.GPU-decorated function;
    # confirm whether that time counts against the ZeroGPU allocation.
    cleaned_tmp = None
    if audio_prompt_path:
        ref = _maybe_clean_reference(ref, clean_reference)
        if ref != audio_prompt_path:
            # A different path means a temp file derived from the caller's
            # voice was created for this request; it is removed below.
            cleaned_tmp = ref

    try:
        chunks = split_into_chunks(text)
        print(f"Cloning voice | lang={lang} | chunks={len(chunks)} | clean_ref={clean_reference} | ref={ref}")

        sr = model.sr
        silence = np.zeros(int(0.15 * sr), dtype=np.float32)  # 150 ms gap between chunks
        pieces = []

        # CRITICAL SECTION — hold the model exclusively for this whole request.
        # `prepare_conditionals` mutates the shared `model.conds`; reusing it across
        # the chunk loop is only safe if no other caller can re-prepare the model in
        # between. The lock makes that guarantee even under concurrent Gradio
        # requests, so a caller's voice can never bleed into another's clip. RNG
        # seeding lives inside the lock too, since it perturbs shared generator state
        # that `generate` consumes.
        with _MODEL_LOCK:
            if seed and int(seed) != 0:
                set_seed(int(seed))

            # Prepare speaker conditionals ONCE from the reference, then reuse across
            # chunks so the cloned identity stays consistent for the whole script.
            model.prepare_conditionals(ref, exaggeration=exaggeration)

            for i, chunk in enumerate(chunks):
                wav = model.generate(
                    chunk,
                    language_id=lang,
                    audio_prompt_path=None,  # reuse prepared conditionals
                    exaggeration=exaggeration,
                    cfg_weight=cfg_weight,
                    temperature=temperature,
                    repetition_penalty=repetition_penalty,
                    min_p=min_p,
                    top_p=top_p,
                )
                arr = wav.squeeze(0).detach().cpu().numpy().astype(np.float32)
                pieces.append(arr)
                if i != len(chunks) - 1:
                    pieces.append(silence)
    finally:
        # Best-effort removal of the per-request cleaned reference so voice
        # clips do not pile up in the temp directory. A failed delete must
        # never mask the synthesis result or the original error.
        if cleaned_tmp:
            try:
                os.remove(cleaned_tmp)
            except OSError:
                pass

    full = np.concatenate(pieces) if pieces else np.zeros(1, dtype=np.float32)
    print("Generation complete.")
    return (sr, full)


def on_language_change(lang):
    """Return the (sample audio, sample text) pair shown for a newly picked language."""
    sample_audio = default_audio_for_ui(lang)
    sample_text = default_text_for_ui(lang)
    return sample_audio, sample_text


# ── UI definition ────────────────────────────────────────────────────────────
# Two-column layout: inputs and controls on the left, synthesized audio on the
# right. Components are created in display order, and the `inputs=[...]` list
# wired to run_btn must stay in the same order as clone_and_speak's parameters.
with gr.Blocks(title="Voice Clone Bench") as demo:
    gr.Markdown(
        """
        # 🎙️ Voice Clone Bench — Chatterbox (zero-shot)
        Upload a **reference voice**, type **any text**, get speech **in that cloned voice**.
        Built to A/B against ElevenLabs. Model: Chatterbox Multilingual (Resemble AI, MIT).
        """
    )
    with gr.Row():
        with gr.Column():
            # Reference clip, passed to the handlers as a file path. It is
            # pre-filled with the built-in English sample.
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="① Reference voice to clone (5–20s clean speech). Empty = built-in sample.",
                value=default_audio_for_ui("en"),
            )
            clean_reference = gr.Checkbox(
                value=False,
                label="🧹 Remove background audio from reference (isolate voice before cloning)",
                info="Strips music/noise with HT-Demucs so the clone is built from clean speech.",
            )
            # Standalone preview of the cleaning step; independent of the checkbox above.
            preview_btn = gr.Button("🧹 Preview cleaned reference", size="sm")
            cleaned_preview = gr.Audio(label="Isolated voice (preview)", visible=True)
            # Language choices come from the model class, so the dropdown can
            # offer languages that have no entry in LANGUAGE_CONFIG.
            language_id = gr.Dropdown(
                choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                value="en",
                label="② Language",
            )
            text = gr.Textbox(
                value=default_text_for_ui("en"),
                label="③ Text to speak (long scripts are auto-chunked)",
                lines=5,
                max_lines=20,
            )
            with gr.Accordion("Cloning controls (tuned for faithful voice cloning)", open=True):
                exaggeration = gr.Slider(
                    0.0, 2.0, step=0.05, value=DEFAULT_EXAGGERATION,
                    label="Exaggeration — lower = more neutral/faithful (≈0.4); 0.7+ = expressive",
                )
                cfg_weight = gr.Slider(
                    0.0, 1.0, step=0.05, value=DEFAULT_CFG_WEIGHT,
                    label="CFG / Pace — 0.5 balanced; ~0.3 faster; 0.0 for cross-lingual",
                )
                temperature = gr.Slider(
                    0.05, 2.0, step=0.05, value=DEFAULT_TEMPERATURE,
                    label="Temperature — lower = more consistent/faithful (≈0.7)",
                )
                seed = gr.Number(value=0, label="Seed (0 = random)")
                with gr.Accordion("Sampling (advanced)", open=False):
                    repetition_penalty = gr.Slider(
                        1.0, 2.5, step=0.05, value=DEFAULT_REPETITION_PENALTY,
                        label="Repetition penalty (default 2.0)",
                    )
                    min_p = gr.Slider(0.0, 0.5, step=0.01, value=DEFAULT_MIN_P, label="min_p (default 0.05)")
                    top_p = gr.Slider(0.1, 1.0, step=0.05, value=DEFAULT_TOP_P, label="top_p (default 1.0)")
            run_btn = gr.Button("Clone & Speak", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Cloned speech output")

    # Switching language replaces BOTH the reference clip and the text box with
    # that language's built-in sample (None / "" when the language has none),
    # overwriting whatever the user had there.
    language_id.change(fn=on_language_change, inputs=[language_id], outputs=[ref_wav, text], show_progress=False)

    # Programmatic endpoint: /isolate_voice
    preview_btn.click(
        fn=isolate_voice_ui,
        inputs=[ref_wav],
        outputs=[cleaned_preview],
        api_name="isolate_voice",
    )

    # Programmatic endpoint: /clone. The input order mirrors the positional
    # parameter order of clone_and_speak.
    run_btn.click(
        fn=clone_and_speak,
        inputs=[
            text, language_id, ref_wav, exaggeration, cfg_weight, temperature, seed,
            clean_reference, repetition_penalty, min_p, top_p,
        ],
        outputs=[audio_output],
        api_name="clone",
    )

if __name__ == "__main__":
    # Enable Gradio's request queue (capped via max_size=20) and launch the app
    # with the MCP server option turned on.
    demo.queue(max_size=20).launch(mcp_server=True)