File size: 20,235 Bytes
5a4878b
 
 
 
 
 
 
 
 
 
 
c0b00e8
5a4878b
 
c0b00e8
4a26525
 
5a4878b
 
c0b00e8
5a4878b
 
 
 
 
 
 
 
 
 
 
4a26525
 
 
 
 
 
 
 
 
 
 
751f97c
 
4a26525
 
c0b00e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a4878b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
 
751f97c
 
7dbfffd
 
 
 
 
 
 
c0b00e8
7dbfffd
c0b00e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751f97c
 
852fbdc
751f97c
 
c0b00e8
 
 
 
 
 
 
 
 
 
 
 
7dbfffd
 
 
 
 
 
 
 
 
 
 
f982159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dbfffd
 
 
f982159
 
 
 
7dbfffd
 
 
 
751f97c
7dbfffd
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
 
 
4a26525
 
 
 
c0b00e8
 
 
 
 
 
 
 
 
 
 
 
5a4878b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
751f97c
c0b00e8
 
 
 
 
5a4878b
 
 
 
 
c0b00e8
 
 
5a4878b
c0b00e8
 
 
 
5a4878b
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
5a4878b
c0b00e8
 
 
 
 
5a4878b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
5a4878b
 
c0b00e8
5a4878b
 
 
 
4a26525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a4878b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
 
 
5a4878b
 
 
 
 
 
 
 
 
 
 
c0b00e8
 
 
 
 
 
 
 
 
 
 
 
 
5a4878b
c0b00e8
 
 
 
 
 
 
5a4878b
 
 
 
 
 
c0b00e8
 
 
 
 
 
 
5a4878b
 
c0b00e8
 
 
 
5a4878b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
"""
Voice Clone Bench — Chatterbox Multilingual zero-shot voice cloning.

Standalone prototype to A/B open-weight voice cloning against ElevenLabs:
  upload a reference voice -> type arbitrary text -> get speech in the cloned voice.

Mirrors the official ResembleAI/Chatterbox-Multilingual-TTS inference path, with:
  - a clone-first UI (reference upload is the primary input),
  - long-text sentence chunking (so JOI-length scripts work, not just 300 chars),
  - a clean programmatic endpoint (api_name="/clone") for later bot integration.
"""
import os
import random
import re
import tempfile
import threading
import uuid

import numpy as np
import soundfile as sf
import torch
import gradio as gr
import spaces

from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES

# Pick the compute device once, at import time; the rest of the module
# targets whatever was chosen here.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on device: {DEVICE}")

# Process-wide model singleton, filled in lazily by get_or_load_model().
MODEL = None

# ── Cross-user voice-leak guard ──────────────────────────────────────────────
# The TTS model API stores speaker conditioning on the SHARED singleton model
# (`model.conds`, set by `prepare_conditionals`, read by every `generate`).
# A long script is chunked and synthesized in a loop that reuses those
# conditionals. If two callers' requests interleave on this process (Gradio
# runs sync events in a worker thread pool), caller B's `prepare_conditionals`
# overwrites `model.conds` mid-loop, so caller A's later chunks synthesize in
# B's voice — a cross-user voice PRIVACY LEAK.
#
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
# critical section so a request owns the model exclusively for its full
# synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
# lock to keep the exclusive window as short as possible.
_MODEL_LOCK = threading.Lock()

# ── Faithful-cloning defaults ────────────────────────────────────────────────
# Tuned for SPEAKER SIMILARITY (clean identity match), not expressiveness.
# Rationale (Resemble AI Chatterbox guidance + community cloning presets):
#   - exaggeration LOW (~0.4): keeps delivery neutral/professional so the model
#     reproduces the reference identity instead of "acting" it.
#   - cfg_weight 0.5: balanced default; lower (~0.3) speeds pacing, 0.0 helps
#     cross-lingual transfer avoid inheriting the reference-language accent.
#   - temperature 0.7: slightly below the 0.8 default for steadier, more
#     consistent output across chunked long scripts (less random drift).
# The remaining three are passed straight through to `model.generate` and are
# labelled in the UI as the model defaults.
DEFAULT_EXAGGERATION = 0.4
DEFAULT_CFG_WEIGHT = 0.5
DEFAULT_TEMPERATURE = 0.7
DEFAULT_REPETITION_PENALTY = 2.0
DEFAULT_MIN_P = 0.05
DEFAULT_TOP_P = 1.0

# Built-in sample reference voices per language (used when no reference is uploaded).
# Each row is (language code, sample clip URL, default text shown in the UI).
_BUILTIN_SAMPLES = (
    (
        "en",
        "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
        "Last month, we reached a new milestone with two billion views on our YouTube channel.",
    ),
    (
        "fr",
        "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
        "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues.",
    ),
    (
        "es",
        "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
        "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones.",
    ),
    (
        "de",
        "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
        "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe.",
    ),
)

# Keyed by language code; every entry exposes exactly "audio" and "text".
LANGUAGE_CONFIG = {
    code: {"audio": clip_url, "text": sample_text}
    for code, clip_url, sample_text in _BUILTIN_SAMPLES
}

# Per-chunk character budget. Chatterbox is most stable on short-ish segments,
# so long scripts are split at sentence boundaries and concatenated.
# Used as the default `max_chars` of split_into_chunks().
CHUNK_CHARS = 280


def get_or_load_model():
    """Return the shared TTS model, loading it on first use.

    The instance is cached in the module-level ``MODEL`` so later calls are
    free. After loading, the model is moved to ``DEVICE`` when it exposes a
    ``to`` method and reports a different ``device``.
    """
    global MODEL
    if MODEL is not None:
        return MODEL

    print("Loading ChatterboxMultilingualTTS ...")
    MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)

    # Only relocate when the model can be moved and is not already in place.
    can_move = hasattr(MODEL, "to")
    if can_move and str(getattr(MODEL, "device", "")) != DEVICE:
        MODEL.to(DEVICE)

    print(f"Model loaded. Internal device: {getattr(MODEL, 'device', 'N/A')}")
    return MODEL


# Warm the weights at startup (download + CPU/meta init); GPU attaches inside
# @spaces.GPU. A failure here is only logged: get_or_load_model() tries again
# on the first request because MODEL is still None.
try:
    get_or_load_model()
except Exception as load_error:  # noqa: BLE001
    print(f"WARNING: model failed to load at startup: {load_error}")


def set_seed(seed: int):
    """Seed torch, Python's ``random`` and NumPy so a run is reproducible.

    Args:
        seed: Any integer. Negative values and values above ``2**32 - 1`` are
            accepted; NumPy's legacy seeder only takes ``0 <= seed < 2**32``,
            so the value is folded into that range for NumPy alone.
    """
    seed = int(seed)
    torch.manual_seed(seed)
    # Same condition the module uses to choose DEVICE, evaluated at call time.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    # np.random.seed raises ValueError outside [0, 2**32 - 1]; a user-supplied
    # seed such as -1 must not abort the whole request.
    np.random.seed(seed % (2**32))


# ── Audio cleanup (background-audio removal) ─────────────────────────────────
# Optional preprocessing: isolate the spoken voice from a noisy/musical
# reference clip BEFORE cloning, so the speaker conditionals are built from
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
#
# STOPGAP — bound CPU separation time. demucs-onnx runtime scales ~linearly with
# clip length; on long references it ran ~180s and blew the bot's voice timeout.
# Speaker conditioning only needs a few seconds of clean speech, so we trim the
# reference to a short window BEFORE separation (isolate_voice picks the
# highest-energy span of _CLEAN_TRIM_SECONDS rather than the leading seconds).
# This caps CPU work to ~30-40s regardless of input length while keeping clone
# quality (the conditioner never used more than a few seconds anyway).
#
# _SEPARATOR_READY is tri-state: None = import not attempted yet, False =
# import failed (do not retry), otherwise the `separate_stem` callable.
_SEPARATOR_READY = None
# Length, in seconds, of the reference window handed to the separator.
_CLEAN_TRIM_SECONDS = 10.0


def _ensure_separator():
    """Resolve the demucs-onnx ``separate_stem`` callable, importing it lazily.

    The outcome is memoised in ``_SEPARATOR_READY`` (the callable on success,
    ``False`` on failure), so the import is attempted at most once.

    Returns:
        The ``separate_stem`` callable, or ``None`` when demucs-onnx cannot be
        imported.
    """
    global _SEPARATOR_READY
    if _SEPARATOR_READY is not None:
        # Already resolved: a cached False is reported to callers as None.
        return _SEPARATOR_READY or None

    try:
        from demucs_onnx import separate_stem  # noqa: PLC0415
    except Exception as import_error:  # noqa: BLE001
        print(f"WARNING: demucs-onnx unavailable, voice isolation disabled: {import_error}")
        _SEPARATOR_READY = False
        return None

    _SEPARATOR_READY = separate_stem
    return _SEPARATOR_READY or None


def isolate_voice(audio_path: str) -> str:
    """Return a path to a cleaned WAV with background music/noise removed.

    The clip is first trimmed to its highest-energy window of
    ``_CLEAN_TRIM_SECONDS`` (when it is longer than that), the "vocals" stem is
    separated on CPU, downmixed to mono, peak-limited to 1.0 and written to a
    uniquely named file in the system temp directory.

    Args:
        audio_path: Path to the reference clip. A falsy value is returned
            unchanged.

    Returns:
        Path of the newly written WAV. The caller owns this file; nothing in
        this function deletes it.

    Raises:
        gr.Error: If demucs-onnx could not be imported.
        Exception: Anything raised by the separator or by writing the output
            propagates. This function does NOT fall back to the original clip
            on a separation failure; only a failure while trimming is
            tolerated (the full clip is separated instead).
    """
    if not audio_path:
        return audio_path
    separate_stem = _ensure_separator()
    if separate_stem is None:
        raise gr.Error("Voice isolation is unavailable (demucs-onnx not installed).")

    # Sample rate used when writing the cleaned output; defaults to 44.1 kHz if
    # the file's metadata cannot be read.
    # NOTE(review): this assumes the separator returns audio at the input
    # file's sample rate - confirm against the demucs-onnx API.
    try:
        sr = sf.info(audio_path).samplerate
    except Exception:  # noqa: BLE001
        sr = 44100

    # STOPGAP: trim long references to a short window so CPU separation time is
    # bounded (module notes: Demucs runtime ~linear in clip length). We
    # separate the trimmed window; if anything in the trim path fails we fall
    # back to the full clip so cleaning never hard-breaks.
    sep_input = audio_path
    trim_path = None
    try:
        info = sf.info(audio_path)
        max_frames = int(_CLEAN_TRIM_SECONDS * info.samplerate)
        if info.frames > max_frames:
            # ENERGY-AWARE WINDOW: don't blindly take the FIRST _CLEAN_TRIM_SECONDS.
            # Real uploads often open with a quiet lead-in (silence, breath, a
            # greeting), so a fixed leading slice can hand the speaker
            # conditioner a near-silent window and starve the clone. Read the
            # whole clip, find the highest-energy contiguous window of the trim
            # length, and separate THAT. If the scan raises, the except below
            # separates the full clip instead.
            full, file_sr = sf.read(audio_path, dtype="float32")
            # Collapse a 2-D (frames, channels) read to mono for the scan only;
            # the slice written below keeps the original channels.
            mono = full.mean(axis=1) if full.ndim == 2 else full
            win = int(_CLEAN_TRIM_SECONDS * file_sr)
            best_start = 0
            if mono.size > win:
                step = max(1, int(0.25 * file_sr))  # 0.25s hop is plenty
                power = mono.astype(np.float64) ** 2
                # Prefix sums make each window's energy an O(1) difference.
                csum = np.concatenate([[0.0], np.cumsum(power)])
                best_energy = -1.0
                for start in range(0, mono.size - win + 1, step):
                    energy = csum[start + win] - csum[start]
                    # Strict '>' keeps the earliest window on ties.
                    if energy > best_energy:
                        best_energy = energy
                        best_start = start
            data = full[best_start:best_start + win]
            trim_path = os.path.join(tempfile.gettempdir(), f"cleantrim_{uuid.uuid4().hex}.wav")
            sf.write(trim_path, data, file_sr)
            sep_input = trim_path
            print(
                f"Trimmed reference for cleaning: {info.frames/info.samplerate:.1f}s "
                f"-> {_CLEAN_TRIM_SECONDS:.1f}s (energy window @ {best_start/file_sr:.1f}s)"
            )
    except Exception as e:  # noqa: BLE001
        print(f"WARNING: reference trim failed, separating full clip: {e}")
        sep_input = audio_path

    # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
    # The finally removes the temporary trimmed clip whether or not separation
    # succeeds; a separation error itself is not caught here.
    try:
        vocals = separate_stem(sep_input, "vocals", providers="cpu")  # presumably (channels, samples) - confirm
    finally:
        if trim_path and os.path.exists(trim_path):
            try:
                os.remove(trim_path)
            except OSError:
                pass
    vocals = np.asarray(vocals, dtype=np.float32)
    if vocals.ndim == 2:
        vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
    # Only scale DOWN when the stem exceeds full scale; quieter audio is left as is.
    peak = float(np.max(np.abs(vocals))) if vocals.size else 0.0
    if peak > 1.0:
        vocals = vocals / peak

    # Unique per call: `random` may be seeded deterministically elsewhere, so two
    # callers could otherwise derive the same filename and clobber each other's
    # cleaned reference. uuid4 is independent of the seeded RNG.
    out_path = os.path.join(tempfile.gettempdir(), f"isolated_{uuid.uuid4().hex}.wav")
    sf.write(out_path, vocals, sr)
    print(f"Isolated voice -> {out_path} ({len(vocals)/sr:.1f}s @ {sr}Hz)")
    return out_path


def isolate_voice_ui(audio_path: str):
    """Gradio handler for the cleaned-reference preview (api_name=/isolate_voice).

    Returns the path produced by ``isolate_voice``; raises ``gr.Error`` when no
    reference clip was supplied.
    """
    if audio_path:
        return isolate_voice(audio_path)
    raise gr.Error("Upload a reference clip first.")


def default_audio_for_ui(lang: str):
    """Return the built-in sample clip URL for ``lang``, or None if it has none."""
    sample = LANGUAGE_CONFIG.get(lang, {})
    return sample.get("audio")


def default_text_for_ui(lang: str) -> str:
    """Return the built-in sample text for ``lang``, or "" if it has none."""
    sample = LANGUAGE_CONFIG.get(lang, {})
    return sample.get("text", "")


def split_into_chunks(text: str, max_chars: int = CHUNK_CHARS):
    """Split text into <= max_chars chunks, preferring sentence boundaries.

    Whitespace is collapsed first. Sentences are packed greedily into chunks;
    a single sentence longer than the budget is hard-split on spaces (or
    mid-word when it contains no space). Chunks are always returned in the
    order the text was written.

    Args:
        text: Text to split; ``None`` or blank yields an empty list.
        max_chars: Maximum length of each returned chunk.

    Returns:
        A list of non-empty, stripped chunks, each at most ``max_chars`` long.

    Raises:
        ValueError: If the text needs splitting and ``max_chars`` is < 1
            (the hard-split loop could never make progress).
    """
    text = " ".join((text or "").split())
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    # Split after Latin or CJK sentence terminators followed by whitespace.
    # \u3002 \uff01 \uff1f are the ideographic full stop and the full-width
    # '!' and '?'; escapes keep the pattern immune to file-encoding damage.
    sentences = re.split(r"(?<=[.!?\u3002\uff01\uff1f])\s+", text)
    chunks, cur = [], ""
    for sent in sentences:
        if len(sent) > max_chars:
            # Flush the pending chunk FIRST, otherwise the pieces of this long
            # sentence would be emitted ahead of text that precedes them.
            if cur:
                chunks.append(cur.strip())
                cur = ""
            # A single sentence longer than the budget: hard-split on spaces.
            while len(sent) > max_chars:
                window = sent[:max_chars]
                head = window.rsplit(" ", 1)[0] or window
                chunks.append(head.strip())
                sent = sent[len(head):].strip()
        if not cur:
            cur = sent
        elif len(cur) + 1 + len(sent) <= max_chars:
            cur = f"{cur} {sent}"
        else:
            chunks.append(cur.strip())
            cur = sent
    if cur.strip():
        chunks.append(cur.strip())
    return [c for c in chunks if c]


def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
    """Return a background-free copy of ``ref`` when cleaning is requested.

    ``ref`` is handed back untouched when cleaning is disabled, when ``ref`` is
    empty, or when isolation fails (in which case a Gradio warning is raised
    for the user instead of an error).
    """
    if not clean_reference or not ref:
        return ref
    try:
        cleaned = isolate_voice(ref)
    except Exception as e:  # noqa: BLE001
        gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
        return ref
    return cleaned


@spaces.GPU(duration=120)
def clone_and_speak(
    text: str,
    language_id: str = "en",
    audio_prompt_path: str = None,
    exaggeration: float = DEFAULT_EXAGGERATION,
    cfg_weight: float = DEFAULT_CFG_WEIGHT,
    temperature: float = DEFAULT_TEMPERATURE,
    seed: int = 0,
    clean_reference: bool = False,
    repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
    min_p: float = DEFAULT_MIN_P,
    top_p: float = DEFAULT_TOP_P,
):
    """
    Clone the voice in `audio_prompt_path` and speak `text` in language `language_id`.

    Args:
        text: text to synthesize (long scripts are auto-chunked).
        language_id: language code (en, fr, de, es, it, pt, hi, ja, zh, ...).
            Case-insensitive; a missing value is treated as "en".
        audio_prompt_path: path/URL to a reference voice clip. If omitted, a
            built-in sample voice for the language is used.
        exaggeration: expressiveness (0.25-2.0; ~0.4 = neutral/faithful clone).
        cfg_weight: CFG / pacing (0.0-1.0; lower ~0.3 = faster pace, 0.0 for
            cross-lingual transfer; 0.5 = balanced default).
        temperature: sampling randomness (0.05-2.0; lower = more consistent).
        seed: 0 for random, otherwise reproducible.
        clean_reference: if True, isolate the voice (remove background music/
            noise) from the uploaded reference before cloning.
        repetition_penalty: discourages repeated tokens (model default 2.0).
        min_p: min-p nucleus floor (model default 0.05).
        top_p: top-p nucleus threshold (model default 1.0).

    Returns:
        (sample_rate, waveform) tuple consumable by gr.Audio.

    Raises:
        RuntimeError: if the TTS model is not loaded.
        gr.Error: if `text` is blank, or no reference clip is available.
    """
    model = get_or_load_model()
    if model is None:
        raise RuntimeError("TTS model is not loaded.")

    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # Normalize the language code BEFORE the built-in sample lookup so the
    # lookup and the synthesis agree: a missing or upper-case code ("EN")
    # resolves to the same language in both places.
    lang = (language_id or "en").lower()

    ref = audio_prompt_path or default_audio_for_ui(lang)
    if not ref:
        raise gr.Error("Upload a reference audio clip to clone (or pick a language with a built-in sample).")

    # Optional preprocessing: clean the reference so conditionals are built from
    # isolated speech (only applies to a user-uploaded clip, not built-in samples).
    # A path that differs from the upload is a temp file created for this
    # request; remember it so it can be removed once synthesis is done.
    cleaned_ref = None
    if audio_prompt_path:
        ref = _maybe_clean_reference(ref, clean_reference)
        if ref != audio_prompt_path:
            cleaned_ref = ref

    try:
        chunks = split_into_chunks(text)
        print(f"Cloning voice | lang={lang} | chunks={len(chunks)} | clean_ref={clean_reference} | ref={ref}")

        sr = model.sr
        silence = np.zeros(int(0.15 * sr), dtype=np.float32)  # 150 ms gap between chunks
        pieces = []

        # CRITICAL SECTION - hold the model exclusively for this whole request.
        # `prepare_conditionals` mutates the shared `model.conds`; reusing it across
        # the chunk loop is only safe if no other caller can re-prepare the model in
        # between. The lock makes that guarantee even under concurrent Gradio
        # requests, so a caller's voice can never bleed into another's clip. RNG
        # seeding lives inside the lock too, since it perturbs shared generator state
        # that `generate` consumes.
        with _MODEL_LOCK:
            if seed and int(seed) != 0:
                set_seed(int(seed))

            # Prepare speaker conditionals ONCE from the reference, then reuse across
            # chunks so the cloned identity stays consistent for the whole script.
            model.prepare_conditionals(ref, exaggeration=exaggeration)

            for i, chunk in enumerate(chunks):
                wav = model.generate(
                    chunk,
                    language_id=lang,
                    audio_prompt_path=None,  # reuse prepared conditionals
                    exaggeration=exaggeration,
                    cfg_weight=cfg_weight,
                    temperature=temperature,
                    repetition_penalty=repetition_penalty,
                    min_p=min_p,
                    top_p=top_p,
                )
                arr = wav.squeeze(0).detach().cpu().numpy().astype(np.float32)
                pieces.append(arr)
                if i != len(chunks) - 1:
                    pieces.append(silence)
    finally:
        # Best-effort cleanup of the per-request cleaned reference; without it
        # every cleaned request leaves a WAV behind in the temp directory.
        if cleaned_ref is not None:
            try:
                os.remove(cleaned_ref)
            except OSError:
                pass

    full = np.concatenate(pieces) if pieces else np.zeros(1, dtype=np.float32)
    print("Generation complete.")
    return (sr, full)


def on_language_change(lang):
    """Return the (sample audio, sample text) pair shown when the language changes."""
    sample_audio = default_audio_for_ui(lang)
    sample_text = default_text_for_ui(lang)
    return sample_audio, sample_text


# Clone-first UI: inputs on the left, synthesized audio on the right.
# User-visible labels below had mis-encoded characters (emoji, dashes, circled
# digits, the "approximately" sign); they are restored to the intended text.
with gr.Blocks(title="Voice Clone Bench") as demo:
    gr.Markdown(
        """
        # 🎙️ Voice Clone Bench — Chatterbox (zero-shot)
        Upload a **reference voice**, type **any text**, get speech **in that cloned voice**.
        Built to A/B against ElevenLabs. Model: Chatterbox Multilingual (Resemble AI, MIT).
        """
    )
    with gr.Row():
        with gr.Column():
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="① Reference voice to clone (5–20s clean speech). Empty = built-in sample.",
                value=default_audio_for_ui("en"),
            )
            clean_reference = gr.Checkbox(
                value=False,
                label="🧹 Remove background audio from reference (isolate voice before cloning)",
                info="Strips music/noise with HT-Demucs so the clone is built from clean speech.",
            )
            preview_btn = gr.Button("🧹 Preview cleaned reference", size="sm")
            cleaned_preview = gr.Audio(label="Isolated voice (preview)", visible=True)
            language_id = gr.Dropdown(
                choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                value="en",
                label="② Language",
            )
            text = gr.Textbox(
                value=default_text_for_ui("en"),
                label="③ Text to speak (long scripts are auto-chunked)",
                lines=5,
                max_lines=20,
            )
            with gr.Accordion("Cloning controls (tuned for faithful voice cloning)", open=True):
                exaggeration = gr.Slider(
                    0.0, 2.0, step=0.05, value=DEFAULT_EXAGGERATION,
                    label="Exaggeration — lower = more neutral/faithful (≈0.4); 0.7+ = expressive",
                )
                cfg_weight = gr.Slider(
                    0.0, 1.0, step=0.05, value=DEFAULT_CFG_WEIGHT,
                    label="CFG / Pace — 0.5 balanced; ~0.3 faster; 0.0 for cross-lingual",
                )
                temperature = gr.Slider(
                    0.05, 2.0, step=0.05, value=DEFAULT_TEMPERATURE,
                    label="Temperature — lower = more consistent/faithful (≈0.7)",
                )
                seed = gr.Number(value=0, label="Seed (0 = random)")
                with gr.Accordion("Sampling (advanced)", open=False):
                    repetition_penalty = gr.Slider(
                        1.0, 2.5, step=0.05, value=DEFAULT_REPETITION_PENALTY,
                        label="Repetition penalty (default 2.0)",
                    )
                    min_p = gr.Slider(0.0, 0.5, step=0.01, value=DEFAULT_MIN_P, label="min_p (default 0.05)")
                    top_p = gr.Slider(0.1, 1.0, step=0.05, value=DEFAULT_TOP_P, label="top_p (default 1.0)")
            run_btn = gr.Button("Clone & Speak", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Cloned speech output")

    # Switching language swaps in that language's built-in sample clip and text.
    language_id.change(fn=on_language_change, inputs=[language_id], outputs=[ref_wav, text], show_progress=False)

    # Preview of the background-free reference; also exposed as /isolate_voice.
    preview_btn.click(
        fn=isolate_voice_ui,
        inputs=[ref_wav],
        outputs=[cleaned_preview],
        api_name="isolate_voice",
    )

    # Main action; the input order must match clone_and_speak's parameter order.
    run_btn.click(
        fn=clone_and_speak,
        inputs=[
            text, language_id, ref_wav, exaggeration, cfg_weight, temperature, seed,
            clean_reference, repetition_penalty, min_p, top_p,
        ],
        outputs=[audio_output],
        api_name="clone",
    )

if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting), then start the server
    # with the MCP endpoint switched on.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(mcp_server=True)