Spaces:
Runtime error
Runtime error
| import os | |
| os.environ.setdefault("NUMBA_DISABLE_CUDA", "1") | |
| os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") | |
| import ctypes | |
| import glob | |
| import site | |
| def _preload_cudart13(): | |
| # zonos2's JIT kernels are compiled by the image's CUDA 13 nvcc and link | |
| # libcudart.so.13, but torch 2.9.1 (cu128) only ships cudart 12. | |
| patterns = [f"{sp}/nvidia/**/libcudart.so.13*" for sp in site.getsitepackages()] | |
| patterns += [ | |
| "/usr/local/cuda*/targets/*/lib/libcudart.so.13*", | |
| "/usr/local/cuda*/lib64/libcudart.so.13*", | |
| "/usr/lib/x86_64-linux-gnu/libcudart.so.13*", | |
| ] | |
| for pattern in patterns: | |
| for lib in sorted(glob.glob(pattern, recursive=True)): | |
| ctypes.CDLL(lib, mode=ctypes.RTLD_GLOBAL) | |
| return | |
| _preload_cudart13() | |
| import spaces | |
| import hashlib | |
| import random | |
| import threading | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from huggingface_hub import snapshot_download | |
# Hub repositories fetched at startup.
MODEL_REPO = "Zyphra/ZONOS2"
SPEAKER_REPO = "marksverdhei/Qwen3-Voice-Embedding-12Hz-1.7B"

# Output sample rate (Hz) and the number of codec frames per second of audio.
SAMPLE_RATE = 44100
FRAMES_PER_SECOND = SAMPLE_RATE / 512  # DAC hop length

# Download everything at import time; only config/weight files are pulled
# from the model repo.
MODEL_PATH = snapshot_download(
    MODEL_REPO,
    allow_patterns=["*.json", "*.pth", "*.pt", "*.yaml"],
)
snapshot_download(SPEAKER_REPO)

import dac as _dac

_dac.utils.download(model_type="44khz")
| from zonos2.message.tts import TTSSamplingParams, TTSUserMsg | |
| from zonos2.tokenizer.textnorm import TTSTextNormalizer | |
| from zonos2.tts import TTSLLM | |
| import socket | |
| from zonos2.engine.config import EngineConfig | |
| _DIST_PORT = None | |
| def _distributed_addr(self): | |
| # Upstream hardcodes tcp://127.0.0.1:23333; when ZeroGPU retries a call in | |
| # a fresh worker while another worker is still mid-init, the fixed port | |
| # collides with EADDRINUSE. Pick a free port once per process instead. | |
| global _DIST_PORT | |
| if _DIST_PORT is None: | |
| with socket.socket() as s: | |
| s.bind(("127.0.0.1", 0)) | |
| _DIST_PORT = s.getsockname()[1] | |
| return f"tcp://127.0.0.1:{_DIST_PORT}" | |
# Replace the engine config's address attribute with the free-port picker.
EngineConfig.distributed_addr = property(_distributed_addr)

import zonos2.engine.engine as zonos2_engine
from zonos2.models.weight import _normalize_zonos2_state_dict

# Deserialize the 15.3 GB checkpoint once in the main process (mmap keeps it
# page-cache backed); forked GPU workers inherit it copy-on-write, so cold
# engine init skips the ~17s torch.load and only pays the host->device copy.
# NOTE(review): weights_only=False unpickles arbitrary objects; this is only
# acceptable while MODEL_REPO is trusted.
_STATE_DICT = torch.load(
    f"{MODEL_PATH}/model.pth",
    map_location="cpu",
    weights_only=False,
    mmap=True,
)
# Unwrap the tensors when they are nested under a "model" key, then normalize.
_STATE_DICT = _normalize_zonos2_state_dict(
    _STATE_DICT["model"] if "model" in _STATE_DICT else _STATE_DICT
)


def _preloaded_checkpoint_weight(model_path, device):
    """Return the preloaded state dict with every tensor moved to ``device``.

    ``model_path`` is accepted for signature compatibility with the loader
    being replaced and is not used.
    """
    on_device = {}
    for name, tensor in _STATE_DICT.items():
        on_device[name] = tensor.to(device)
    return on_device


# Make the engine use the preloaded weights instead of reading from disk.
zonos2_engine.load_checkpoint_weight = _preloaded_checkpoint_weight
# UI label -> language code passed to the text normalizer.
LANGUAGES = {
    "English (US)": "en_us",
    "English (UK)": "en_gb",
    "French": "fr_fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese (BR)": "pt_br",
    "Japanese": "ja",
    "Mandarin": "cmn",
    "Korean": "ko",
}

# Speaking-rate choices; the position in this list is the bucket index sent
# to the model, so the order must not change.
SPEAKING_RATE_BUCKETS = [
    "0-8",
    "8-11",
    "11-14",
    "14-17",
    "17-21",
    "21-28",
    "28-40",
    "40+",
]
RATE_CHOICES = ["Auto", *SPEAKING_RATE_BUCKETS]

# Largest seed offered by the random-seed option.
MAX_SEED = np.iinfo(np.int32).max

# Shared normalizer; warmed up in the background so startup is not blocked.
NORMALIZER = TTSTextNormalizer()
threading.Thread(target=NORMALIZER.warmup, daemon=True).start()
class ZonosTTSLLM(TTSLLM):
    """TTSLLM with speaker-embedding conditioning plumbed into the offline path.

    The conditioning attributes set on the instance are copied onto every
    ``TTSUserMsg`` returned by ``offline_receive_msg``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Conditioning defaults, overwritten by the caller before each request.
        self.accurate_mode = True
        self.clean_speaker_background = False
        self.speaker_embedding = None

    def offline_receive_msg(self, blocking: bool = False):
        """Fetch messages from the parent and stamp conditioning onto user messages."""
        received = super().offline_receive_msg(blocking)
        for item in received:
            if not isinstance(item, TTSUserMsg):
                continue
            item.speaker_embedding = self.speaker_embedding
            item.clean_speaker_background = self.clean_speaker_background
            item.accurate_mode = self.accurate_mode
        return received
# Lazily created model instances, keyed by "embedder" and "tts".
MODELS = {}
# Speaker embeddings keyed by a digest of the reference audio.
EMBEDDING_CACHE = {}


def _get_models():
    """Return the ``MODELS`` dict, building the embedder and TTS engine on first use."""
    if "tts" in MODELS:
        return MODELS

    # Imported here rather than at module import time.
    from zonos2.models.speaker_cloning import Qwen3SpeakerEmbedding

    MODELS["embedder"] = Qwen3SpeakerEmbedding(device="cuda")
    engine_options = {
        "model_path": MODEL_PATH,
        "cuda_graph_max_bs": 4,
        "num_page_override": 65536,
    }
    MODELS["tts"] = ZonosTTSLLM(**engine_options)
    return MODELS
def _embed_speaker(models, speaker_audio):
    """Return the 2048-element speaker embedding for a reference clip, cached.

    Args:
        models: Mapping with an ``"embedder"`` entry, called as
            ``embedder(waveform, sample_rate)``.
        speaker_audio: ``(sample_rate, samples)`` pair; ``samples`` is a 1D
            mono array or a 2D ``(samples, channels)`` array.

    Returns:
        A float32 CPU tensor of shape ``(2048,)``.

    Raises:
        gr.Error: If the clip is empty, or the embedder returns no output
            with exactly 2048 elements.
    """
    embedding_dim = 2048
    sr, wav = speaker_audio
    wav = np.asarray(wav)
    if wav.size == 0:
        raise gr.Error("Could not compute a speaker embedding from the reference audio.")

    # dtype and shape are part of the key: raw bytes alone cannot tell, say,
    # a stereo clip from a mono clip twice as long.  Feeding the hash
    # incrementally also avoids copying the whole buffer to append the rate.
    digest = hashlib.sha256(wav.tobytes())
    digest.update(f"|{wav.dtype.str}|{wav.shape}|{sr}".encode())
    key = digest.hexdigest()
    cached = EMBEDDING_CACHE.get(key)
    if cached is not None:
        return cached

    if np.issubdtype(wav.dtype, np.signedinteger):
        # Scale signed PCM to [-1, 1): int16 -> /32768, int32 -> /2147483648.
        full_scale = float(np.iinfo(wav.dtype).max) + 1.0
        wav = wav.astype(np.float32) / full_scale
    else:
        wav = wav.astype(np.float32)

    if wav.ndim == 2:
        wav = wav.T  # (samples, channels) -> (channels, samples)
    else:
        # The embedder's reflect-pad requires a 2D (channels, samples) input;
        # mono uploads arrive 1D.
        wav = wav[None, :]

    wav_t = torch.from_numpy(wav)
    embedder = models["embedder"]
    with torch.inference_mode():
        output = embedder(wav_t, sr)

    # The embedder may return one tensor or a tuple; take the first output
    # that has exactly the expected number of elements.
    candidates = output if isinstance(output, tuple) else (output,)
    for candidate in candidates:
        candidate = candidate.squeeze(0).to(dtype=torch.float32, device="cpu")
        if candidate.numel() == embedding_dim:
            embedding = candidate.reshape(embedding_dim)
            EMBEDDING_CACHE[key] = embedding
            return embedding
    raise gr.Error("Could not compute a speaker embedding from the reference audio.")
def normalize_text(text, language, apply_normalization):
    """Validate the input text and optionally convert it to its spoken form.

    Args:
        text: Raw text from the UI; ``None`` is treated as empty.
        language: A key of ``LANGUAGES``.
        apply_normalization: When false, the stripped text is returned as is.

    Raises:
        gr.Error: If the stripped text is empty or longer than 5000 characters.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize.")
    if len(cleaned) > 5000:
        raise gr.Error("Text is too long — please keep it under 5000 characters.")
    if apply_normalization:
        return NORMALIZER.normalize(cleaned, LANGUAGES[language])
    return cleaned
| def _gpu_duration( | |
| normalized_text, speaker_audio, accurate_mode, clean_background, speaking_rate, max_seconds, *args | |
| ): | |
| # ~18s engine init + JIT/embedder headroom, decode measured at ~51 frames/s | |
| # (86.13 frames per audio second -> ~1.7x realtime). | |
| return 75 + 2 * float(max_seconds) | |
# Without this decorator nothing in the file requests a GPU for the handler;
# _gpu_duration supplies the per-call time budget.
@spaces.GPU(duration=_gpu_duration)
def generate(
    normalized_text,
    speaker_audio,
    accurate_mode,
    clean_background,
    speaking_rate,
    max_seconds,
    seed,
    randomize_seed,
    temperature,
    top_k,
    min_p,
    repetition_penalty,
    progress=gr.Progress(),
):
    """Synthesize speech for already-normalized text.

    Args:
        normalized_text: Text to speak.
        speaker_audio: ``(sample_rate, samples)`` reference clip, or ``None``
            to generate without a speaker embedding.
        accurate_mode: Forwarded to the engine as ``accurate_mode``.
        clean_background: Forwarded as ``clean_speaker_background``.
        speaking_rate: ``"Auto"`` or an entry of ``SPEAKING_RATE_BUCKETS``.
        max_seconds: Upper bound on output length, converted to frames.
        seed: Sampling seed; replaced by a random one when ``randomize_seed``
            is set or the field is empty.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]``.
        temperature, top_k, min_p, repetition_penalty: Sampling parameters.
        progress: Gradio progress reporter.

    Returns:
        ``((SAMPLE_RATE, audio), seed)`` where ``audio`` is a float32 array
        and ``seed`` is the seed actually used.

    Raises:
        gr.Error: If the text is empty or the engine returns no audio.
    """
    if not normalized_text:
        raise gr.Error("Please enter some text to synthesize.")

    models = _get_models()
    tts = models["tts"]
    # The scheduler pins its CUDA stream thread-locally at init, but each call
    # may run in a new thread; re-pin or run_forever's stream assert fails.
    torch.cuda.set_stream(tts.stream)

    # A cleared Number field arrives as None; treat it like "randomize".
    if randomize_seed or seed is None:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)

    progress(0.1, desc="Embedding reference voice...")
    embedding = _embed_speaker(models, speaker_audio) if speaker_audio is not None else None
    # Conditioning is passed via attributes that the engine wrapper copies
    # onto each user message.
    tts.speaker_embedding = embedding
    tts.clean_speaker_background = bool(clean_background)
    tts.accurate_mode = bool(accurate_mode)

    sampling_params = TTSSamplingParams(
        temperature=float(temperature),
        topk=int(top_k),
        min_p=float(min_p),
        repetition_penalty=float(repetition_penalty),
        max_tokens=int(float(max_seconds) * FRAMES_PER_SECOND),
        seed=seed,
    )
    rate_bucket = None if speaking_rate == "Auto" else SPEAKING_RATE_BUCKETS.index(speaking_rate)

    progress(0.3, desc="Generating speech...")
    result = tts.generate_one(
        normalized_text,
        sampling_params,
        speaking_rate_bucket=rate_bucket,
    )
    if not result["audio"]:
        raise gr.Error("Generation produced no audio — try a different seed or shorter text.")
    # Copy so the returned array owns its memory instead of viewing the buffer.
    audio = np.frombuffer(result["audio"], dtype=np.float32).copy()
    return (SAMPLE_RATE, audio), seed
# Page-level CSS: cap the app width and center it.
css = """
.gradio-container {max-width: 960px !important; margin: 0 auto !important;}
"""

with gr.Blocks(css=css, title="Zonos 2") as demo:
    gr.Markdown(
        """
# 🗣️ Zonos 2
[Zyphra's ZONOS2](https://huggingface.co/Zyphra/ZONOS2) — an expressive multilingual
text-to-speech model with high-fidelity voice cloning, trained on 6M+ hours of speech.
Upload or record a few seconds of a voice and it will speak your text.
[Blog](https://www.zyphra.com/our-work/zonos2) · [Code](https://github.com/Zyphra/ZONOS2)
"""
    )
    with gr.Row():
        # Left column: text, language, reference voice and the trigger button.
        with gr.Column():
            text = gr.Textbox(
                label="Text",
                lines=4,
                value="Hello! I am Zonos 2, a text to speech model by Zyphra. I can clone anyone's voice from just a few seconds of audio.",
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()), value="English (US)", label="Language"
            )
            speaker_audio = gr.Audio(
                label="Reference voice (upload or record)",
                type="numpy",
                sources=["upload", "microphone"],
                value="voices/AmericanFemale.mp3",
            )
            gr.Examples(
                examples=[
                    ["voices/AmericanFemale.mp3"],
                    ["voices/AmericanMale.mp3"],
                    ["voices/BritishFemale.mp3"],
                ],
                inputs=[speaker_audio],
                label="Default voices",
            )
            generate_btn = gr.Button("Generate", variant="primary")
        # Right column: generated audio and the tuning controls.
        with gr.Column():
            audio_out = gr.Audio(label="Generated speech", type="numpy")
            with gr.Accordion("Advanced settings", open=False):
                accurate_mode = gr.Checkbox(
                    value=True,
                    label="Accurate mode",
                    info="Disable for more expressive (less literal) delivery",
                )
                clean_background = gr.Checkbox(
                    value=False,
                    label="Clean reference audio",
                    info="Mark the reference recording as having a clean background",
                )
                normalize_chk = gr.Checkbox(
                    value=True,
                    label="Normalize text",
                    info='Convert written forms to spoken forms ("$5" → "five dollars")',
                )
                speaking_rate = gr.Dropdown(
                    choices=RATE_CHOICES, value="Auto", label="Speaking rate (phonemes/sec)"
                )
                max_seconds = gr.Slider(
                    minimum=2, maximum=60, value=30, step=1, label="Max audio length (seconds)"
                )
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=1.15, step=0.05, label="Temperature"
                )
                top_k = gr.Slider(minimum=1, maximum=1024, value=106, step=1, label="Top-k")
                min_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.18, step=0.01, label="Min-p")
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.2, step=0.05, label="Repetition penalty"
                )
                seed = gr.Number(value=42, precision=0, label="Seed")
                randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
    # Carries the validated (and optionally normalized) text between the two steps.
    normalized_text = gr.State("")
    gr.Examples(
        examples=[
            ["Did you know? The sun is actually a giant ball of plasma — over one million Earths could fit inside it!", "English (US)"],
            ["On the 3rd of March 2026, tickets cost $5.32 each.", "English (US)"],
            ["Bonjour ! Je peux parler plusieurs langues avec une voix naturelle et expressive.", "French"],
            ["私は数秒の音声からどんな声でも再現できます。", "Japanese"],
            ["¡Hola! Puedo clonar cualquier voz con solo unos segundos de audio.", "Spanish"],
        ],
        inputs=[text, language],
        label="Example texts",
    )
    # Step 1 validates/normalizes the text; step 2 synthesizes.  The chain uses
    # .success() rather than .then() so that a validation error stops here
    # instead of synthesizing whatever text is left in the State from before.
    generate_btn.click(
        fn=normalize_text,
        inputs=[text, language, normalize_chk],
        outputs=[normalized_text],
    ).success(
        fn=generate,
        inputs=[
            normalized_text,
            speaker_audio,
            accurate_mode,
            clean_background,
            speaking_rate,
            max_seconds,
            seed,
            randomize_seed,
            temperature,
            top_k,
            min_p,
            repetition_penalty,
        ],
        outputs=[audio_out, seed],
    )

demo.launch()