from __future__ import annotations import logging import os import urllib.request from pathlib import Path import gradio as gr import torch logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") logger = logging.getLogger(__name__) # ─── Device detection ───────────────────────────────────────────────────────── def _get_device() -> str: if torch.cuda.is_available(): return "cuda" if torch.backends.mps.is_available(): return "mps" return "cpu" DEVICE = _get_device() # bfloat16 works on CUDA/MPS; use float32 on CPU for stability DTYPE = torch.bfloat16 if DEVICE in ("cuda", "mps") else torch.float32 logger.info("Running on device: %s, dtype: %s", DEVICE, DTYPE) MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M" AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano" OUTPUT_DIR = Path("/tmp/moss-tts-output") SAMPLE_DIR = Path("/tmp/moss-tts-samples") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) SAMPLE_DIR.mkdir(parents=True, exist_ok=True) # ─── Load models ────────────────────────────────────────────────────────────── from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer logger.info("Loading TTS model: %s", MODEL_ID) tts_model = AutoModelForCausalLM.from_pretrained( MODEL_ID, trust_remote_code=True, torch_dtype=DTYPE, low_cpu_mem_usage=True, ) tts_model.eval() tts_model._set_attention_implementation("sdpa") logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID) audio_tokenizer = AutoModel.from_pretrained( AUDIO_TOKENIZER_ID, trust_remote_code=True, ) audio_tokenizer.eval() logger.info("Loading text tokenizer") text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) logger.info("Moving models to %s …", DEVICE) tts_model.to(DEVICE) audio_tokenizer.to(DEVICE) logger.info("All models ready.") # ─── Sample audio files ─────────────────────────────────────────────────────── SAMPLE_AUDIO: dict[str, str] = {} _SAMPLE_URLS = { "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav", "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav", "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav", } for _lang, _url in _SAMPLE_URLS.items(): _dest = SAMPLE_DIR / f"{_lang}_sample.wav" try: if not _dest.exists(): urllib.request.urlretrieve(_url, _dest) logger.info("Downloaded sample audio: %s", _lang) SAMPLE_AUDIO[_lang] = str(_dest) except Exception as _e: logger.warning("Failed to download sample audio %s: %s", _lang, _e) # ─── Example content ────────────────────────────────────────────────────────── EXAMPLE_TEXTS: dict[str, str] = { "English": ( "The biggest lesson that can be read from 70 years of AI research is that general methods " "that leverage computation are ultimately the most effective, and by a large margin." ), "Chinese": ( "欢迎关注模思智能、上海创智学院与复旦大学自然语言处理实验室。" "今天我们将为您带来最新的人工智能研究进展。" ), "French": ( "Bonjour et bienvenue dans notre émission quotidienne d'actualités. " "Nous vous présenterons les nouvelles les plus importantes de la journée." ), "Japanese": "本日はNHKニュースをご覧いただきありがとうございます。最新のニュースをお伝えします。", "German": ( "Willkommen zu unserem täglichen Nachrichtenüberblick. " "Wir berichten über die wichtigsten Ereignisse des Tages." ), "Spanish": ( "Bienvenidos al noticiero de la tarde. " "Aquí les presentamos las noticias más relevantes del día de hoy." ), "Korean": "안녕하세요, KBS 뉴스입니다. 오늘의 주요 뉴스를 전해드리겠습니다.", } LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = { "English": SAMPLE_AUDIO.get("en"), "Chinese": SAMPLE_AUDIO.get("zh"), "Japanese": SAMPLE_AUDIO.get("jp"), } # ─── Inference ──────────────────────────────────────────────────────────────── def generate_speech( text: str, reference_audio: str | None, max_new_frames: int, do_sample: bool, seed: int, ) -> str: if not text.strip(): raise gr.Error("Please enter text to synthesize.") if reference_audio is None: raise gr.Error("Please upload reference audio for voice cloning.") seed_int = int(seed) if seed else 0 if seed_int != 0: torch.manual_seed(seed_int) if DEVICE == "cuda": torch.cuda.manual_seed(seed_int) output_path = str(OUTPUT_DIR / f"output_{os.getpid()}_{abs(hash(text)) % 1_000_000}.wav") try: result = tts_model.inference( text=text, output_audio_path=output_path, mode="voice_clone", prompt_audio_path=reference_audio, text_tokenizer=text_tokenizer, audio_tokenizer=audio_tokenizer, audio_tokenizer_type="moss-audio-tokenizer-nano", device=DEVICE, max_new_frames=int(max_new_frames), do_sample=bool(do_sample), use_kv_cache=True, voice_clone_max_text_tokens=75, ) logger.info( "Generated: %s sample_rate=%s", result.get("audio_path"), result.get("sample_rate"), ) return str(result["audio_path"]) except Exception as exc: logger.exception("TTS inference failed") raise gr.Error(str(exc)) from exc # ─── UI ─────────────────────────────────────────────────────────────────────── def on_example_select(key: str) -> tuple[str, str | None]: return EXAMPLE_TEXTS.get(key, ""), LANG_TO_SAMPLE_AUDIO.get(key) css = """ .gradio-container { max-width: 1000px !important; margin: 0 auto !important; } footer { display: none !important; } """ with gr.Blocks(title="MOSS-TTS-Nano") as demo: gr.Markdown( """# MOSS-TTS-Nano **Multilingual 0.1B TTS with zero-shot voice cloning** — 20 languages — 48 kHz stereo Upload a reference audio clip (3–15 sec) to clone the voice, then enter text in any of the 20 supported languages. Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)""" ) with gr.Row(equal_height=False): with gr.Column(scale=3): example_picker = gr.Dropdown( choices=list(EXAMPLE_TEXTS.keys()), value="English", label="Example language", info="Pre-fills text and loads a sample reference audio", ) text_input = gr.Textbox( label="Text to synthesize", value=EXAMPLE_TEXTS["English"], lines=5, placeholder="Enter text in any supported language…", ) ref_audio = gr.Audio( label="Reference audio (voice to clone)", type="filepath", sources=["upload", "microphone"], value=SAMPLE_AUDIO.get("en"), ) with gr.Accordion("Advanced settings", open=False): max_frames_slider = gr.Slider( minimum=64, maximum=512, value=375, step=16, label="Max new frames", info="Controls the maximum length of generated audio", ) do_sample_cb = gr.Checkbox( value=True, label="Sampling", info="Uncheck for deterministic (but potentially repetitive) output", ) seed_input = gr.Number( value=0, precision=0, label="Seed (0 = random)", ) generate_btn = gr.Button("Generate Speech", variant="primary", size="lg") with gr.Column(scale=2): output_audio = gr.Audio(label="Generated speech", type="filepath") gr.Markdown( """**Supported languages** Chinese · English · German · Spanish · French · Japanese · Italian · Hungarian · Korean · Russian · Persian · Arabic · Polish · Portuguese · Czech · Danish · Swedish · Greek · Turkish **Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) · [GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) · [Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)""" ) example_picker.change( fn=on_example_select, inputs=example_picker, outputs=[text_input, ref_audio], ) generate_btn.click( fn=generate_speech, inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input], outputs=output_audio, ) demo.launch(show_error=True, ssr_mode=False)