# MOSS-TTS-Nano / app.py
# (Hugging Face Space page header preserved as comments so the file parses.)
# Last commit: victor (HF Staff) — 749bf48
# "Fix: set attn_implementation=sdpa (flash_attn not installed)"
from __future__ import annotations
import logging
import os
import urllib.request
from pathlib import Path
import gradio as gr
import torch
try:
    import spaces  # real ZeroGPU decorator when running on Hugging Face Spaces
except ImportError:
    # Local / non-Spaces environment: provide a no-op stand-in so that
    # `spaces.GPU` decorations still work.
    class _SpacesFallback:
        @staticmethod
        def GPU(*args, **kwargs):
            """No-op replacement for spaces.GPU.

            Supports both parameterized usage (`@spaces.GPU(duration=...)`)
            and bare usage (`@spaces.GPU`), mirroring the real decorator.
            The original shim returned the inner decorator for bare usage,
            which would replace the decorated function with the decorator.
            """
            # Bare usage: the function itself is the sole positional argument.
            if args and callable(args[0]) and not kwargs:
                return args[0]

            def decorator(func):
                return func

            return decorator

    spaces = _SpacesFallback()
# Logging: timestamped INFO-level records for startup/progress tracing.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Hugging Face Hub repo ids for the TTS model and its audio tokenizer.
MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M"
AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano"

# Scratch directories for generated audio and downloaded reference samples.
OUTPUT_DIR = Path("/tmp/moss-tts-output")
SAMPLE_DIR = Path("/tmp/moss-tts-samples")
for _scratch_dir in (OUTPUT_DIR, SAMPLE_DIR):
    _scratch_dir.mkdir(parents=True, exist_ok=True)
# --- Load models at module level (ZeroGPU packs them at startup) -------------
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

logger.info("Loading TTS model: %s", MODEL_ID)
tts_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
tts_model.eval()
# flash_attn is not installed in this Space; sdpa ships with torch and is
# always available.  NOTE(review): `_set_attention_implementation` is a
# private transformers API -- confirm it survives transformers upgrades.
tts_model._set_attention_implementation("sdpa")

logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID)
audio_tokenizer = AutoModel.from_pretrained(
    AUDIO_TOKENIZER_ID,
    trust_remote_code=True,
)
audio_tokenizer.eval()

logger.info("Loading text tokenizer")
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

logger.info("Moving models to CUDA (ZeroGPU packing) โ€ฆ")
try:
    tts_model.to("cuda")
    audio_tokenizer.to("cuda")
except (AssertionError, RuntimeError) as _exc:
    # CPU-only dev machine: generate_speech() already picks the device via
    # torch.cuda.is_available(), so a failed device move should not abort
    # startup.  On ZeroGPU the moves succeed and this branch never runs.
    logger.warning("Could not move models to CUDA (%s); staying on CPU.", _exc)
logger.info("All models ready.")
# --- Sample audio files ------------------------------------------------------
# Reference clips for voice cloning, fetched from the project repository.
# A failed download is non-fatal: that language simply has no preset sample.
SAMPLE_AUDIO: dict[str, str] = {}
_SAMPLE_URLS = {
    "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav",
    "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav",
    "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav",
}
for _lang, _url in _SAMPLE_URLS.items():
    _dest = SAMPLE_DIR / f"{_lang}_sample.wav"
    try:
        if _dest.exists():
            # Already cached from a previous start-up; skip the download.
            SAMPLE_AUDIO[_lang] = str(_dest)
            continue
        urllib.request.urlretrieve(_url, _dest)
        logger.info("Downloaded sample audio: %s", _lang)
        SAMPLE_AUDIO[_lang] = str(_dest)
    except Exception as _e:
        logger.warning("Failed to download sample audio %s: %s", _lang, _e)
# โ”€โ”€โ”€ Example content โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Per-language demo sentences used to pre-fill the synthesis text box.
# Keys are the display names shown in the "Example language" dropdown and
# must match the keys of LANG_TO_SAMPLE_AUDIO for languages with samples.
EXAMPLE_TEXTS: dict[str, str] = {
    "English": (
        "The biggest lesson that can be read from 70 years of AI research is that general methods "
        "that leverage computation are ultimately the most effective, and by a large margin."
    ),
    "Chinese": (
        "ๆฌข่ฟŽๅ…ณๆณจๆจกๆ€ๆ™บ่ƒฝใ€ไธŠๆตทๅˆ›ๆ™บๅญฆ้™ขไธŽๅคๆ—ฆๅคงๅญฆ่‡ช็„ถ่ฏญ่จ€ๅค„็†ๅฎž้ชŒๅฎคใ€‚"
        "ไปŠๅคฉๆˆ‘ไปฌๅฐ†ไธบๆ‚จๅธฆๆฅๆœ€ๆ–ฐ็š„ไบบๅทฅๆ™บ่ƒฝ็ ”็ฉถ่ฟ›ๅฑ•ใ€‚"
    ),
    "French": (
        "Bonjour et bienvenue dans notre รฉmission quotidienne d'actualitรฉs. "
        "Nous vous prรฉsenterons les nouvelles les plus importantes de la journรฉe."
    ),
    "Japanese": "ๆœฌๆ—ฅใฏNHKใƒ‹ใƒฅใƒผใ‚นใ‚’ใ”่ฆงใ„ใŸใ ใใ‚ใ‚ŠใŒใจใ†ใ”ใ–ใ„ใพใ™ใ€‚ๆœ€ๆ–ฐใฎใƒ‹ใƒฅใƒผใ‚นใ‚’ใŠไผใˆใ—ใพใ™ใ€‚",
    "German": (
        "Willkommen zu unserem tรคglichen Nachrichtenรผberblick. "
        "Wir berichten รผber die wichtigsten Ereignisse des Tages."
    ),
    "Spanish": (
        "Bienvenidos al noticiero de la tarde. "
        "Aquรญ les presentamos las noticias mรกs relevantes del dรญa de hoy."
    ),
    "Korean": "์•ˆ๋…•ํ•˜์„ธ์š”, KBS ๋‰ด์Šค์ž…๋‹ˆ๋‹ค. ์˜ค๋Š˜์˜ ์ฃผ์š” ๋‰ด์Šค๋ฅผ ์ „ํ•ด๋“œ๋ฆฌ๊ฒ ์Šต๋‹ˆ๋‹ค.",
}
# Map dropdown display names to downloaded sample clips; languages without a
# sample clip (or whose download failed) resolve to None.
LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = {
    display_name: SAMPLE_AUDIO.get(short_code)
    for display_name, short_code in (
        ("English", "en"),
        ("Chinese", "zh"),
        ("Japanese", "jp"),
    )
}
# โ”€โ”€โ”€ Inference โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@spaces.GPU(duration=120)
def generate_speech(
    text: str,
    reference_audio: str | None,
    max_new_frames: int,
    do_sample: bool,
    seed: int,
) -> str:
    """Synthesize `text` in the voice of `reference_audio`; return the wav path.

    Args:
        text: Text to speak (any supported language).
        reference_audio: Filepath of the voice-cloning prompt clip.
        max_new_frames: Upper bound on generated audio frames (length cap).
        do_sample: True for sampling, False for deterministic decoding.
        seed: RNG seed; 0 (or falsy) leaves the RNG state untouched.

    Raises:
        gr.Error: On empty text, missing reference audio, or inference failure.
    """
    if not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload reference audio for voice cloning.")

    # gr.Number may deliver None/float; normalize to int, treating falsy as 0.
    seed_int = int(seed) if seed else 0
    if seed_int != 0:
        torch.manual_seed(seed_int)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed_int)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Random suffix instead of abs(hash(text)): hash() is identical for equal
    # text, so repeated or concurrent requests with the same input could
    # overwrite each other's output files.
    output_path = str(OUTPUT_DIR / f"output_{os.getpid()}_{os.urandom(4).hex()}.wav")
    try:
        result = tts_model.inference(
            text=text,
            output_audio_path=output_path,
            mode="voice_clone",
            prompt_audio_path=reference_audio,
            text_tokenizer=text_tokenizer,
            audio_tokenizer=audio_tokenizer,
            audio_tokenizer_type="moss-audio-tokenizer-nano",
            device=device,
            max_new_frames=int(max_new_frames),
            do_sample=bool(do_sample),
            use_kv_cache=True,
            voice_clone_max_text_tokens=75,
        )
        logger.info(
            "Generated: %s sample_rate=%s",
            result.get("audio_path"),
            result.get("sample_rate"),
        )
        # Trust the path reported by the model over our requested output_path.
        return str(result["audio_path"])
    except Exception as exc:
        # Surface the failure in the UI while keeping the traceback in logs.
        logger.exception("TTS inference failed")
        raise gr.Error(str(exc)) from exc
# โ”€โ”€โ”€ UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def on_example_select(key: str) -> tuple[str, str | None]:
    """Return the demo text and sample reference audio for language `key`."""
    example_text = EXAMPLE_TEXTS.get(key, "")
    sample_audio = LANG_TO_SAMPLE_AUDIO.get(key)
    return example_text, sample_audio
# Custom CSS: constrain the app width and hide the default Gradio footer.
css = """
.gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
footer { display: none !important; }
"""

# Fix: `css` was defined but never passed to gr.Blocks, so the styling
# silently had no effect.
with gr.Blocks(title="MOSS-TTS-Nano", css=css) as demo:
    gr.Markdown(
        """# MOSS-TTS-Nano
**Multilingual 0.1B TTS with zero-shot voice cloning** โ€” 20 languages โ€” 48 kHz stereo
Upload a reference audio clip (3โ€“15 sec) to clone the voice, then enter text in any of the 20 supported languages.
Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
    )
    with gr.Row(equal_height=False):
        with gr.Column(scale=3):
            # Left column: all synthesis inputs.
            example_picker = gr.Dropdown(
                choices=list(EXAMPLE_TEXTS.keys()),
                value="English",
                label="Example language",
                info="Pre-fills text and loads a sample reference audio",
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=EXAMPLE_TEXTS["English"],
                lines=5,
                placeholder="Enter text in any supported languageโ€ฆ",
            )
            ref_audio = gr.Audio(
                label="Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
                value=SAMPLE_AUDIO.get("en"),
            )
            with gr.Accordion("Advanced settings", open=False):
                max_frames_slider = gr.Slider(
                    minimum=64, maximum=512, value=375, step=16,
                    label="Max new frames",
                    info="Controls the maximum length of generated audio",
                )
                do_sample_cb = gr.Checkbox(
                    value=True,
                    label="Sampling",
                    info="Uncheck for deterministic (but potentially repetitive) output",
                )
                seed_input = gr.Number(
                    value=0, precision=0,
                    label="Seed (0 = random)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=2):
            # Right column: generated audio and reference material.
            output_audio = gr.Audio(label="Generated speech", type="filepath")
            # NOTE(review): the header claims 20 supported languages but only
            # 19 are listed below -- confirm against the model card.
            gr.Markdown(
                """**Supported languages**
Chinese ยท English ยท German ยท Spanish ยท French ยท Japanese ยท Italian ยท Hungarian ยท
Korean ยท Russian ยท Persian ยท Arabic ยท Polish ยท Portuguese ยท Czech ยท Danish ยท Swedish ยท
Greek ยท Turkish
**Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) ยท
[GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) ยท
[Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
            )
    # Picking an example language refreshes both the text box and the
    # reference-audio widget.
    example_picker.change(
        fn=on_example_select,
        inputs=example_picker,
        outputs=[text_input, ref_audio],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input],
        outputs=output_audio,
    )

demo.launch(show_error=True, ssr_mode=False)