# MOSS-Nano-CPU / app.py
# Author: eaysu
# Commit 12b3247: Convert to CPU-compatible Space: remove ZeroGPU dependency
from __future__ import annotations

import logging
import os
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
import torch
# Configure root logging once at import time: INFO level, with timestamp and
# level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
# ─── Device detection ─────────────────────────────────────────────────────────
def _get_device() -> str:
    """Pick the torch device string to run on.

    CUDA is preferred, then Apple MPS; ``"cpu"`` is the fallback when neither
    backend reports itself as available.
    """
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if torch.backends.mps.is_available() else "cpu"
# Resolved once at import time; model placement and inference both use it.
DEVICE = _get_device()
# bfloat16 works on CUDA/MPS; use float32 on CPU for stability
DTYPE = torch.bfloat16 if DEVICE in ("cuda", "mps") else torch.float32
logger.info("Running on device: %s, dtype: %s", DEVICE, DTYPE)
# Model identifiers handed to the transformers `from_pretrained` loaders.
MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M"
AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano"
# Scratch directories: generated audio and downloaded sample clips.
OUTPUT_DIR = Path("/tmp/moss-tts-output")
SAMPLE_DIR = Path("/tmp/moss-tts-samples")
# Create both directories up front; existing directories are not an error.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
# ─── Load models ──────────────────────────────────────────────────────────────
# transformers is imported here, after the device/dtype setup, rather than
# with the other imports at the top of the file.
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

# All three artifacts are loaded eagerly at import time, so the app only
# starts serving once every download/load below has finished.
logger.info("Loading TTS model: %s", MODEL_ID)
# trust_remote_code=True allows the model code shipped with the checkpoint to
# run; the TTS model is loaded directly in DTYPE.
tts_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
)
tts_model.eval()
# NOTE(review): `_set_attention_implementation` is a private (underscore)
# method — confirm it exists on the transformers / remote-code version in use.
tts_model._set_attention_implementation("sdpa")
logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID)
# No torch_dtype is passed here, so the audio tokenizer keeps the loader's
# default dtype rather than DTYPE.
audio_tokenizer = AutoModel.from_pretrained(
    AUDIO_TOKENIZER_ID,
    trust_remote_code=True,
)
audio_tokenizer.eval()
logger.info("Loading text tokenizer")
# The text tokenizer comes from the same repo as the TTS model.
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
logger.info("Moving models to %s …", DEVICE)
tts_model.to(DEVICE)
audio_tokenizer.to(DEVICE)
logger.info("All models ready.")
# ─── Sample audio files ───────────────────────────────────────────────────────
# Language code -> local path of the downloaded sample clip. Only clips that
# are present on disk end up in this mapping.
SAMPLE_AUDIO: dict[str, str] = {}
_SAMPLE_URLS = {
    "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav",
    "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav",
    "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav",
}
# Upper bound (seconds) on each blocking network operation of a sample
# download, so a stalled connection cannot hang application startup forever.
_SAMPLE_DOWNLOAD_TIMEOUT_S = 30


def _download_sample(url: str, dest: Path) -> None:
    """Download *url* to *dest* without ever leaving a partial file at *dest*.

    The payload is written to a sibling ``.part`` file and renamed into place
    only after the whole response has been read. An interrupted download
    therefore cannot be mistaken for a complete clip on the next start.

    Raises:
        OSError: On network or filesystem failure (includes ``URLError``).
    """
    partial = dest.with_name(dest.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=_SAMPLE_DOWNLOAD_TIMEOUT_S) as response:
            partial.write_bytes(response.read())
        partial.replace(dest)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the leftover temporary file.
        partial.unlink(missing_ok=True)


for _lang, _url in _SAMPLE_URLS.items():
    _dest = SAMPLE_DIR / f"{_lang}_sample.wav"
    try:
        if not _dest.exists():
            _download_sample(_url, _dest)
            logger.info("Downloaded sample audio: %s", _lang)
        SAMPLE_AUDIO[_lang] = str(_dest)
    except Exception as _e:
        # Deliberately best-effort: a missing sample clip must not prevent the
        # app from starting, so log and continue with the next language.
        logger.warning("Failed to download sample audio %s: %s", _lang, _e)
# ─── Example content ──────────────────────────────────────────────────────────
# Display language name -> example sentence. The keys populate the
# "Example language" dropdown, and the "English" entry is the text box's
# initial value.
EXAMPLE_TEXTS: dict[str, str] = {
    "English": (
        "The biggest lesson that can be read from 70 years of AI research is that general methods "
        "that leverage computation are ultimately the most effective, and by a large margin."
    ),
    "Chinese": (
        "欢迎关注模思智能、上海创智学院与复旦大学自然语言处理实验室。"
        "今天我们将为您带来最新的人工智能研究进展。"
    ),
    "French": (
        "Bonjour et bienvenue dans notre émission quotidienne d'actualités. "
        "Nous vous présenterons les nouvelles les plus importantes de la journée."
    ),
    "Japanese": "本日はNHKニュースをご覧いただきありがとうございます。最新のニュースをお伝えします。",
    "German": (
        "Willkommen zu unserem täglichen Nachrichtenüberblick. "
        "Wir berichten über die wichtigsten Ereignisse des Tages."
    ),
    "Spanish": (
        "Bienvenidos al noticiero de la tarde. "
        "Aquí les presentamos las noticias más relevantes del día de hoy."
    ),
    "Korean": "안녕하세요, KBS 뉴스입니다. 오늘의 주요 뉴스를 전해드리겠습니다.",
}
# Display language name -> path of its sample reference clip. The value is
# None when that clip is absent from SAMPLE_AUDIO (e.g. its download failed).
# Only these three languages have a sample; other languages have no entry.
LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = {
    language: SAMPLE_AUDIO.get(code)
    for language, code in (("English", "en"), ("Chinese", "zh"), ("Japanese", "jp"))
}
# ─── Inference ────────────────────────────────────────────────────────────────
def generate_speech(
    text: str,
    reference_audio: str | None,
    max_new_frames: int,
    do_sample: bool,
    seed: int,
) -> str:
    """Synthesize *text* in the voice of *reference_audio* and return the WAV path.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Filesystem path of the clip whose voice is cloned.
        max_new_frames: Forwarded to the model as ``max_new_frames``.
        do_sample: Forwarded to the model as ``do_sample``.
        seed: RNG seed. ``0`` (or an empty value) leaves the RNG unseeded so
            each call differs; any other value seeds torch before inference.

    Returns:
        The ``audio_path`` entry of the result returned by the model's
        ``inference`` call, as a string.

    Raises:
        gr.Error: If the text or reference audio is missing, or if inference
            fails (the original exception is chained as the cause).
    """
    # Treat None (possible for programmatic callers) like an empty string
    # instead of crashing on ``None.strip()``.
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    if not reference_audio:
        raise gr.Error("Please upload reference audio for voice cloning.")

    seed_int = int(seed) if seed else 0
    if seed_int != 0:
        torch.manual_seed(seed_int)
        if DEVICE == "cuda":
            torch.cuda.manual_seed(seed_int)

    # A random name per request: deriving the name from the text made every
    # request for the same text (e.g. the default example) share one file,
    # and allowed unrelated texts to collide.
    output_path = str(OUTPUT_DIR / f"output_{uuid.uuid4().hex}.wav")

    try:
        result = tts_model.inference(
            text=text,
            output_audio_path=output_path,
            mode="voice_clone",
            prompt_audio_path=reference_audio,
            text_tokenizer=text_tokenizer,
            audio_tokenizer=audio_tokenizer,
            audio_tokenizer_type="moss-audio-tokenizer-nano",
            device=DEVICE,
            max_new_frames=int(max_new_frames),
            do_sample=bool(do_sample),
            use_kv_cache=True,
            voice_clone_max_text_tokens=75,
        )
        logger.info(
            "Generated: %s sample_rate=%s",
            result.get("audio_path"),
            result.get("sample_rate"),
        )
        return str(result["audio_path"])
    except Exception as exc:
        logger.exception("TTS inference failed")
        # Some exceptions carry no message; fall back to the class name so
        # the user never sees a blank error.
        raise gr.Error(str(exc) or type(exc).__name__) from exc
# ─── UI ───────────────────────────────────────────────────────────────────────
def on_example_select(key: str) -> tuple[str, str | None]:
    """Look up the example text and sample reference clip for *key*.

    Returns a ``(text, audio_path)`` pair. The text is ``""`` for an unknown
    key, and the audio path is ``None`` when no sample clip is registered for
    that language.
    """
    example_text = EXAMPLE_TEXTS.get(key, "")
    sample_clip = LANG_TO_SAMPLE_AUDIO.get(key)
    return example_text, sample_clip
# Custom page styling: caps and centers the container, and hides the footer.
# NOTE(review): `css` is not referenced anywhere in the visible file (it is
# passed neither to gr.Blocks nor to demo.launch), so these rules appear to
# have no effect — confirm and wire it up for the Gradio version in use.
css = """
.gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
footer { display: none !important; }
"""
# Build the page: a header, then a two-column row (inputs on the left, output
# and reference info on the right), then the event wiring.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation — confirm against the deployed layout.
with gr.Blocks(title="MOSS-TTS-Nano") as demo:
    # NOTE(review): this header advertises 20 languages, while the
    # "Supported languages" list further down names 19 — confirm which is right.
    gr.Markdown(
        """# MOSS-TTS-Nano
**Multilingual 0.1B TTS with zero-shot voice cloning** — 20 languages — 48 kHz stereo
Upload a reference audio clip (3–15 sec) to clone the voice, then enter text in any of the 20 supported languages.
Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
    )
    with gr.Row(equal_height=False):
        # Left column: all user inputs plus the generate button.
        with gr.Column(scale=3):
            example_picker = gr.Dropdown(
                choices=list(EXAMPLE_TEXTS.keys()),
                value="English",
                label="Example language",
                info="Pre-fills text and loads a sample reference audio",
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=EXAMPLE_TEXTS["English"],
                lines=5,
                placeholder="Enter text in any supported language…",
            )
            # type="filepath" makes the component hand a path string to
            # generate_speech; the initial value is None if the English
            # sample clip is not available.
            ref_audio = gr.Audio(
                label="Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
                value=SAMPLE_AUDIO.get("en"),
            )
            with gr.Accordion("Advanced settings", open=False):
                max_frames_slider = gr.Slider(
                    minimum=64, maximum=512, value=375, step=16,
                    label="Max new frames",
                    info="Controls the maximum length of generated audio",
                )
                do_sample_cb = gr.Checkbox(
                    value=True,
                    label="Sampling",
                    info="Uncheck for deterministic (but potentially repetitive) output",
                )
                # generate_speech treats 0 as "do not seed".
                seed_input = gr.Number(
                    value=0, precision=0,
                    label="Seed (0 = random)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        # Right column: generated audio and static reference information.
        with gr.Column(scale=2):
            output_audio = gr.Audio(label="Generated speech", type="filepath")
            gr.Markdown(
                """**Supported languages**
Chinese · English · German · Spanish · French · Japanese · Italian · Hungarian ·
Korean · Russian · Persian · Arabic · Polish · Portuguese · Czech · Danish · Swedish ·
Greek · Turkish
**Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) ·
[GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) ·
[Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
            )
    # Picking an example language overwrites both the text box and the
    # reference audio; for languages without a sample clip the audio output
    # receives None.
    example_picker.change(
        fn=on_example_select,
        inputs=example_picker,
        outputs=[text_input, ref_audio],
    )
    # The input order here must match generate_speech's parameter order.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input],
        outputs=output_audio,
    )
# Launched unconditionally at import time (no __main__ guard), with error
# display enabled and server-side rendering turned off.
demo.launch(show_error=True, ssr_mode=False)