# MOSS-TTS-Nano / app.py
# (Hugging Face Space page header preserved as comments so the file parses.)
# Last commit: victor (HF Staff) — 749bf48
# "Fix: set attn_implementation=sdpa (flash_attn not installed)"
from __future__ import annotations
import logging
import os
import urllib.request
from pathlib import Path
import gradio as gr
import torch
try:
    import spaces  # real ZeroGPU decorator when running on Hugging Face Spaces
except ImportError:
    # Local / non-Spaces environment: provide a no-op stand-in so that
    # `spaces.GPU` decorations still work.
    class _SpacesFallback:
        @staticmethod
        def GPU(*args, **kwargs):
            """No-op replacement for spaces.GPU.

            Supports both parameterized usage (`@spaces.GPU(duration=...)`)
            and bare usage (`@spaces.GPU`), mirroring the real decorator.
            The original shim returned the inner decorator for bare usage,
            which would replace the decorated function with the decorator.
            """
            # Bare usage: the function itself is the sole positional argument.
            if args and callable(args[0]) and not kwargs:
                return args[0]

            def decorator(func):
                return func

            return decorator

    spaces = _SpacesFallback()
# Logging: timestamped INFO-level records for startup/progress tracing.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Hugging Face Hub repo ids for the TTS model and its audio tokenizer.
MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M"
AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano"

# Scratch directories for generated audio and downloaded reference samples.
OUTPUT_DIR = Path("/tmp/moss-tts-output")
SAMPLE_DIR = Path("/tmp/moss-tts-samples")
for _scratch_dir in (OUTPUT_DIR, SAMPLE_DIR):
    _scratch_dir.mkdir(parents=True, exist_ok=True)
# --- Load models at module level (ZeroGPU packs them at startup) -------------
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

logger.info("Loading TTS model: %s", MODEL_ID)
tts_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
tts_model.eval()
# flash_attn is not installed in this Space; sdpa ships with torch and is
# always available.  NOTE(review): `_set_attention_implementation` is a
# private transformers API -- confirm it survives transformers upgrades.
tts_model._set_attention_implementation("sdpa")

logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID)
audio_tokenizer = AutoModel.from_pretrained(
    AUDIO_TOKENIZER_ID,
    trust_remote_code=True,
)
audio_tokenizer.eval()

logger.info("Loading text tokenizer")
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

logger.info("Moving models to CUDA (ZeroGPU packing) โ€ฆ")
try:
    tts_model.to("cuda")
    audio_tokenizer.to("cuda")
except (AssertionError, RuntimeError) as _exc:
    # CPU-only dev machine: generate_speech() already picks the device via
    # torch.cuda.is_available(), so a failed device move should not abort
    # startup.  On ZeroGPU the moves succeed and this branch never runs.
    logger.warning("Could not move models to CUDA (%s); staying on CPU.", _exc)
logger.info("All models ready.")
# --- Sample audio files ------------------------------------------------------
# Reference clips for voice cloning, fetched from the project repository.
# A failed download is non-fatal: that language simply has no preset sample.
SAMPLE_AUDIO: dict[str, str] = {}
_SAMPLE_URLS = {
    "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav",
    "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav",
    "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav",
}
for _lang, _url in _SAMPLE_URLS.items():
    _dest = SAMPLE_DIR / f"{_lang}_sample.wav"
    try:
        if _dest.exists():
            # Already cached from a previous start-up; skip the download.
            SAMPLE_AUDIO[_lang] = str(_dest)
            continue
        urllib.request.urlretrieve(_url, _dest)
        logger.info("Downloaded sample audio: %s", _lang)
        SAMPLE_AUDIO[_lang] = str(_dest)
    except Exception as _e:
        logger.warning("Failed to download sample audio %s: %s", _lang, _e)
# โ”€โ”€โ”€ Example content โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Per-language demo sentences used to pre-fill the synthesis text box.
# Keys are the display names shown in the "Example language" dropdown and
# must match the keys of LANG_TO_SAMPLE_AUDIO for languages with samples.
EXAMPLE_TEXTS: dict[str, str] = {
    "English": (
        "The biggest lesson that can be read from 70 years of AI research is that general methods "
        "that leverage computation are ultimately the most effective, and by a large margin."
    ),
    "Chinese": (
        "ๆฌข่ฟŽๅ…ณๆณจๆจกๆ€ๆ™บ่ƒฝใ€ไธŠๆตทๅˆ›ๆ™บๅญฆ้™ขไธŽๅคๆ—ฆๅคงๅญฆ่‡ช็„ถ่ฏญ่จ€ๅค„็†ๅฎž้ชŒๅฎคใ€‚"
        "ไปŠๅคฉๆˆ‘ไปฌๅฐ†ไธบๆ‚จๅธฆๆฅๆœ€ๆ–ฐ็š„ไบบๅทฅๆ™บ่ƒฝ็ ”็ฉถ่ฟ›ๅฑ•ใ€‚"
    ),
    "French": (
        "Bonjour et bienvenue dans notre รฉmission quotidienne d'actualitรฉs. "
        "Nous vous prรฉsenterons les nouvelles les plus importantes de la journรฉe."
    ),
    "Japanese": "ๆœฌๆ—ฅใฏNHKใƒ‹ใƒฅใƒผใ‚นใ‚’ใ”่ฆงใ„ใŸใ ใใ‚ใ‚ŠใŒใจใ†ใ”ใ–ใ„ใพใ™ใ€‚ๆœ€ๆ–ฐใฎใƒ‹ใƒฅใƒผใ‚นใ‚’ใŠไผใˆใ—ใพใ™ใ€‚",
    "German": (
        "Willkommen zu unserem tรคglichen Nachrichtenรผberblick. "
        "Wir berichten รผber die wichtigsten Ereignisse des Tages."
    ),
    "Spanish": (
        "Bienvenidos al noticiero de la tarde. "
        "Aquรญ les presentamos las noticias mรกs relevantes del dรญa de hoy."
    ),
    "Korean": "์•ˆ๋…•ํ•˜์„ธ์š”, KBS ๋‰ด์Šค์ž…๋‹ˆ๋‹ค. ์˜ค๋Š˜์˜ ์ฃผ์š” ๋‰ด์Šค๋ฅผ ์ „ํ•ด๋“œ๋ฆฌ๊ฒ ์Šต๋‹ˆ๋‹ค.",
}
# Map dropdown display names to downloaded sample clips; languages without a
# sample clip (or whose download failed) resolve to None.
LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = {
    display_name: SAMPLE_AUDIO.get(short_code)
    for display_name, short_code in (
        ("English", "en"),
        ("Chinese", "zh"),
        ("Japanese", "jp"),
    )
}
# โ”€โ”€โ”€ Inference โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@spaces.GPU(duration=120)
def generate_speech(
    text: str,
    reference_audio: str | None,
    max_new_frames: int,
    do_sample: bool,
    seed: int,
) -> str:
    """Synthesize `text` in the voice of `reference_audio`; return the wav path.

    Args:
        text: Text to speak (any supported language).
        reference_audio: Filepath of the voice-cloning prompt clip.
        max_new_frames: Upper bound on generated audio frames (length cap).
        do_sample: True for sampling, False for deterministic decoding.
        seed: RNG seed; 0 (or falsy) leaves the RNG state untouched.

    Raises:
        gr.Error: On empty text, missing reference audio, or inference failure.
    """
    if not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload reference audio for voice cloning.")

    # gr.Number may deliver None/float; normalize to int, treating falsy as 0.
    seed_int = int(seed) if seed else 0
    if seed_int != 0:
        torch.manual_seed(seed_int)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed_int)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Random suffix instead of abs(hash(text)): hash() is identical for equal
    # text, so repeated or concurrent requests with the same input could
    # overwrite each other's output files.
    output_path = str(OUTPUT_DIR / f"output_{os.getpid()}_{os.urandom(4).hex()}.wav")
    try:
        result = tts_model.inference(
            text=text,
            output_audio_path=output_path,
            mode="voice_clone",
            prompt_audio_path=reference_audio,
            text_tokenizer=text_tokenizer,
            audio_tokenizer=audio_tokenizer,
            audio_tokenizer_type="moss-audio-tokenizer-nano",
            device=device,
            max_new_frames=int(max_new_frames),
            do_sample=bool(do_sample),
            use_kv_cache=True,
            voice_clone_max_text_tokens=75,
        )
        logger.info(
            "Generated: %s sample_rate=%s",
            result.get("audio_path"),
            result.get("sample_rate"),
        )
        # Trust the path reported by the model over our requested output_path.
        return str(result["audio_path"])
    except Exception as exc:
        # Surface the failure in the UI while keeping the traceback in logs.
        logger.exception("TTS inference failed")
        raise gr.Error(str(exc)) from exc
# โ”€โ”€โ”€ UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def on_example_select(key: str) -> tuple[str, str | None]:
    """Return the demo text and sample reference audio for language `key`."""
    example_text = EXAMPLE_TEXTS.get(key, "")
    sample_audio = LANG_TO_SAMPLE_AUDIO.get(key)
    return example_text, sample_audio
# Custom CSS: constrain the app width and hide the default Gradio footer.
css = """
.gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
footer { display: none !important; }
"""

# Fix: `css` was defined but never passed to gr.Blocks, so the styling
# silently had no effect.
with gr.Blocks(title="MOSS-TTS-Nano", css=css) as demo:
    gr.Markdown(
        """# MOSS-TTS-Nano
**Multilingual 0.1B TTS with zero-shot voice cloning** โ€” 20 languages โ€” 48 kHz stereo
Upload a reference audio clip (3โ€“15 sec) to clone the voice, then enter text in any of the 20 supported languages.
Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
    )
    with gr.Row(equal_height=False):
        with gr.Column(scale=3):
            # Left column: all synthesis inputs.
            example_picker = gr.Dropdown(
                choices=list(EXAMPLE_TEXTS.keys()),
                value="English",
                label="Example language",
                info="Pre-fills text and loads a sample reference audio",
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=EXAMPLE_TEXTS["English"],
                lines=5,
                placeholder="Enter text in any supported languageโ€ฆ",
            )
            ref_audio = gr.Audio(
                label="Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
                value=SAMPLE_AUDIO.get("en"),
            )
            with gr.Accordion("Advanced settings", open=False):
                max_frames_slider = gr.Slider(
                    minimum=64, maximum=512, value=375, step=16,
                    label="Max new frames",
                    info="Controls the maximum length of generated audio",
                )
                do_sample_cb = gr.Checkbox(
                    value=True,
                    label="Sampling",
                    info="Uncheck for deterministic (but potentially repetitive) output",
                )
                seed_input = gr.Number(
                    value=0, precision=0,
                    label="Seed (0 = random)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=2):
            # Right column: generated audio and reference material.
            output_audio = gr.Audio(label="Generated speech", type="filepath")
            # NOTE(review): the header claims 20 supported languages but only
            # 19 are listed below -- confirm against the model card.
            gr.Markdown(
                """**Supported languages**
Chinese ยท English ยท German ยท Spanish ยท French ยท Japanese ยท Italian ยท Hungarian ยท
Korean ยท Russian ยท Persian ยท Arabic ยท Polish ยท Portuguese ยท Czech ยท Danish ยท Swedish ยท
Greek ยท Turkish
**Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) ยท
[GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) ยท
[Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
            )
    # Picking an example language refreshes both the text box and the
    # reference-audio widget.
    example_picker.change(
        fn=on_example_select,
        inputs=example_picker,
        outputs=[text_input, ref_audio],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input],
        outputs=output_audio,
    )

demo.launch(show_error=True, ssr_mode=False)