# Hugging Face Space: MOSS-TTS-Nano demo (runs on ZeroGPU hardware).
from __future__ import annotations

import logging
import os
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
import torch
try:
    import spaces
except ImportError:
    # Not running on Hugging Face Spaces: provide a stand-in whose
    # `GPU(...)` call degrades to an identity decorator, so that
    # `@spaces.GPU(...)`-decorated functions run unchanged locally.
    class _SpacesFallback:
        def GPU(*_args, **_kwargs):
            return lambda func: func

    spaces = _SpacesFallback()
# Process-wide logging; %(asctime)s gives timestamps in the Space log viewer.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Hugging Face repo IDs for the TTS model and its paired audio tokenizer.
MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M"
AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano"

# Writable scratch locations (Spaces containers allow writes under /tmp).
OUTPUT_DIR = Path("/tmp/moss-tts-output")   # generated speech files
SAMPLE_DIR = Path("/tmp/moss-tts-samples")  # downloaded reference clips
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
# ─── Load models at module level (ZeroGPU packs them at startup) ──────────────
# Import placed here (after the spaces shim) to preserve original import order.
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

logger.info("Loading TTS model: %s", MODEL_ID)
tts_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,  # model code ships inside the HF repo, not transformers
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
tts_model.eval()
# NOTE(review): _set_attention_implementation is a private transformers API and
# may break on upgrade — confirm whether passing attn_implementation="sdpa" to
# from_pretrained works for this remote-code model instead.
tts_model._set_attention_implementation("sdpa")  # flash_attn not installed; sdpa is always available

logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID)
audio_tokenizer = AutoModel.from_pretrained(
    AUDIO_TOKENIZER_ID,
    trust_remote_code=True,
)
audio_tokenizer.eval()

logger.info("Loading text tokenizer")
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Moving to CUDA at import time is the ZeroGPU "packing" pattern.
# NOTE(review): .to("cuda") raises on a CPU-only host — this module assumes it
# only ever runs on Spaces ZeroGPU hardware; confirm if local CPU use is needed.
logger.info("Moving models to CUDA (ZeroGPU packing) โฆ")
tts_model.to("cuda")
audio_tokenizer.to("cuda")
logger.info("All models ready.")
# ─── Sample audio files ───────────────────────────────────────────────────────
# Language code -> local path of a bundled reference clip (populated below).
SAMPLE_AUDIO: dict[str, str] = {}
_SAMPLE_URLS = {
    "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav",
    "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav",
    "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav",
}


def _fetch_sample(url: str, dest: Path, timeout: float = 15.0) -> bool:
    """Download *url* to *dest* unless cached; return True if a download ran.

    urlretrieve has no timeout parameter, so a stalled connection could hang
    Space startup indefinitely; urlopen with an explicit timeout bounds the wait.
    """
    if dest.exists():
        return False
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        dest.write_bytes(resp.read())
    return True


for _lang, _url in _SAMPLE_URLS.items():
    _dest = SAMPLE_DIR / f"{_lang}_sample.wav"
    try:
        if _fetch_sample(_url, _dest):
            logger.info("Downloaded sample audio: %s", _lang)
        SAMPLE_AUDIO[_lang] = str(_dest)
    except Exception as _e:
        # Best effort: on failure the UI simply has no preset clip for _lang.
        logger.warning("Failed to download sample audio %s: %s", _lang, _e)
# ─── Example content ──────────────────────────────────────────────────────────
# Demo sentences per language, used to pre-fill the synthesis text box.
EXAMPLE_TEXTS: dict[str, str] = {
    "English": (
        "The biggest lesson that can be read from 70 years of AI research is that general methods "
        "that leverage computation are ultimately the most effective, and by a large margin."
    ),
    # NOTE(review): the CJK/Korean literals below appear mojibake-garbled by a
    # past encoding round-trip — verify against the upstream repo's examples.
    "Chinese": (
        "ๆฌข่ฟๅ ณๆณจๆจกๆๆบ่ฝใไธๆตทๅๆบๅญฆ้ขไธๅคๆฆๅคงๅญฆ่ช็ถ่ฏญ่จๅค็ๅฎ้ชๅฎคใ"
        "ไปๅคฉๆไปฌๅฐไธบๆจๅธฆๆฅๆๆฐ็ไบบๅทฅๆบ่ฝ็ ็ฉถ่ฟๅฑใ"
    ),
    "French": (
        "Bonjour et bienvenue dans notre รฉmission quotidienne d'actualitรฉs. "
        "Nous vous prรฉsenterons les nouvelles les plus importantes de la journรฉe."
    ),
    "Japanese": "ๆฌๆฅใฏNHKใใฅใผในใใ่ฆงใใใ ใใใใใจใใใใใพใใๆๆฐใฎใใฅใผในใใไผใใใพใใ",
    "German": (
        "Willkommen zu unserem tรคglichen Nachrichtenรผberblick. "
        "Wir berichten รผber die wichtigsten Ereignisse des Tages."
    ),
    "Spanish": (
        "Bienvenidos al noticiero de la tarde. "
        "Aquรญ les presentamos las noticias mรกs relevantes del dรญa de hoy."
    ),
    "Korean": "์๋ ํ์ธ์, KBS ๋ด์ค์ ๋๋ค. ์ค๋์ ์ฃผ์ ๋ด์ค๋ฅผ ์ ํด๋๋ฆฌ๊ฒ ์ต๋๋ค.",
}

# Languages that have a downloaded sample reference clip; values are local file
# paths, or None when the startup download failed (SAMPLE_AUDIO lacks the key).
LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = {
    "English": SAMPLE_AUDIO.get("en"),
    "Chinese": SAMPLE_AUDIO.get("zh"),
    "Japanese": SAMPLE_AUDIO.get("jp"),
}
# ─── Inference ────────────────────────────────────────────────────────────────
# On ZeroGPU, CUDA is only attached inside spaces.GPU-decorated functions; the
# undecorated original would fail at inference time on Spaces hardware.
@spaces.GPU(duration=120)
def generate_speech(
    text: str,
    reference_audio: str | None,
    max_new_frames: int,
    do_sample: bool,
    seed: int,
) -> str:
    """Synthesize *text* in the voice of *reference_audio*; return the wav path.

    Args:
        text: Text to speak.
        reference_audio: Filepath of the clip whose voice is cloned (required).
        max_new_frames: Cap on generated audio frames (bounds output length).
        do_sample: Stochastic sampling when True, greedy decoding when False.
        seed: RNG seed; 0 (or falsy) leaves the generators unseeded.

    Returns:
        Path to the generated wav file.

    Raises:
        gr.Error: On missing inputs or any failure inside model inference.
    """
    # Guard both None (cleared component) and whitespace-only input.
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload reference audio for voice cloning.")

    seed_int = int(seed) if seed else 0
    if seed_int != 0:
        torch.manual_seed(seed_int)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed_int)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # uuid4 makes the path unique per request; the previous hash(text)-based
    # name collided for identical text within one process (str hashing is also
    # randomized per interpreter, so it offered no stable naming benefit).
    output_path = str(OUTPUT_DIR / f"output_{os.getpid()}_{uuid.uuid4().hex}.wav")
    try:
        result = tts_model.inference(
            text=text,
            output_audio_path=output_path,
            mode="voice_clone",
            prompt_audio_path=reference_audio,
            text_tokenizer=text_tokenizer,
            audio_tokenizer=audio_tokenizer,
            audio_tokenizer_type="moss-audio-tokenizer-nano",
            device=device,
            max_new_frames=int(max_new_frames),
            do_sample=bool(do_sample),
            use_kv_cache=True,
            voice_clone_max_text_tokens=75,
        )
        logger.info(
            "Generated: %s sample_rate=%s",
            result.get("audio_path"),
            result.get("sample_rate"),
        )
        return str(result["audio_path"])
    except Exception as exc:
        # Log the full traceback server-side, surface the message in the UI.
        logger.exception("TTS inference failed")
        raise gr.Error(str(exc)) from exc
# ─── UI ───────────────────────────────────────────────────────────────────────
def on_example_select(key: str) -> tuple[str, str | None]:
    """Look up the demo text and sample reference clip for a language key.

    Returns a (text, audio_path) pair; text falls back to "" and audio_path to
    None when *key* has no entry.
    """
    example_text = EXAMPLE_TEXTS.get(key, "")
    sample_path = LANG_TO_SAMPLE_AUDIO.get(key)
    return example_text, sample_path
# Page styling: center the app and hide the default Gradio footer.
# NOTE(review): in the original file this constant is never passed to
# gr.Blocks(css=...) below, so it has no effect — confirm intended usage.
css = """
.gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
footer { display: none !important; }
"""
# Build the Gradio app. Fix: the module-level `css` constant was defined but
# never wired in; pass it so the styling actually applies.
with gr.Blocks(title="MOSS-TTS-Nano", css=css) as demo:
    gr.Markdown(
        """# MOSS-TTS-Nano
**Multilingual 0.1B TTS with zero-shot voice cloning** โ 20 languages โ 48 kHz stereo
Upload a reference audio clip (3โ15 sec) to clone the voice, then enter text in any of the 20 supported languages.
Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
    )
    with gr.Row(equal_height=False):
        # Left column: all synthesis inputs.
        with gr.Column(scale=3):
            example_picker = gr.Dropdown(
                choices=list(EXAMPLE_TEXTS.keys()),
                value="English",
                label="Example language",
                info="Pre-fills text and loads a sample reference audio",
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=EXAMPLE_TEXTS["English"],
                lines=5,
                placeholder="Enter text in any supported languageโฆ",
            )
            ref_audio = gr.Audio(
                label="Reference audio (voice to clone)",
                type="filepath",  # handlers receive a path string, as generate_speech expects
                sources=["upload", "microphone"],
                value=SAMPLE_AUDIO.get("en"),
            )
            with gr.Accordion("Advanced settings", open=False):
                max_frames_slider = gr.Slider(
                    minimum=64, maximum=512, value=375, step=16,
                    label="Max new frames",
                    info="Controls the maximum length of generated audio",
                )
                do_sample_cb = gr.Checkbox(
                    value=True,
                    label="Sampling",
                    info="Uncheck for deterministic (but potentially repetitive) output",
                )
                seed_input = gr.Number(
                    value=0, precision=0,
                    label="Seed (0 = random)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        # Right column: output player plus reference links.
        with gr.Column(scale=2):
            output_audio = gr.Audio(label="Generated speech", type="filepath")
            gr.Markdown(
                """**Supported languages**
Chinese ยท English ยท German ยท Spanish ยท French ยท Japanese ยท Italian ยท Hungarian ยท
Korean ยท Russian ยท Persian ยท Arabic ยท Polish ยท Portuguese ยท Czech ยท Danish ยท Swedish ยท
Greek ยท Turkish
**Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) ยท
[GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) ยท
[Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
            )
    # Event wiring: the dropdown pre-fills text and the reference clip; the
    # button runs synthesis and routes the resulting wav to the output player.
    example_picker.change(
        fn=on_example_select,
        inputs=example_picker,
        outputs=[text_input, ref_audio],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input],
        outputs=output_audio,
    )

# NOTE(review): no __main__ guard — Spaces executes this file directly, but any
# import of this module will also launch the server; confirm that is intended.
demo.launch(show_error=True, ssr_mode=False)