| import os |
|
|
| os.environ.setdefault("NO_TORCH_COMPILE", "1") |
|
|
| import spaces |
| import numpy as np |
| import torch |
| import torchaudio |
| import gradio as gr |
|
|
| from transformers import AutoProcessor, MoonshineForConditionalGeneration |
|
|
| from generator import Segment, load_miso_8b |
|
|
# Use the GPU when torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The TTS generator is loaded once at import time and shared by every request.
generator = load_miso_8b(model_path_or_repo_id="MisoLabs/MisoTTS", device=device)

# Sample rate of the audio the generator produces; also the rate reference
# clips are resampled to before being used as context.
SAMPLE_RATE = generator.sample_rate

# Frame size reported by the generator's audio tokenizer. synthesize() trims
# reference clips to a whole multiple of this many samples.
MIMI_FRAME_SIZE = int(generator._audio_tokenizer.frame_size)
|
|
| |
| |
# Speech recogniser used to pre-fill the reference transcript. Audio is
# resampled to ASR_SAMPLE_RATE before it is handed to the processor.
ASR_SAMPLE_RATE = 16000
_ASR_REPO_ID = "UsefulSensors/moonshine-base"
asr_processor = AutoProcessor.from_pretrained(_ASR_REPO_ID)
asr_model = MoonshineForConditionalGeneration.from_pretrained(_ASR_REPO_ID)
asr_model = asr_model.eval()
|
|
# Upper bound on the length of the text accepted by synthesize(); longer
# input is rejected with a user-facing error.
MAX_INPUT_CHARS = 1_000

# Markdown rendered at the top of the demo page.
DESCRIPTION = """
# Miso TTS 8B

Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
from text, with optional voice continuation from a reference clip.
"""
|
|
|
|
def _resample_to_model(audio: torch.Tensor, sr: int) -> torch.Tensor:
    """Return ``audio`` as a single-channel waveform at ``SAMPLE_RATE``.

    Args:
        audio: Waveform tensor. Tensors with more than one dimension are
            collapsed by averaging over dim 0; 1-D tensors are used as-is.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The down-mixed waveform, resampled only when ``sr`` differs from
        ``SAMPLE_RATE``.
    """
    mono = audio if audio.ndim <= 1 else audio.mean(dim=0)
    if sr == SAMPLE_RATE:
        return mono
    return torchaudio.functional.resample(mono, orig_freq=sr, new_freq=SAMPLE_RATE)
|
|
|
|
def transcribe(ref_audio_path):
    """CPU-only auto-transcription of the reference clip (runs on the always-on host).

    Args:
        ref_audio_path: Filesystem path of the reference clip, or a falsy
            value when the audio input is empty.

    Returns:
        The decoded transcript with surrounding whitespace removed, or
        ``gr.update()`` (leave the transcript box untouched) when no clip is
        given.
    """
    if not ref_audio_path:
        return gr.update()

    wav, sr = torchaudio.load(ref_audio_path)
    # Collapse multi-dimensional input to one channel by averaging over dim 0.
    if wav.ndim > 1:
        wav = wav.mean(dim=0)
    if sr != ASR_SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=ASR_SAMPLE_RATE)

    inputs = asr_processor(wav.numpy(), sampling_rate=ASR_SAMPLE_RATE, return_tensors="pt")

    # Bound decoding in proportion to the clip length. The Moonshine
    # documentation recommends at most ~6.5 tokens per second of audio so the
    # model cannot fall into a repetition/hallucination loop on short or
    # noisy clips. max_new_tokens is used (rather than max_length) and floored
    # at 1 so very short clips never produce an invalid length limit.
    max_tokens_per_second = 6.5
    duration_s = wav.shape[-1] / ASR_SAMPLE_RATE
    token_budget = max(1, int(duration_s * max_tokens_per_second))

    with torch.no_grad():
        tokens = asr_model.generate(**inputs, max_new_tokens=token_budget)
    return asr_processor.decode(tokens[0], skip_special_tokens=True).strip()
|
|
|
|
def _load_reference_context(ref_audio_path, ref_text, speaker_id):
    """Build the voice-continuation context from a reference clip.

    Args:
        ref_audio_path: Filesystem path of the reference clip.
        ref_text: Transcript of the reference clip; must not be blank.
        speaker_id: Speaker index attached to the reference segment.

    Returns:
        A one-element list holding the reference ``Segment``, or an empty
        list when the clip contains less than one full frame of audio.

    Raises:
        gr.Error: If the transcript is blank or the audio file cannot be read.
    """
    transcript = (ref_text or "").strip()
    if not transcript:
        raise gr.Error("Please provide the transcript of the reference audio.")

    try:
        wav, sr = torchaudio.load(ref_audio_path)
    except (RuntimeError, OSError, ValueError) as err:
        # Surface decode failures as a readable message instead of an opaque error.
        raise gr.Error("Could not read the reference audio file.") from err

    wav = _resample_to_model(wav, sr)

    # Keep only a whole number of MIMI_FRAME_SIZE-sample frames.
    # NOTE(review): presumably the audio tokenizer needs frame-aligned input - confirm.
    usable = (wav.shape[-1] // MIMI_FRAME_SIZE) * MIMI_FRAME_SIZE
    if usable == 0:
        # Nothing usable is left: say so explicitly rather than hinting at
        # "reduced quality" while silently generating without the reference.
        gr.Warning("The reference audio is too short to be used and was ignored.")
        return []
    if usable < SAMPLE_RATE:
        gr.Warning("The reference audio may be too short; result quality may suffer.")

    reference = wav[:usable].to(device)
    return [Segment(speaker=int(speaker_id), text=transcript, audio=reference)]


@spaces.GPU(duration=120)
def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_s, temperature, topk):
    """Generate speech for ``text``, optionally continuing a reference voice.

    Args:
        text: Text to speak. Surrounding whitespace is stripped; the result
            must be non-empty and at most ``MAX_INPUT_CHARS`` characters.
        ref_audio_path: Optional path of a reference clip. A falsy value
            disables voice continuation.
        ref_text: Transcript of the reference clip; required whenever
            ``ref_audio_path`` is given.
        speaker_id: Speaker index, converted with ``int()`` and used for both
            the reference segment and the generated speech.
        max_length_s: Maximum length of the generated audio, in seconds.
        temperature: Sampling temperature forwarded to the generator.
        topk: Top-k value forwarded to the generator.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an int16
        NumPy array.

    Raises:
        gr.Error: If the text is empty or too long, the reference transcript
            is missing, or the reference audio cannot be read.
    """
    text = (text or "").strip()
    if not text:
        raise gr.Error("Please enter some text to synthesize.")
    if len(text) > MAX_INPUT_CHARS:
        raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")

    # Place the model and the audio tokenizer on the target device on every call.
    # NOTE(review): presumably needed because the GPU is only attached while a
    # @spaces.GPU function is running - confirm.
    generator._model.to(device)
    generator._audio_tokenizer.to(device)

    context = []
    if ref_audio_path:
        context = _load_reference_context(ref_audio_path, ref_text, speaker_id)

    audio = generator.generate(
        text=text,
        speaker=int(speaker_id),
        context=context,
        max_audio_length_ms=float(max_length_s) * 1000.0,
        temperature=float(temperature),
        topk=int(topk),
    )

    # Scale to the 16-bit PCM range and clamp before the integer cast so
    # out-of-range samples saturate instead of wrapping around.
    audio_np = (audio * 32768).clamp(-32768, 32767).to(torch.int16).cpu().numpy()
    return SAMPLE_RATE, audio_np
|
|
|
|
def _suggested_temperature(ref_path):
    """Return the temperature preset: 0.4 when a reference clip is set, else 0.7."""
    if ref_path:
        return 0.4
    return 0.7


with gr.Blocks(title="Miso TTS 8B") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: text input, optional reference clip, sampling controls.
        with gr.Column():
            text = gr.Textbox(
                value="Hello from Miso. This is an eight billion parameter text to speech model.",
                label="Text to synthesize",
                placeholder="Hello from Miso.",
                lines=3,
            )

            with gr.Accordion("Voice cloning (optional)", open=False):
                ref_audio = gr.Audio(type="filepath", label="Reference audio")
                ref_text = gr.Textbox(
                    label="Reference transcript (auto-filled on upload)",
                    placeholder="The exact words spoken in the reference audio.",
                    lines=2,
                )

            with gr.Accordion("Advanced", open=False):
                speaker_id = gr.Slider(
                    minimum=0, maximum=1, value=0, step=1, label="Speaker ID"
                )
                max_length = gr.Slider(
                    minimum=2, maximum=60, value=10, step=1, label="Max audio length (s)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.05,
                    label="Temperature (auto-lowered when cloning a voice)",
                )
                topk = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-k")

            run = gr.Button("Generate", variant="primary")

        # Right column: the synthesized result.
        with gr.Column():
            out = gr.Audio(label="Generated speech")

    # Changing the reference clip fills in its transcript ...
    ref_audio.change(fn=transcribe, inputs=[ref_audio], outputs=[ref_text])
    # ... and switches the temperature slider to the matching preset.
    ref_audio.change(fn=_suggested_temperature, inputs=[ref_audio], outputs=[temperature])

    synth_inputs = [text, ref_audio, ref_text, speaker_id, max_length, temperature, topk]
    run.click(fn=synthesize, inputs=synth_inputs, outputs=[out])


demo.queue().launch()
|
|