"""Gradio demo for Miso TTS 8B with optional voice cloning from a reference clip.

The script loads the TTS generator and a CPU-side Moonshine ASR model at import
time, builds the Gradio UI, and launches it.
"""

import os

# NOTE(review): set before the imports below, presumably because a downstream
# library reads it at import time -- keep this ordering.
os.environ.setdefault("NO_TORCH_COMPILE", "1")

import spaces
import numpy as np
import torch
import torchaudio
import gradio as gr
from transformers import AutoProcessor, MoonshineForConditionalGeneration

from generator import Segment, load_miso_8b

device = "cuda" if torch.cuda.is_available() else "cpu"
generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
SAMPLE_RATE = generator.sample_rate

# Mimi encodes in fixed-size frames. moshi 0.2.12 and the repo's pinned 0.2.2 pad a
# partial trailing frame differently, so trim the reference to a whole number of frames
# to get byte-identical reference codes (every full frame already matches 1:1).
MIMI_FRAME_SIZE = int(generator._audio_tokenizer.frame_size)

# Moonshine ASR for auto-transcribing reference clips. Kept on CPU and never called
# from inside an @spaces.GPU function, so it does not consume the ZeroGPU quota.
ASR_SAMPLE_RATE = 16000
ASR_MODEL_ID = "UsefulSensors/moonshine-base"
asr_processor = AutoProcessor.from_pretrained(ASR_MODEL_ID)
asr_model = MoonshineForConditionalGeneration.from_pretrained(ASR_MODEL_ID).eval()

MAX_INPUT_CHARS = 1000

# Sampling temperature shown by default, and the value the slider is switched to
# while a reference clip is present.
DEFAULT_TEMPERATURE = 0.7
CLONE_TEMPERATURE = 0.4

DESCRIPTION = """
# Miso TTS 8B

Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an 8B
[Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes from text,
with optional voice continuation from a reference clip.
"""


def _to_mono(audio: torch.Tensor, sr: int, target_sr: int) -> torch.Tensor:
    """Average over dim 0 when *audio* has more than one dim, then resample.

    Args:
        audio: Waveform tensor; anything with ``ndim > 1`` is averaged over dim 0.
        sr: Sample rate of *audio*.
        target_sr: Desired sample rate; resampling is skipped when it equals *sr*.

    Returns:
        The (possibly averaged and resampled) waveform.
    """
    if audio.ndim > 1:
        audio = audio.mean(dim=0)
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=target_sr)
    return audio


def _resample_to_model(audio: torch.Tensor, sr: int) -> torch.Tensor:
    """Return *audio* averaged over dim 0 (if multi-dim) at the generator's sample rate."""
    return _to_mono(audio, sr, SAMPLE_RATE)


def transcribe(ref_audio_path):
    """CPU-only auto-transcription of the reference clip (runs on the always-on host).

    Args:
        ref_audio_path: Path to the reference audio file, or a falsy value when
            no clip is set.

    Returns:
        The stripped transcript decoded from the ASR output, or ``gr.update()``
        (no field changes specified) when *ref_audio_path* is falsy.
    """
    if not ref_audio_path:
        return gr.update()

    wav, sr = torchaudio.load(ref_audio_path)
    wav = _to_mono(wav, sr, ASR_SAMPLE_RATE)
    inputs = asr_processor(wav.numpy(), sampling_rate=ASR_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        tokens = asr_model.generate(**inputs)
    return asr_processor.decode(tokens[0], skip_special_tokens=True).strip()


def _build_reference_context(ref_audio_path, ref_text, speaker_id):
    """Turn the reference clip and its transcript into a one-segment context list.

    Args:
        ref_audio_path: Path to the reference audio file.
        ref_text: Transcript of the reference audio.
        speaker_id: Speaker index stored on the segment (converted with ``int``).

    Returns:
        ``[Segment(...)]`` holding the frame-aligned reference audio, or ``[]``
        when the clip is shorter than one Mimi frame.

    Raises:
        gr.Error: If *ref_text* is empty or whitespace-only.
    """
    if not (ref_text or "").strip():
        raise gr.Error("Please provide the transcript of the reference audio.")

    wav, sr = torchaudio.load(ref_audio_path)
    wav = _resample_to_model(wav, sr)

    # Keep only whole Mimi frames (see the MIMI_FRAME_SIZE note at module level).
    usable = (wav.shape[-1] // MIMI_FRAME_SIZE) * MIMI_FRAME_SIZE
    if usable < SAMPLE_RATE:  # under ~1s of usable audio
        gr.Warning("The reference audio may be too short; result quality may suffer.")
    if usable <= 0:
        # Less than a single frame: generate without any reference context.
        return []

    wav = wav[:usable].to(device)
    return [Segment(speaker=int(speaker_id), text=ref_text.strip(), audio=wav)]


@spaces.GPU(duration=120)
def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_s, temperature, topk):
    """Generate speech for *text*, optionally continuing from a reference clip.

    Args:
        text: Text to synthesize; must be non-empty after stripping and at most
            ``MAX_INPUT_CHARS`` characters.
        ref_audio_path: Optional path to a reference clip; falsy disables cloning.
        ref_text: Transcript of the reference clip; required when a clip is given.
        speaker_id: Speaker index passed to the generator (converted with ``int``).
        max_length_s: Maximum output length in seconds.
        temperature: Sampling temperature (converted with ``float``).
        topk: Top-k sampling cutoff (converted with ``int``).

    Returns:
        ``(SAMPLE_RATE, samples)`` where *samples* is an int16 NumPy array.

    Raises:
        gr.Error: If the text is empty or too long, or a reference clip is given
            without a transcript.
    """
    text = (text or "").strip()
    if not text:
        raise gr.Error("Please enter some text to synthesize.")
    if len(text) > MAX_INPUT_CHARS:
        raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")

    # ZeroGPU streams weights to the real GPU on first entry but leaves the torchtune
    # KV-cache's non-persistent buffers (e.g. cache_pos) behind, causing a cuda/cpu
    # device mismatch. Re-place the model on the device here, inside the GPU worker.
    generator._model.to(device)
    generator._audio_tokenizer.to(device)

    context = []
    if ref_audio_path:
        context = _build_reference_context(ref_audio_path, ref_text, speaker_id)

    audio = generator.generate(
        text=text,
        speaker=int(speaker_id),
        context=context,
        max_audio_length_ms=float(max_length_s) * 1000.0,
        temperature=float(temperature),
        topk=int(topk),
    )

    # Scale to the int16 range, clamping so full-scale samples cannot overflow.
    audio_np = (audio * 32768).clamp(-32768, 32767).to(torch.int16).cpu().numpy()
    return SAMPLE_RATE, audio_np


def _temperature_for_reference(ref_audio_path):
    """Return the temperature slider value for the current reference-clip state."""
    # Cloning tracks the reference much more closely at low temperature.
    return CLONE_TEMPERATURE if ref_audio_path else DEFAULT_TEMPERATURE


with gr.Blocks(title="Miso TTS 8B") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Text to synthesize",
                placeholder="Hello from Miso.",
                lines=3,
                value="Hello from Miso. This is an eight billion parameter text to speech model.",
            )
            with gr.Accordion("Voice cloning (optional)", open=False):
                ref_audio = gr.Audio(label="Reference audio", type="filepath")
                ref_text = gr.Textbox(
                    label="Reference transcript (auto-filled on upload)",
                    placeholder="The exact words spoken in the reference audio.",
                    lines=2,
                )
            with gr.Accordion("Advanced", open=False):
                speaker_id = gr.Slider(0, 1, value=0, step=1, label="Speaker ID")
                max_length = gr.Slider(2, 60, value=10, step=1, label="Max audio length (s)")
                temperature = gr.Slider(
                    0.1,
                    1.5,
                    value=DEFAULT_TEMPERATURE,
                    step=0.05,
                    label="Temperature (auto-lowered when cloning a voice)",
                )
                topk = gr.Slider(1, 100, value=50, step=1, label="Top-k")
            run = gr.Button("Generate", variant="primary")
        with gr.Column():
            out = gr.Audio(label="Generated speech")

    # Two listeners on the same change event: one fills the transcript, the other
    # adjusts the temperature slider.
    ref_audio.change(transcribe, inputs=[ref_audio], outputs=[ref_text])
    ref_audio.change(_temperature_for_reference, inputs=[ref_audio], outputs=[temperature])
    run.click(
        synthesize,
        inputs=[text, ref_audio, ref_text, speaker_id, max_length, temperature, topk],
        outputs=[out],
    )

demo.queue().launch()