Spaces:
Running on Zero
Running on Zero
| import os | |
| os.environ.setdefault("NO_TORCH_COMPILE", "1") | |
| import spaces | |
| import numpy as np | |
| import torch | |
| import torchaudio | |
| import gradio as gr | |
| from transformers import AutoProcessor, MoonshineForConditionalGeneration | |
| from generator import Segment, load_miso_8b | |
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The TTS generator is loaded once at import time and shared by every request.
generator = load_miso_8b(model_path_or_repo_id="MisoLabs/MisoTTS", device=device)
SAMPLE_RATE = generator.sample_rate

# Mimi encodes audio in fixed-size frames. moshi 0.2.12 and the repo's pinned 0.2.2
# pad a partial trailing frame differently, so reference clips are trimmed to a whole
# number of frames; that keeps the reference codes byte-identical (every full frame
# already matches 1:1).
MIMI_FRAME_SIZE = int(generator._audio_tokenizer.frame_size)

# Moonshine ASR auto-transcribes reference clips. It stays on the CPU and is never
# called from inside an @spaces.GPU function, so it does not consume the ZeroGPU quota.
ASR_SAMPLE_RATE = 16_000
asr_processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-base")
asr_model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-base")
asr_model.eval()

# Longest text (in characters) accepted for a single synthesis request.
MAX_INPUT_CHARS = 1_000

# Markdown shown at the top of the demo page.
DESCRIPTION = """
# Miso TTS 8B
Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
from text, with optional voice continuation from a reference clip.
"""
def _resample_to_model(audio: torch.Tensor, sr: int) -> torch.Tensor:
    """Downmix ``audio`` to mono and bring it to the generator's sample rate.

    Args:
        audio: Waveform tensor. When it has more than one dimension, the first
            dimension is averaged away (presumably the channel axis as returned
            by ``torchaudio.load`` — confirm against callers).
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The downmixed waveform, resampled to ``SAMPLE_RATE`` only when ``sr``
        differs from it.
    """
    mono = audio if audio.ndim <= 1 else audio.mean(dim=0)
    if sr == SAMPLE_RATE:
        # Already at the model rate; nothing to resample.
        return mono
    return torchaudio.functional.resample(mono, orig_freq=sr, new_freq=SAMPLE_RATE)
def transcribe(ref_audio_path):
    """CPU-only auto-transcription of the reference clip (runs on the always-on host).

    Args:
        ref_audio_path: Path to the uploaded reference clip, or a falsy value when
            no clip is set.

    Returns:
        The stripped transcript, or ``gr.update()`` (leave the transcript box
        untouched) when there is no clip or the clip contains no samples.
    """
    if not ref_audio_path:
        return gr.update()

    wav, sr = torchaudio.load(ref_audio_path)
    # Downmix to mono, then bring the clip to the rate the ASR model is fed at.
    wav = wav.mean(dim=0) if wav.ndim > 1 else wav
    if sr != ASR_SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=ASR_SAMPLE_RATE)

    # A zero-length clip has nothing to transcribe; skip the model entirely.
    if wav.numel() == 0:
        return gr.update()

    inputs = asr_processor(wav.numpy(), sampling_rate=ASR_SAMPLE_RATE, return_tensors="pt")

    # Bound decoding by the clip duration (about 6.5 tokens per second of audio, the
    # limit suggested on the Moonshine model card) so the model cannot run into a
    # hallucination loop on short or noisy clips.
    duration_s = wav.shape[-1] / ASR_SAMPLE_RATE
    max_new_tokens = max(1, int(duration_s * 6.5))

    with torch.no_grad():
        tokens = asr_model.generate(**inputs, max_new_tokens=max_new_tokens)
    return asr_processor.decode(tokens[0], skip_special_tokens=True).strip()
# ZeroGPU only attaches a GPU inside functions decorated with ``spaces.GPU``.
# NOTE(review): this uses the decorator's default time budget; pass ``duration=...``
# if long generations get cut off.
@spaces.GPU
def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_s, temperature, topk):
    """Generate speech for ``text``, optionally continuing the voice of a reference clip.

    Args:
        text: Text to speak. It is stripped and must be non-empty and at most
            ``MAX_INPUT_CHARS`` characters long.
        ref_audio_path: Optional path to a reference clip used as voice context.
        ref_text: Transcript of the reference clip; required when a clip is given.
        speaker_id: Speaker index (cast to ``int``) used for both the reference
            segment and the generated utterance.
        max_length_s: Upper bound on the generated audio length, in seconds.
        temperature: Sampling temperature forwarded to the generator.
        topk: Top-k value forwarded to the generator.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an int16 NumPy array.

    Raises:
        gr.Error: If the text is empty or too long, or if a reference clip is
            supplied without a transcript.
    """
    text = (text or "").strip()
    if not text:
        raise gr.Error("Please enter some text to synthesize.")
    if len(text) > MAX_INPUT_CHARS:
        raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")

    # ZeroGPU streams weights to the real GPU on first entry but leaves the torchtune
    # KV-cache's non-persistent buffers (e.g. cache_pos) behind, causing a cuda/cpu
    # device mismatch. Re-place the model on the device here, inside the GPU worker.
    generator._model.to(device)
    generator._audio_tokenizer.to(device)

    speaker = int(speaker_id)
    context = []
    if ref_audio_path:
        transcript = (ref_text or "").strip()
        if not transcript:
            raise gr.Error("Please provide the transcript of the reference audio.")

        wav, sr = torchaudio.load(ref_audio_path)
        wav = _resample_to_model(wav, sr)

        # Keep only whole Mimi frames so the reference encodes deterministically.
        usable = (wav.shape[-1] // MIMI_FRAME_SIZE) * MIMI_FRAME_SIZE
        if usable < SAMPLE_RATE:  # under ~1s of usable audio
            gr.Warning("The reference audio may be too short; result quality may suffer.")
        # With less than one full frame there is nothing to condition on, so the
        # request falls back to plain (non-cloned) synthesis.
        if usable > 0:
            wav = wav[:usable].to(device)
            context = [Segment(speaker=speaker, text=transcript, audio=wav)]

    audio = generator.generate(
        text=text,
        speaker=speaker,
        context=context,
        max_audio_length_ms=float(max_length_s) * 1000.0,
        temperature=float(temperature),
        topk=int(topk),
    )

    # Scale the generator output to 16-bit PCM, clamping so that full-scale
    # positive samples cannot overflow int16.
    audio_np = (audio * 32768).clamp(-32768, 32767).to(torch.int16).cpu().numpy()
    return SAMPLE_RATE, audio_np
# Gradio UI: inputs and controls in the left column, generated audio on the right.
# NOTE(review): the nesting below was reconstructed from a listing that had lost its
# indentation — confirm the layout (in particular where the Generate button sits).
with gr.Blocks(title="Miso TTS 8B") as demo:
    gr.Markdown(value=DESCRIPTION)

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                value="Hello from Miso. This is an eight billion parameter text to speech model.",
                placeholder="Hello from Miso.",
                label="Text to synthesize",
                lines=3,
            )

            with gr.Accordion(label="Voice cloning (optional)", open=False):
                ref_audio = gr.Audio(type="filepath", label="Reference audio")
                ref_text = gr.Textbox(
                    placeholder="The exact words spoken in the reference audio.",
                    label="Reference transcript (auto-filled on upload)",
                    lines=2,
                )

            with gr.Accordion(label="Advanced", open=False):
                speaker_id = gr.Slider(
                    minimum=0, maximum=1, value=0, step=1, label="Speaker ID"
                )
                max_length = gr.Slider(
                    minimum=2, maximum=60, value=10, step=1, label="Max audio length (s)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.05,
                    label="Temperature (auto-lowered when cloning a voice)",
                )
                topk = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-k")

            run = gr.Button(value="Generate", variant="primary")

        with gr.Column():
            out = gr.Audio(label="Generated speech")

    # A new (or cleared) reference clip triggers the CPU-side auto-transcription.
    ref_audio.change(fn=transcribe, inputs=[ref_audio], outputs=[ref_text])

    # Cloning tracks the reference much more closely at low temperature, so the
    # slider drops to 0.4 while a clip is set and returns to 0.7 once it is removed.
    ref_audio.change(
        fn=lambda clip_path: 0.4 if clip_path else 0.7,
        inputs=[ref_audio],
        outputs=[temperature],
    )

    run.click(
        fn=synthesize,
        inputs=[text, ref_audio, ref_text, speaker_id, max_length, temperature, topk],
        outputs=[out],
    )

# Turn on request queuing, then start the server.
demo.queue()
demo.launch()