"""Gradio app: Luganda text-to-speech with SpeechBrain Tacotron2 + HiFi-GAN."""

import os
import shutil
import subprocess
import tempfile
import time
from typing import Optional

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.utils.fetching import LocalStrategy

SAMPLE_RATE = 22050

# ---- Load models once (on Space startup) ----
taco = Tacotron2.from_hparams(
    source="Sunbird/tts-tacotron2-lug",
    savedir="pretrained/tts-tacotron2-lug",
    local_strategy=LocalStrategy.COPY,
)
vocoder = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained/tts-hifigan-ljspeech",
    local_strategy=LocalStrategy.COPY,
)


def _ensure_mel_shape(mel, n_mels: int = 80):
    """Return the mel spectrogram laid out as [B, n_mels, T].

    If ``mel`` is a tuple or list, its first element is used. A 3-D tensor
    whose last axis (and not its middle axis) has size ``n_mels`` is
    transposed; anything else is returned untouched.
    """
    if isinstance(mel, (tuple, list)):
        mel = mel[0]
    if mel.dim() == 3 and mel.shape[1] != n_mels and mel.shape[2] == n_mels:
        mel = mel.transpose(1, 2)
    return mel


def _have_ffmpeg() -> bool:
    """Return True when an ``ffmpeg`` executable is found on PATH."""
    return shutil.which("ffmpeg") is not None


def _save_wav_np(path, wav_tensor) -> None:
    """Save float32 mono [-1,1] to WAV using soundfile (no torchaudio backend needed)."""
    samples = wav_tensor.detach().cpu().numpy().astype(np.float32)
    sf.write(path, samples, SAMPLE_RATE, subtype="PCM_16")


def _encode_mp3(wav_path: str) -> Optional[str]:
    """Transcode ``wav_path`` to an MP3 next to it; return its path or None.

    MP3 output is best-effort: None is returned when ffmpeg is missing or
    the conversion fails, so the caller can still serve the WAV.
    """
    if not _have_ffmpeg():
        return None

    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", wav_path,
             "-codec:a", "libmp3lame", "-q:a", "2", mp3_path],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # ffmpeg exited non-zero or could not be started. Drop any partial
        # output so a truncated file is never offered for download.
        try:
            os.remove(mp3_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return None
    return mp3_path


def tts_luganda(text):
    """Synthesize Luganda speech for ``text``.

    Returns a ``(wav_path, mp3_path, status)`` tuple matching the three
    Gradio outputs. ``wav_path`` and ``mp3_path`` are None when the input
    is empty; ``mp3_path`` is also None when MP3 encoding is unavailable
    or fails.
    """
    text = (text or "").strip()
    if not text:
        return None, None, "Please enter Luganda text."

    # Synthesize
    mel = _ensure_mel_shape(taco.encode_text(text))
    wav = vocoder.decode_batch(mel)[0].squeeze(0)  # 1D torch tensor

    # Save a temporary WAV. mkstemp guarantees a unique name, so requests
    # arriving within the same second cannot overwrite each other's files.
    fd, wav_path = tempfile.mkstemp(
        prefix=f"luganda_tts_{int(time.time())}_", suffix=".wav"
    )
    os.close(fd)  # soundfile reopens the file by path
    _save_wav_np(wav_path, wav)

    # Optional MP3 via ffmpeg
    mp3_path = _encode_mp3(wav_path)

    status = "✅ Done."
    if mp3_path:
        status += " (WAV + MP3 ready)"
    else:
        status += " (WAV ready)"
    return wav_path, mp3_path, status


with gr.Blocks(title="Luganda TTS") as demo:
    gr.Markdown("# 🌍 Luganda Text-to-Speech\nType Luganda, click **Generate**, and listen/download the audio.")
    text = gr.Textbox(label="Luganda text", lines=6, value="Ngenda mu kibuga Kampala olunaku lwa leero.")
    btn = gr.Button("Generate", variant="primary")
    out_wav = gr.Audio(label="WAV (22.05 kHz)", type="filepath")
    out_mp3 = gr.File(label="Download MP3", interactive=False)
    status = gr.Markdown("Ready.")
    btn.click(fn=tts_luganda, inputs=text, outputs=[out_wav, out_mp3, status])

# Just enable queue with defaults (no unsupported args)
demo.queue().launch()