File size: 2,964 Bytes
888244a
61aea24
888244a
61aea24
888244a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61aea24
 
 
 
 
888244a
 
 
 
 
 
 
61aea24
888244a
 
 
61aea24
888244a
61aea24
888244a
61aea24
888244a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61aea24
 
 
 
888244a
 
 
 
61aea24
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import time, os, shutil, subprocess, tempfile
import numpy as np
import gradio as gr
import soundfile as sf
import torch
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.utils.fetching import LocalStrategy

# Sample rate (Hz) passed to soundfile when the synthesized waveform is saved.
# NOTE(review): presumably the output rate of the HiFi-GAN vocoder loaded
# below — confirm against the model card before changing it.
SAMPLE_RATE = 22050

# ---- Load models once (on Space startup) ----
# Both models are loaded at import time, so every request reuses the same
# instances instead of reloading them per call.
# NOTE(review): LocalStrategy.COPY presumably copies the fetched files into
# `savedir` rather than symlinking them — confirm in the SpeechBrain docs.

# Text -> mel-spectrogram model (used through `taco.encode_text`).
taco = Tacotron2.from_hparams(
    source="Sunbird/tts-tacotron2-lug",
    savedir="pretrained/tts-tacotron2-lug",
    local_strategy=LocalStrategy.COPY,
)
# Mel-spectrogram -> waveform model (used through `vocoder.decode_batch`).
vocoder = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained/tts-hifigan-ljspeech",
    local_strategy=LocalStrategy.COPY,
)

def _ensure_mel_shape(mel, n_mels=80):
    """Normalize a mel-spectrogram to the layout ``[B, n_mels, T]``.

    Args:
        mel: A tensor, or a tuple/list whose first element is the mel tensor
            (only that first element is used).
        n_mels: Number of mel channels, used to recognise which axis is the
            channel axis. Defaults to 80, the value that used to be
            hard-coded here.

    Returns:
        The mel tensor. A 2-D input gains a leading batch axis of size 1. A
        3-D input laid out as ``[B, T, n_mels]`` is transposed to
        ``[B, n_mels, T]``. Any other input is returned unchanged.
    """
    if isinstance(mel, (tuple, list)):
        mel = mel[0]
    # An unbatched spectrogram would otherwise skip the layout check below.
    if mel.dim() == 2:
        mel = mel.unsqueeze(0)
    # Only swap when the layout is unambiguous: if both trailing axes equal
    # n_mels there is no way to tell them apart, so the tensor is left as is.
    if mel.dim() == 3 and mel.shape[1] != n_mels and mel.shape[2] == n_mels:
        mel = mel.transpose(1, 2)
    return mel

def _have_ffmpeg():
    """Return True when an ``ffmpeg`` executable can be located on PATH."""
    ffmpeg_location = shutil.which("ffmpeg")
    return ffmpeg_location is not None

def _save_wav_np(path, wav_tensor):
    """Write a waveform tensor to ``path`` as a 16-bit PCM WAV file.

    The tensor is detached, moved to the CPU and converted to a float32
    NumPy array, then written with soundfile at ``SAMPLE_RATE`` (so no
    torchaudio backend is needed). The caller is expected to pass float
    mono audio in [-1, 1].
    """
    samples = wav_tensor.detach().cpu().numpy()
    samples = samples.astype(np.float32)
    sf.write(path, samples, SAMPLE_RATE, subtype="PCM_16")

def tts_luganda(text):
    """Synthesize speech for Luganda ``text`` and save it as audio files.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            rejected without running the models.

    Returns:
        A ``(wav_path, mp3_path, status)`` tuple:

        * ``wav_path`` - path of the generated WAV file, or ``None`` when the
          input was empty.
        * ``mp3_path`` - path of the MP3 rendition, or ``None`` when ffmpeg is
          not available or the conversion failed.
        * ``status`` - human-readable status message.
    """
    text = (text or "").strip()
    if not text:
        return None, None, "Please enter Luganda text."

    # Synthesize: text -> mel-spectrogram -> waveform.
    mel = _ensure_mel_shape(taco.encode_text(text))
    wav = vocoder.decode_batch(mel)[0].squeeze(0)  # 1D torch tensor

    # Give every request its own file. A name built from a whole-second
    # timestamp let two requests in the same second overwrite each other.
    fd, wav_path = tempfile.mkstemp(prefix="luganda_tts_", suffix=".wav")
    os.close(fd)  # soundfile reopens the file by path
    _save_wav_np(wav_path, wav)

    # Optional MP3 via ffmpeg (best effort: the WAV alone is still a success).
    mp3_path = None
    if _have_ffmpeg():
        candidate = os.path.splitext(wav_path)[0] + ".mp3"
        try:
            subprocess.run(
                ["ffmpeg", "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-q:a", "2", candidate],
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except (subprocess.SubprocessError, OSError):
            # Conversion failed or ffmpeg could not be started: drop any
            # partial output and fall back to WAV only.
            try:
                os.remove(candidate)
            except OSError:
                pass
        else:
            mp3_path = candidate

    status = "✅ Done."
    if mp3_path:
        status += " (WAV + MP3 ready)"
    else:
        status += " (WAV ready)"

    return wav_path, mp3_path, status

# ---- Gradio UI: one text input, a WAV player, an MP3 download slot and a status line ----
with gr.Blocks(title="Luganda TTS") as demo:
    gr.Markdown("# 🌍 Luganda Text-to-Speech\nType Luganda, click **Generate**, and listen/download the audio.")
    text = gr.Textbox(label="Luganda text", lines=6, value="Ngenda mu kibuga Kampala olunaku lwa leero.")
    btn = gr.Button("Generate", variant="primary")
    # type="filepath": the callback supplies a path to the WAV file, not raw samples.
    out_wav = gr.Audio(label="WAV (22.05 kHz)", type="filepath")
    # Receives the MP3 path, or None when no MP3 was produced.
    out_mp3 = gr.File(label="Download MP3", interactive=False)
    status = gr.Markdown("Ready.")

    # The order of `outputs` must match the (wav_path, mp3_path, status)
    # tuple returned by tts_luganda.
    btn.click(fn=tts_luganda, inputs=text, outputs=[out_wav, out_mp3, status])

# Just enable queue with defaults (no unsupported args)
# and start the app; this runs at module import time.
demo.queue().launch()