import gradio as gr
import numpy as np
import torch
import tempfile
import os
from scipy.io.wavfile import write
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)

# =========================
# Model loading
# =========================
checkpoint = "Chithekitale/chichewa_tts_norules"
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
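
# Optional (not part of the original flow): on a GPU-enabled Space the models
# could be moved to CUDA; the tensors built in predict() below would then need
# matching .to(device) calls. A minimal sketch, assuming CPU fallback:
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = model.to(device)
#   vocoder = vocoder.to(device)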

# Map speaker keys to pre-computed speaker embedding files.
# Note: SPK4 and SPK5 reuse the SPK2 and SPK1 files, so they produce the
# same two voices under different labels.
speaker_embeddings = {
    "SPK1": "cmu_us_slt_arctic-wav-arctic_a0508.npy",
    "SPK2": "cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SPK3": "cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "SPK4": "cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SPK5": "cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
SPEAKER_CHOICES = [
    "SPK1 (female)",
    "SPK2 (male)",
    "SPK3 (male)",
    "SPK4 (male)",
    "SPK5 (female)",
]
EXAMPLES = [
    ["Ndapita, koma ndibweranso pompano.", "SPK1 (female)"],
    ["Koma apapa zikuoneka kuti ziyenda bwino.", "SPK2 (male)"],
    ["Ineyo ndikuona kuti sizizasithanso.", "SPK3 (male)"],
    ["Mwina kusogolo kuno anthu ena azalimba mtima, koma panopana ndakaika.", "SPK4 (male)"],
    ["Simungasankhe munthu oti bola linamukana.", "SPK5 (female)"],
    ["Kodi chimanga panopa chikugulisidwa zingati, kapena nanunso simukudziwa?", "SPK5 (female)"],
]

SAMPLE_RATE = 16000

# =========================
# Helpers
# =========================
def get_speaker_key(speaker_label: str) -> str:
    """Extract the speaker key from a UI label, e.g. "SPK1 (female)" -> "SPK1"."""
    return speaker_label.split()[0]

def load_speaker_embedding(speaker: str) -> np.ndarray:
    speaker_key = get_speaker_key(speaker)
    if speaker_key not in speaker_embeddings:
        raise ValueError(f"Unknown speaker key: {speaker_key}")

    path = speaker_embeddings[speaker_key]
    try:
        speaker_embedding = np.load(path).astype(np.float32)
    except Exception as e:
        raise FileNotFoundError(
            f"Could not load speaker embedding file: {path}. Error: {e}"
        )

    # Collapse any extra dimensions down to a single 512-dim vector.
    if speaker_embedding.ndim == 2:
        speaker_embedding = speaker_embedding.mean(axis=0)
    speaker_embedding = np.squeeze(speaker_embedding)

    if speaker_embedding.shape != (512,):
        raise ValueError(
            f"Unexpected speaker embedding shape after processing: "
            f"{speaker_embedding.shape}. Expected (512,)"
        )
    return speaker_embedding
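
# The .npy files above are expected to hold 512-dim x-vectors, matching the
# shape check in load_speaker_embedding(). A minimal sketch of how such a file
# could be produced, assuming SpeechBrain's "speechbrain/spkrec-xvect-voxceleb"
# encoder (the usual pairing for SpeechT5; not used by this app at runtime):
#
#   from speechbrain.pretrained import EncoderClassifier
#   classifier = EncoderClassifier.from_hparams("speechbrain/spkrec-xvect-voxceleb")
#   signal = torch.tensor(wav_16khz, dtype=torch.float32).unsqueeze(0)  # mono, 16 kHz
#   xvector = classifier.encode_batch(signal)  # shape (1, 1, 512)
#   np.save("speaker.npy", xvector.squeeze().numpy())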

def save_audio_to_wav(audio: np.ndarray, sample_rate: int = SAMPLE_RATE) -> str:
    """Save generated int16 audio to a temporary WAV file and return its path."""
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()
    write(temp_file.name, sample_rate, audio)
    return temp_file.name

# =========================
# Inference
# =========================
def predict(text, speaker):
    try:
        if not text or not text.strip():
            return None, None, "Please enter some Chichewa text."

        # Tokenize and truncate to the model's maximum text length.
        inputs = processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

        speaker_embedding = load_speaker_embedding(speaker)
        speaker_embedding = torch.tensor(
            speaker_embedding, dtype=torch.float32
        ).unsqueeze(0)

        with torch.no_grad():
            speech = model.generate_speech(
                input_ids,
                speaker_embedding,
                vocoder=vocoder,
            )
        speech = speech.cpu().numpy()

        # Normalize safely before int16 conversion.
        max_val = np.max(np.abs(speech))
        if max_val > 0:
            speech = speech / max_val
        speech = (speech * 32767).astype(np.int16)

        # Save a WAV file for downloading.
        wav_path = save_audio_to_wav(speech, SAMPLE_RATE)

        status = f"Generated speech successfully using speaker: {speaker}"
        return (SAMPLE_RATE, speech), wav_path, status
    except Exception as e:
        return None, None, f"Error during generation: {str(e)}"

def clear_all():
    return "", "SPK1 (female)", None, None, "Ready."

# =========================
# UI
# =========================
custom_css = """
.gradio-container {
max-width: 1100px !important;
margin: 0 auto;
}
.hero {
text-align: center;
padding: 10px 0 0 0;
}
.section-note {
font-size: 0.95rem;
opacity: 0.9;
}
"""
with gr.Blocks(css=custom_css, title="Baseline Chichewa Speech Synthesis Demo") as demo:
    gr.HTML(
        """
        <div class="hero">
            <h1>Baseline Chichewa Synthesis</h1>
            <p class="section-note">
                Enter Chichewa text, choose a speaker voice, and generate speech audio.
            </p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Type Chichewa text here...",
                lines=6,
            )
            speaker_input = gr.Radio(
                label="Speaker Voice",
                choices=SPEAKER_CHOICES,
                value="SPK1 (female)",
            )
            with gr.Row():
                generate_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")
            status_box = gr.Textbox(
                label="System Status",
                value="Ready.",
                interactive=False,
            )
        with gr.Column(scale=5):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=False,
            )
            download_file = gr.File(
                label="Download Audio File"
            )

    gr.Markdown("### Example Inputs")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text_input, speaker_input],
    )

    generate_btn.click(
        fn=predict,
        inputs=[text_input, speaker_input],
        outputs=[audio_output, download_file, status_box],
        show_progress="full",
    )
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[text_input, speaker_input, audio_output, download_file, status_box],
    )

demo.launch()