#!/usr/bin/env python3
# Viewer metadata retained from extraction: "File size: 4,237 Bytes", id 9b498f2.
"""
Nigerian TTS API - YarnGPT-based TTS for Nigerian languages.
Runs on HuggingFace Spaces with T4 GPU.
"""
import io
import logging
import time
import tempfile
import torch
import torchaudio
import gradio as gr
# Configure logging once at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Check GPU: use CUDA when PyTorch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
log.info(f"Device: {DEVICE}")
if DEVICE == "cuda":
    log.info(f"GPU: {torch.cuda.get_device_name(0)}")
def get_system_info():
    """Describe the runtime as a multi-line string.

    Always reports the selected device and the PyTorch version (each line
    newline-terminated). When the device is CUDA, the name of GPU 0 and its
    total memory in GB (bytes / 1e9, one decimal) are appended; the memory
    line has no trailing newline.
    """
    parts = [
        f"Device: {DEVICE}\n",
        f"PyTorch: {torch.__version__}\n",
    ]
    if DEVICE == "cuda":
        parts.append(f"GPU: {torch.cuda.get_device_name(0)}\n")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        parts.append(f"Memory: {total_gb:.1f} GB")
    return "".join(parts)
def tts_synthesize(text: str, speaker: str, language: str):
    """Synthesize speech for *text* with YarnGPT and write it to a WAV file.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected without calling the model.
        speaker: Speaker name, forwarded unchanged to ``generate_speech``.
        language: Language name, forwarded unchanged to ``generate_speech``.

    Returns:
        A ``(wav_path, error)`` pair: ``(path, None)`` on success, or
        ``(None, message)`` when the input is empty or synthesis fails.
        Failures are reported through the second element rather than raised.
    """
    import os  # local import: only needed for temp-file cleanup below

    # A missing value must not crash on ``.strip()``; treat it like empty text.
    if text is None or not text.strip():
        return None, "Empty text"

    log.info(f"TTS: text='{text[:50]}...', speaker={speaker}, lang={language}")
    start = time.time()
    wav_path = None
    try:
        # Imported inside the try so a missing or broken yarngpt install is
        # reported as a status message instead of an unhandled exception.
        from yarngpt import generate_speech

        # Generate speech
        audio_tensor = generate_speech(
            text=text,
            speaker=speaker,
            language=language,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4000,
        )
        elapsed = time.time() - start
        log.info(f"TTS done in {elapsed:.1f}s")

        # Reserve a unique path and close the handle before torchaudio opens
        # the same path for writing. delete=False keeps the file on disk so
        # the returned path stays valid after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wav_path = f.name
        # NOTE(review): 24000 Hz is assumed to match the model output - confirm.
        torchaudio.save(wav_path, audio_tensor, sample_rate=24000)
        return wav_path, None
    except Exception as e:
        # log.exception records the message and the traceback together.
        log.exception(f"TTS error: {e}")
        if wav_path is not None:
            # Best effort: do not leave an empty or partial WAV behind.
            try:
                os.unlink(wav_path)
            except OSError:
                pass
        return None, str(e)
# Available speakers by language. Keys are the language names offered in the
# UI; the first speaker of each list is used as that language's default.
SPEAKERS = {
    "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
    "yoruba": ["abayomi", "aisha", "folake"],
    "igbo": ["chioma", "obinna", "adanna"],
    "hausa": ["amina", "fatima", "ibrahim", "yusuf"],
}

# Every speaker across all languages, flattened in SPEAKERS order. A
# comprehension keeps the loop variable out of the module namespace.
ALL_SPEAKERS = [name for group in SPEAKERS.values() for name in group]
def update_speakers(language):
    """Build the speaker dropdown for the chosen language.

    Looks *language* up in ``SPEAKERS``; an unknown language falls back to
    the English speaker list. The first speaker of the list is preselected.
    """
    if language in SPEAKERS:
        options = SPEAKERS[language]
    else:
        options = SPEAKERS["english"]
    return gr.Dropdown(choices=options, value=options[0])
# Gradio UI
with gr.Blocks(title="Nigerian TTS API") as demo:
    gr.Markdown("# 🎙️ Nigerian TTS API")
    gr.Markdown("YarnGPT-based Text-to-Speech for Nigerian languages")

    with gr.Row():
        # Input side: text, language, speaker, and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text to synthesize...",
                lines=3,
            )
            # Choices come from the SPEAKERS table so the language list and
            # the per-language speaker lists cannot drift apart.
            language = gr.Dropdown(
                label="Language",
                choices=list(SPEAKERS),
                value="english",
            )
            speaker = gr.Dropdown(
                label="Speaker",
                choices=SPEAKERS["english"],
                value=SPEAKERS["english"][0],
            )
            submit_btn = gr.Button("🔊 Synthesize", variant="primary")

        # Output side: generated audio, status text, and system info.
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio", type="filepath")
            error_output = gr.Textbox(label="Status", visible=True)
            # NOTE(review): nesting was reconstructed from a source whose
            # indentation was lost - confirm sys_info belongs in this column.
            sys_info = gr.Textbox(label="System Info", value=get_system_info(), lines=4)

    # Update speakers when language changes
    language.change(fn=update_speakers, inputs=[language], outputs=[speaker])

    # Generate speech on button click
    submit_btn.click(
        fn=tts_synthesize,
        inputs=[text_input, speaker, language],
        outputs=[audio_output, error_output],
    )

    # Example inputs
    # NOTE(review): if selecting an example fires language.change, the
    # speaker may be reset to that language's first entry - verify.
    gr.Examples(
        examples=[
            ["Hello, how are you today?", "idera", "english"],
            ["The weather in Lagos is beautiful.", "emma", "english"],
            ["Bawo ni, e ku ojo.", "abayomi", "yoruba"],
            ["How you dey, my brother?", "jude", "english"],
        ],
        inputs=[text_input, speaker, language],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()