# NOTE: blob-viewer residue (file-size banner, commit hashes, line-number gutter) was removed from this header.
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np
# --- Configuration ---
# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- ADD ALL YOUR VOICE FILES HERE ---
# Every entry becomes one choice in the voice dropdown.
# The paths are used exactly as written, so they are resolved relative to the
# current working directory.
VOICE_SAMPLE_FILES = [
    "46.wav",
    "90.wav",
    "150.wav",
    "355.wav",
]

# Directory that holds the saved speaker embedding files; created on import if missing.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)
# --- Load Models ---
# Loads, at import time, the text processor, the TTS model, the vocoder and the
# speaker-embedding encoder. The TTS model, vocoder and encoder are placed on
# `device`; the processor is not moved.
try:
    print("Loading models... This may take a moment.")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb"),
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain the original exception so the real cause (network, disk, version
    # mismatch, ...) stays visible in the traceback instead of being flattened
    # into the message text only.
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.") from e
# In-memory cache of speaker embeddings, keyed by the voice file path as passed in.
speaker_embeddings_cache = {}


# --- Function to Get or Create Speaker Embedding ---
def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a voice sample, computing it if needed.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a saved ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
      3. a fresh embedding computed from the audio file, which is then saved
         to ``EMBEDDING_DIR`` and stored in the cache.

    Args:
        wav_file_path: Path of the voice sample audio file.

    Returns:
        The embedding tensor, located on ``device``.

    Raises:
        gr.Error: If the audio file does not exist (and no saved embedding is
            available), or if loading/encoding the audio fails.
    """
    # Check cache first
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    # NOTE: the on-disk name uses only the basename, so two samples with the
    # same file name in different directories share one saved embedding.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        print(f"Loading existing embedding for {wav_file_path}")
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    print(f"Creating new speaker embedding for {wav_file_path}...")
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}. Please make sure it's in the correct directory.")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Bring the sample to 16 kHz before encoding.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # Down-mix multi-channel audio to a single channel by averaging.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()

        # Save a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)

        # Move once and reuse the same tensor for the cache and the return value.
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        print(f"Embedding created and saved for {wav_file_path}.")
        return embedding
    except Exception as e:
        # Keep the underlying exception attached for debugging.
        raise gr.Error(
            f"Could not process audio file {wav_file_path}. Is it a valid WAV file? Error: {e}"
        ) from e
# --- Text Processing Functions (Somali Number Conversion) ---
# Lookup table of the Somali number words that are not composed by rule:
# 0-19, the tens 20-90, 100 and 1000.
number_words = {
    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(n):
    """Spell out the integer *n* in Somali words.

    Values from 0 to 999,999 are converted. Anything outside that range
    (negative numbers, or 1,000,000 and above) is returned unchanged as a
    digit string.

    Args:
        n: The integer to convert.

    Returns:
        The Somali wording, or ``str(n)`` when *n* is out of the supported range.
    """
    if n in number_words:
        return number_words[n]

    # Negative values have no entry in the table; without this guard the
    # tens lookup below would raise KeyError (e.g. -5 -> number_words[-10]).
    if n < 0:
        return str(n)

    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        if units:
            words += " iyo " + number_words[units]
        return words

    if n < 1000:
        hundreds, rest = divmod(n, 100)
        # A single hundred is just "boqol", without a leading "kow".
        words = number_words[hundreds] + " boqol" if hundreds > 1 else "boqol"
        if rest:
            words += " iyo " + number_to_words(rest)
        return words

    if n < 1000000:
        thousands, rest = divmod(n, 1000)
        # A single thousand is just "kun", without a leading "kow".
        words = number_to_words(thousands) + " kun" if thousands > 1 else "kun"
        if rest:
            words += " iyo " + number_to_words(rest)
        return words

    return str(n)
def replace_numbers_with_words(text):
    """Return *text* with each standalone run of digits spelled out in words.

    A run only counts when it is bounded by word boundaries on both sides, so
    digits glued to letters (e.g. "abc123") are left untouched.
    """
    def _spell_out(match):
        # Convert the matched digit run to an int and hand it to the speller.
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    The text is lower-cased, digit runs are replaced by words, and every
    character that is not a word character, whitespace or an apostrophe is
    removed (deleted, not replaced by a space).
    """
    spoken = replace_numbers_with_words(text.lower())
    return re.sub(r'[^\w\s\']', '', spoken)
# --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Generate speech audio for *text* in the selected voice.

    Args:
        text: The text typed by the user.
        voice_choice: The voice sample file chosen in the dropdown; passed to
            ``get_speaker_embedding`` to obtain the speaker embedding.

    Returns:
        ``(16000, waveform)`` where ``waveform`` is a NumPy array, or ``None``
        when there is nothing to synthesize (a warning is shown instead).

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding`` when the voice
            sample cannot be loaded.
    """
    # The interface has a single Audio output, so "no result" must be a single
    # None. A (None, None) tuple would be read as (sample_rate, data).
    if not text or not text.strip():
        gr.Warning("Please enter some text.")
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    # Get the correct speaker embedding for the chosen voice
    speaker_embedding = get_speaker_embedding(voice_choice)

    normalized_text = normalize_text(text)
    # Input made only of punctuation normalizes to nothing; skip synthesis.
    if not normalized_text.strip():
        gr.Warning("Please enter some text.")
        return None

    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            # Add a leading batch dimension to the stored embedding.
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            vocoder=vocoder,
        )
    return (16000, speech.cpu().numpy())
# --- Gradio Interface ---
# The user interface includes a dropdown menu for voice selection.
# The default voice and the example rows are derived from VOICE_SAMPLE_FILES
# with the same empty-list guard, so an empty list no longer raises IndexError
# while building the examples.
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_second_voice = VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else _default_voice
_example_rows = (
    [
        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", _default_voice],
        ["Nabad gelyo, is arag dambe.", _second_voice],
    ]
    if VOICE_SAMPLE_FILES
    else None  # no voices configured -> no examples
)

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
        gr.Dropdown(
            VOICE_SAMPLE_FILES,
            label="Select Voice",
            info="Choose the voice you want to use for the speech.",
            value=_default_voice,  # Default to the first voice
        ),
    ],
    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
    examples=_example_rows,
)
# Launch the web interface
if __name__ == "__main__":
    # Compute or load every voice embedding before launching, so the first
    # request for each voice does not have to wait for it.
    print("Pre-loading all voice embeddings...")
    for voice_file in VOICE_SAMPLE_FILES:
        get_speaker_embedding(voice_file)
    print("All voices are ready. Launching interface.")
    iface.launch(share=True)