# 8aad / app.py
# Somalitts's picture
# Update app.py
# 4aa5331 verified
# raw
# history blame
# 6.34 kB
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np
# --- Configuration ---
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# --- Voice sample files ---
# Add your reference recordings here. Author's note: the quality of these
# recordings matters most for the result, so make sure each file is clean.
VOICE_SAMPLE_FILES = ["1.wav"]

# Directory where computed speaker embeddings are stored on disk.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)
# --- Model loading ---
# All models are loaded once, when the module is imported.
try:
    print("Loading models...")
    # Text processor taken from the base SpeechT5 TTS checkpoint.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    # Text-to-speech model loaded from the "Somalitts/8aad" checkpoint.
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    # HiFi-GAN vocoder applied to the TTS model's output in text_to_speech.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    # SpeechBrain speaker encoder used to compute speaker embeddings.
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb"),
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain explicitly so the root cause stays attached to the startup error.
    raise gr.Error(f"Error loading models: {e}.") from e
# In-memory cache: audio file path -> speaker embedding.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference recording.

    Lookup order: the in-memory cache, then a ``.pt`` file in
    ``EMBEDDING_DIR`` named after the recording's basename, and finally a
    fresh computation from the audio with ``speaker_model``. A freshly
    computed embedding is saved to disk as a CPU tensor and cached in memory.

    Args:
        wav_file_path: Path of the reference audio file.

    Returns:
        The speaker embedding tensor.

    Raises:
        gr.Error: If the audio file is missing or cannot be processed.
    """
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        try:
            embedding = torch.load(embedding_path, map_location=device)
        except Exception as e:
            # An unreadable cache file (e.g. truncated by an interrupted save)
            # must not block the voice forever: fall through and recompute.
            print(f"Ignoring unreadable embedding cache {embedding_path}: {e}")
        else:
            speaker_embeddings_cache[wav_file_path] = embedding
            return embedding

    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Resample to 16000 Hz when the file uses a different rate.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # Average over the first dimension when it holds more than one row
        # (presumably the audio channels - i.e. a mono downmix).
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e
# --- Text processing helpers ---
# Somali words for the numbers that are looked up directly; every other
# supported number is composed from these by number_to_words.
number_words = {
    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}

# Scale words, largest first, used to split a number into "count scale rest".
_NUMBER_SCALES = ((1_000_000, "milyan"), (1000, "kun"), (100, "boqol"))


def number_to_words(n):
    """Spell out an integer in Somali.

    Values from 0 up to (but not including) one billion are spelled out;
    parts are joined with " iyo ". Anything outside that range - including
    negative numbers, which previously raised ``KeyError`` - is returned as
    its plain decimal string.

    Args:
        n: The integer to convert.

    Returns:
        The Somali wording, or ``str(n)`` when ``n`` is out of range.
    """
    if n in number_words:
        return number_words[n]
    if n < 0 or n >= 1_000_000_000:
        return str(n)
    if n < 100:
        tens, units = divmod(n, 10)
        tens_word = number_words[tens * 10]
        return f"{tens_word} iyo {number_words[units]}" if units else tens_word
    for scale, scale_word in _NUMBER_SCALES:
        if n >= scale:
            count, rest = divmod(n, scale)
            # A count of one is implied by the bare scale word ("boqol", "kun").
            head = f"{number_to_words(count)} {scale_word}" if count > 1 else scale_word
            return f"{head} iyo {number_to_words(rest)}" if rest else head
    return str(n)
def replace_numbers_with_words(text):
return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    The text is lowercased, digit runs are spelled out via
    ``replace_numbers_with_words``, and every character that is not a word
    character, whitespace or an apostrophe is removed.
    """
    lowered = text.lower()
    spelled_out = replace_numbers_with_words(lowered)
    # Strip everything except word characters, whitespace and apostrophes.
    return re.sub(r'[^\w\s\']', '', spelled_out)
# --- Main text-to-speech function ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for *text* using the selected reference voice.

    Args:
        text: The text to speak.
        voice_choice: Path of the voice sample whose speaker embedding is used.

    Returns:
        A ``(16000, waveform)`` tuple with the waveform as a NumPy array, or
        ``None`` (after showing a warning) when there is nothing to
        synthesize.

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding`` when the voice
            sample cannot be loaded.
    """
    if not text or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    normalized_text = normalize_text(text)
    # Input made only of whitespace or stripped punctuation leaves nothing to
    # speak; treat it like empty input instead of sending it to the model.
    if not normalized_text.strip():
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        speech = model.generate(
            input_ids=inputs["input_ids"],
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            # --- Voice quality settings (author's stated intent) ---
            # NOTE(review): these are text-generation sampling options;
            # SpeechT5's generate() may accept and ignore them - confirm
            # against the installed transformers version.
            do_sample=True,          # intended to make the voice sound natural
            top_k=50,                # intended to limit randomness
            temperature=0.75,        # intended range per author: 0.7 - 0.8
            repetition_penalty=1.2,  # intended to avoid repeated sounds
            max_new_tokens=512,      # intended to leave room to finish the utterance
        )
        # Kept inside no_grad so the result can be converted with .numpy().
        speech = vocoder(speech)

    return (16000, speech.cpu().numpy())
# --- Gradio interface ---
# Input components: the text to speak and the voice to speak it in.
_text_input = gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)")
# Preselect the first configured voice, if there is one.
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_voice_dropdown = gr.Dropdown(
    VOICE_SAMPLE_FILES,
    label="Select Voice",
    info="Dooro codka aad rabto inaad isticmaasho.",
    value=_default_voice,
)
# Output component: receives the (sample_rate, waveform) tuple.
_audio_output = gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input, _voice_dropdown],
    outputs=_audio_output,
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal.",
)
if __name__ == "__main__":
    # Refuse to start unless every configured voice sample exists.
    missing_samples = [path for path in VOICE_SAMPLE_FILES if not os.path.exists(path)]
    if missing_samples:
        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")

    print("Diyaarinta codadka...")
    # Warm the embedding cache for each voice before the UI starts.
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)