Spaces:

Somalitts
/

orph

Sleeping

File size: 6,267 Bytes

import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- ADD ALL YOUR VOICE FILES HERE ---
# Make sure these files are in the same folder as this code.
# The first entry is used as the default dropdown selection in the UI.
VOICE_SAMPLE_FILES = ["1.wav", "1005.wav", "1060.wav", "737.wav"]

# Directory to store speaker embedding files
# (created at import time if it does not exist yet).
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load Models ---
# Everything is loaded once, at import time, and shared by all requests:
#   processor     - SpeechT5 processor used to turn text into model inputs
#   model         - SpeechT5 text-to-speech model loaded from "Somalitts/8aad"
#   vocoder       - HiFi-GAN vocoder applied to the model output
#   speaker_model - SpeechBrain encoder used to compute speaker embeddings
try:
    print("Loading models... This may take a moment.")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    # Any failure while loading is re-raised as gr.Error, which aborts the
    # import of this module.
    # NOTE(review): the message blames the internet connection, but every
    # exception type lands here (e.g. a wrong model id) - confirm wording.
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.")

# In-memory cache mapping a voice sample path to its speaker embedding tensor.
speaker_embeddings_cache = {}

def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference voice recording.

    Lookup order: the in-memory ``speaker_embeddings_cache``, then a saved
    ``<basename>.pt`` file under ``EMBEDDING_DIR``, and finally a fresh
    embedding computed from the wav file with ``speaker_model``. A freshly
    computed embedding is saved to disk and cached in memory.

    Args:
        wav_file_path: Path to the reference wav file; also the cache key.

    Returns:
        The embedding tensor, placed on ``device``.

    Raises:
        gr.Error: If the wav file does not exist or cannot be processed.
    """
    cached = speaker_embeddings_cache.get(wav_file_path)
    if cached is not None:
        return cached

    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")

    if os.path.exists(embedding_path):
        print(f"Loading existing embedding for {wav_file_path}")
        try:
            embedding = torch.load(embedding_path, map_location=device)
        except Exception as e:
            # A truncated or otherwise unreadable file (e.g. from an
            # interrupted save) must not break this voice permanently:
            # fall through and rebuild the embedding from the wav file.
            print(f"Could not load saved embedding {embedding_path} ({e}); regenerating.")
        else:
            speaker_embeddings_cache[wav_file_path] = embedding
            return embedding

    print(f"Creating new speaker embedding for {wav_file_path}...")
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}.")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Bring the recording to 16 kHz mono before encoding it.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            # L2-normalize along the last axis, then drop singleton dims.
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()

        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        print(f"Embedding created and saved for {wav_file_path}.")
        return embedding
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

# --- Text Processing Functions (Remains the same) ---
# Somali words for the numbers that have a dedicated spelling; every other
# supported number is composed from these by number_to_words().
number_words = {
    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(n):
    """Spell out the integer ``n`` in Somali words.

    Values from 0 to 999,999 are spelled out. Anything outside that range
    (negative numbers, or one million and above) is returned unchanged as
    its decimal string.

    Args:
        n: The integer to convert.

    Returns:
        The Somali spelling of ``n``, or ``str(n)`` when ``n`` is out of range.
    """
    if n in number_words:
        return number_words[n]
    # Negative values previously fell into the "< 100" branch and raised
    # KeyError; treat them like any other unsupported value instead.
    if n < 0 or n >= 1000000:
        return str(n)

    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        return f"{words} iyo {number_words[units]}" if units else words

    # Hundreds and thousands share one shape: "<count> <unit> iyo <rest>",
    # where a count of exactly one is left implicit ("boqol", "kun").
    unit_value, unit_name = (100, "boqol") if n < 1000 else (1000, "kun")
    count, remainder = divmod(n, unit_value)
    head = f"{number_to_words(count)} {unit_name}" if count > 1 else unit_name
    return f"{head} iyo {number_to_words(remainder)}" if remainder else head
def replace_numbers_with_words(text):
    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    The text is lower-cased, its digit runs are spelled out via
    ``replace_numbers_with_words``, and every character that is not a word
    character, whitespace, or an apostrophe is removed.
    """
    lowered = text.lower()
    spelled_out = replace_numbers_with_words(lowered)
    # Strip punctuation and symbols; removed characters are not replaced.
    return re.sub(r'[^\w\s\']', '', spelled_out)

# --- Main Text-to-Speech Function (a quality adjustment was added here) ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for ``text`` using the selected reference voice.

    Args:
        text: Raw text entered in the UI textbox.
        voice_choice: Path of the reference wav file chosen in the dropdown.

    Returns:
        A ``(16000, waveform)`` tuple with the waveform as a NumPy array, or
        ``None`` when there is nothing to synthesize. This function feeds a
        single Audio output, so ``None`` (not a tuple of Nones) is what
        leaves that output empty.

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding`` when the voice
            sample is missing or cannot be processed.
    """
    if not text or not text.strip():
        gr.Warning("Please enter some text.")
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    normalized_text = normalize_text(text)
    if not normalized_text.strip():
        # Input consisting only of punctuation/symbols normalizes to nothing.
        gr.Warning("Please enter some text.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        # The stored embedding is a 1-D vector; add the batch dimension.
        # No do_sample/top_k are passed: they are not among the documented
        # parameters of SpeechT5's generate(), which has no token-sampling step.
        spectrogram = model.generate(
            input_ids=inputs["input_ids"],
            speaker_embeddings=speaker_embedding.unsqueeze(0),
        )

        # The vocoder is applied as a separate step to obtain the waveform.
        speech = vocoder(spectrogram)

    return (16000, speech.cpu().numpy())

# --- Gradio Interface (Remains the same) ---
# Resolve the voices used for the dropdown default and the examples once.
# Everything falls back to None when VOICE_SAMPLE_FILES is empty, so building
# the UI no longer raises IndexError in that case (the dropdown default was
# already guarded this way; the examples were not).
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_second_voice = VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else _default_voice
_examples = (
    [
        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", _default_voice],
        ["Nabad gelyo, is arag dambe.", _second_voice],
    ]
    if VOICE_SAMPLE_FILES
    else None
)

# Single-page UI: a textbox and a voice dropdown feed text_to_speech, whose
# result is shown in one audio player.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
        gr.Dropdown(
            VOICE_SAMPLE_FILES,
            label="Select Voice",
            info="Choose the voice you want to use for the speech.",
            value=_default_voice,
        ),
    ],
    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
    examples=_examples,
)

if __name__ == "__main__":
    # Compute (or load) the embedding of every configured voice up front so
    # the first request for each voice does not pay that cost.
    print("Pre-loading all voice embeddings...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("All voices are ready. Launching interface.")

    # Start the Gradio server, requesting a public share link.
    iface.launch(share=True)