# Hugging Face Spaces: Somalitts / 8aad (status: Running; file size: 6,902 bytes)

import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Use the GPU whenever torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Reference recordings offered as voices; make sure each file is of good quality.
VOICE_SAMPLE_FILES = ["1.wav"]
# Directory where computed speaker embeddings are stored as .pt files.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load models ---
# Runs once at import time. The resulting module-level objects are shared by
# every request:
#   processor     - SpeechT5Processor from "microsoft/speecht5_tts"
#   model         - SpeechT5ForTextToSpeech from "Somalitts/8aad"
#   vocoder       - SpeechT5HifiGan from "microsoft/speecht5_hifigan"
#   speaker_model - SpeechBrain EncoderClassifier used to compute speaker embeddings
# Any failure is re-raised as gr.Error with the original message embedded.
try:
    print("Loading models...")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain explicitly so the traceback marks the loader failure as the
    # direct cause instead of an unrelated error raised during handling.
    raise gr.Error(f"Error loading models: {e}.") from e

# In-memory cache: reference recording path -> speaker embedding tensor.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding tensor for a reference recording.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
      3. computing it from the audio with ``speaker_model`` (the result is
         then saved to disk and cached in memory).

    Args:
        wav_file_path: Path of the reference recording. Only its basename is
            used for the on-disk cache file name.

    Returns:
        The embedding tensor, loaded onto / moved to ``device``.

    Raises:
        gr.Error: If the recording does not exist, or if loading, encoding or
            saving fails (the underlying exception is chained as the cause).
    """
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # The encoder is fed 16 kHz audio (presumably the rate it was trained on).
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # More than one row in dim 0: average the rows into a single one
        # (assumes torchaudio's (channels, frames) layout, i.e. a mono down-mix).
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        # Move once and reuse the same tensor for the cache and the return value.
        embedding = embedding.to(device)
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

    speaker_embeddings_cache[wav_file_path] = embedding
    return embedding

# Somali number words that are looked up directly instead of being composed.
number_words = {
    # units
    0: "eber",
    1: "kow",
    2: "labo",
    3: "saddex",
    4: "afar",
    5: "shan",
    6: "lix",
    7: "toddobo",
    8: "siddeed",
    9: "sagaal",
    10: "toban",
    # 11-19
    11: "kow iyo toban",
    12: "labo iyo toban",
    13: "saddex iyo toban",
    14: "afar iyo toban",
    15: "shan iyo toban",
    16: "lix iyo toban",
    17: "toddobo iyo toban",
    18: "siddeed iyo toban",
    19: "sagaal iyo toban",
    # tens
    20: "labaatan",
    30: "soddon",
    40: "afartan",
    50: "konton",
    60: "lixdan",
    70: "toddobaatan",
    80: "siddeetan",
    90: "sagaashan",
    # scale words
    100: "boqol",
    1000: "kun",
}


def number_to_words(n):
    """Spell out the integer ``n`` in Somali.

    Values found in ``number_words`` are returned as-is. Other values below
    one billion are composed as ``<count> <scale> iyo <remainder>``, where the
    count is left out when it is 1. Anything from one billion upwards is
    returned as its decimal string.
    """
    direct = number_words.get(n)
    if direct is not None:
        return direct

    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        if units:
            words += " iyo " + number_words[units]
        return words

    # (exclusive upper bound, scale value, scale word), smallest scale first.
    scales = (
        (1000, 100, "boqol"),
        (1_000_000, 1000, "kun"),
        (1_000_000_000, 1_000_000, "milyan"),
    )
    for upper_bound, scale, scale_word in scales:
        if n >= upper_bound:
            continue
        count, remainder = divmod(n, scale)
        if count > 1:
            words = number_to_words(count) + " " + scale_word
        else:
            words = scale_word
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    return str(n)
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its Somali words.

    Each match of ``\\b\\d+\\b`` is converted with ``int`` and passed to
    ``number_to_words``; everything else is left untouched.
    """
    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)


def normalize_text(text):
    """Prepare ``text`` for the TTS processor.

    Steps: lowercase, spell out digit runs, then drop every character that is
    not a word character, whitespace or an apostrophe.

    Punctuation that sits directly between two word characters (``a,b``,
    ``af-soomaali``, the dot left over from ``3.5``) is turned into a single
    space rather than deleted; deleting it would glue the two neighbours into
    one token. All other punctuation is removed as before.
    """
    text = text.lower()
    text = replace_numbers_with_words(text)
    # Keep neighbours apart where punctuation was the only separator.
    text = re.sub(r"(?<=\w)[^\w\s']+(?=\w)", ' ', text)
    text = re.sub(r'[^\w\s\']', '', text)
    return text

# Sentence boundary: whitespace that directly follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def split_into_sentences(text):
    """Split ``text`` into sentences.

    The text is cut at whitespace that follows ``.``, ``!`` or ``?``; the
    punctuation mark stays attached to its sentence. Pieces are stripped and
    empty pieces are dropped. Text without such a boundary comes back as a
    single item.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty, stripped sentence strings.
    """
    if not text:
        return []
    # A second pass with the same boundary rule would produce the same
    # result, so a single split is all that is needed.
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]

def text_to_speech(text, voice_choice):
    """Synthesise speech for ``text`` using the selected reference voice.

    The text is split into sentences, each sentence is normalised and
    synthesised on its own, and the pieces are joined with half a second of
    silence between consecutive sentences.

    Args:
        text: The text to speak.
        voice_choice: Path of the reference recording whose speaker embedding
            is used (passed to ``get_speaker_embedding``).

    Returns:
        ``(16000, waveform)`` where ``waveform`` is a 1-D numpy array, or
        ``None`` (after a ``gr.Warning``) when there is nothing to speak.

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding``.
    """
    if not text or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    sample_rate = 16000
    pause_samples = int(sample_rate * 0.5)  # 0.5 s of silence between sentences

    speaker_embedding = get_speaker_embedding(voice_choice)

    # Normalise first and drop sentences that end up empty (e.g. "!!!"), so
    # the model is never run on an empty string.
    normalized_sentences = [normalize_text(sentence) for sentence in split_into_sentences(text)]
    normalized_sentences = [sentence for sentence in normalized_sentences if sentence.strip()]
    if not normalized_sentences:
        # Whitespace- or punctuation-only input: torch.cat would fail on an
        # empty list, so report it the same way as missing input.
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    segments = []
    for normalized_text in normalized_sentences:
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
        with torch.no_grad():
            # NOTE(review): the sampling arguments below look like they are
            # only absorbed by generate()'s **kwargs for SpeechT5 - confirm
            # against the installed transformers version.
            speech = model.generate(
                input_ids=inputs["input_ids"],
                speaker_embeddings=speaker_embedding.unsqueeze(0),
                do_sample=True,
                top_k=50,
                temperature=0.75,
                repetition_penalty=1.2,
                max_new_tokens=512
            )
            audio = vocoder(speech).cpu()

        # Flatten so the code works whether the vocoder hands back
        # (samples,) or (1, samples); every segment is then 1-D.
        audio = audio.reshape(-1)
        if segments:
            # Pause before every sentence except the first, i.e. between sentences.
            segments.append(audio.new_zeros(pause_samples))
        segments.append(audio)

    # 1-D (samples,) is the mono layout expected for a numpy audio output.
    final_audio = torch.cat(segments, dim=0)
    return (sample_rate, final_audio.numpy())

# Build the widgets first (text box, voice picker, audio player), then wire
# them to text_to_speech.
_text_box = gr.Textbox(
    label="Geli qoraalka af-Soomaaliga (Enter Somali Text)",
    lines=7,
    placeholder="Qoraalka geli halkan...",
)

# Preselect the first configured voice when there is one.
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_voice_picker = gr.Dropdown(
    VOICE_SAMPLE_FILES,
    label="Select Voice",
    info="Dooro codka aad rabto inaad isticmaasho.",
    value=_default_voice,
)

_audio_player = gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_box, _voice_picker],
    outputs=_audio_player,
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal.",
)

if __name__ == "__main__":
    # Refuse to start unless every configured voice recording is present.
    for voice_file in VOICE_SAMPLE_FILES:
        if not os.path.exists(voice_file):
            raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")

    # Compute (or load) every speaker embedding up front so the first
    # request does not have to.
    print("Diyaarinta codadka...")
    for voice_file in VOICE_SAMPLE_FILES:
        get_speaker_embedding(voice_file)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)