# Source: Hugging Face Space "Somalitts/8aad" (status: Running).
# File size: 6,341 bytes.

import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run inference on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# --- Add your voice sample files here ---
# The quality of these recordings matters most for the quality of the result.
VOICE_SAMPLE_FILES = ["1.wav"]  # Make sure this file is a good-quality recording.

# Directory where computed speaker embeddings are stored; created up front.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Loading the models ---
def _load_models():
    """Instantiate the text processor, TTS model, vocoder and speaker encoder.

    Returns:
        A ``(processor, model, vocoder, speaker_model)`` tuple. The TTS model
        and the vocoder are moved to ``device``; the speaker encoder is told
        to run on ``device`` through its ``run_opts``.
    """
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    xvector_encoder = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    return tts_processor, tts_model, hifigan, xvector_encoder


try:
    print("Loading models...")
    processor, model, vocoder, speaker_model = _load_models()
    print("Models loaded successfully.")
except Exception as e:
    # Any loading failure is re-raised as a gr.Error carrying the original message.
    raise gr.Error(f"Error loading models: {e}.")

# In-memory cache of speaker embeddings, keyed by the path of the reference WAV file.
speaker_embeddings_cache = {}

def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference recording.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
      3. a fresh computation from the WAV file, which is then written to
         ``EMBEDDING_DIR`` and stored in the in-memory cache.

    Args:
        wav_file_path: Path of the reference WAV file; also the cache key.

    Returns:
        The embedding tensor, placed on ``device``.

    Raises:
        gr.Error: If the WAV file does not exist (and no stored embedding
            was found), or if loading/encoding the audio fails.
    """
    # 1. Fast path: already resolved during this process.
    try:
        return speaker_embeddings_cache[wav_file_path]
    except KeyError:
        pass

    # 2. Embedding stored on disk; the file name is derived from the WAV basename.
    file_name = os.path.basename(wav_file_path)
    embedding_path = os.path.join(EMBEDDING_DIR, f"{file_name}.pt")
    if os.path.exists(embedding_path):
        stored = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = stored
        return stored

    # 3. Compute from the recording itself.
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")
    try:
        waveform, sample_rate = torchaudio.load(wav_file_path)
        # The audio is brought to 16 kHz before it is handed to the speaker encoder.
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        # More than one row in the first dimension: average them into a single row.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        with torch.no_grad():
            encoded = speaker_model.encode_batch(waveform.to(device))
            embedding = torch.nn.functional.normalize(encoded, dim=2).squeeze()
        # Store a CPU copy on disk; keep the device copy in memory.
        torch.save(embedding.cpu(), embedding_path)
        on_device = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = on_device
        return on_device
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")

# --- Text Processing Functions (unchanged) ---
_UNIT_WORDS = (
    "eber", "kow", "labo", "saddex", "afar",
    "shan", "lix", "toddobo", "siddeed", "sagaal",
)
_TENS_WORDS = (
    "labaatan", "soddon", "afartan", "konton",
    "lixdan", "toddobaatan", "siddeetan", "sagaashan",
)

# Lookup table of numbers that have a fixed wording: 0-19, the round tens,
# 100 and 1000.
number_words = dict(enumerate(_UNIT_WORDS))
number_words[10] = "toban"
# 11-19 are worded "<unit> iyo toban".
number_words.update(
    {10 + unit: f"{_UNIT_WORDS[unit]} iyo toban" for unit in range(1, 10)}
)
number_words.update(
    {10 * tens: word for tens, word in enumerate(_TENS_WORDS, start=2)}
)
number_words.update({100: "boqol", 1000: "kun"})

# (exclusive upper bound, unit size, unit word) for hundreds, thousands, millions.
_NUMBER_SCALES = (
    (1_000, 100, "boqol"),
    (1_000_000, 1_000, "kun"),
    (1_000_000_000, 1_000_000, "milyan"),
)


def number_to_words(n):
    """Spell out the integer *n* using the words in ``number_words``.

    Numbers listed in ``number_words`` are returned directly. Other numbers
    are composed as ``<larger part> iyo <remainder>``; a multiplier of one is
    left out (``"boqol"``, ``"kun"``, ``"milyan"``). Numbers of one billion
    or more are returned as their decimal digits.

    Args:
        n: The number to spell out. Only non-negative integers are
            supported by the lookup table.

    Returns:
        The spelled-out number as a string.
    """
    if n in number_words:
        return number_words[n]

    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        if units:
            words += " iyo " + number_words[units]
        return words

    for limit, unit_size, unit_word in _NUMBER_SCALES:
        if n >= limit:
            continue
        count, remainder = divmod(n, unit_size)
        # A single hundred/thousand/million is worded without a multiplier.
        words = f"{number_to_words(count)} {unit_word}" if count > 1 else unit_word
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    # Beyond the supported range: fall back to the digits.
    return str(n)

def replace_numbers_with_words(text):
    """Replace each run of digits bounded by word boundaries with its words.

    Args:
        text: The input string.

    Returns:
        *text* with every ``\\b\\d+\\b`` match replaced by the result of
        ``number_to_words`` for that number.
    """
    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)


def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Steps: lowercase, spell out numbers, then drop every character that is
    not a word character, whitespace or an apostrophe.

    Punctuation that sits directly between two word characters is the only
    thing separating them ("kow,labo", "af-soomaali", or "3.5" once the
    digits are spelled out), so it is replaced by a space instead of being
    deleted; otherwise the two words would be fused into one.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = replace_numbers_with_words(text)
    # Keep a word break where punctuation was the sole separator.
    text = re.sub(r"(?<=\w)[^\w\s']+(?=\w)", " ", text)
    # All remaining punctuation already has whitespace or a string edge beside it.
    text = re.sub(r'[^\w\s\']', '', text)
    return text

# --- Main Text-to-Speech Function (improved) ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for *text* in the voice of the chosen recording.

    Args:
        text: The text to speak; it is passed through ``normalize_text``
            before tokenization.
        voice_choice: Path of the reference recording, handed to
            ``get_speaker_embedding``.

    Returns:
        A ``(16000, waveform)`` tuple where ``waveform`` is a NumPy array, or
        ``None`` (after showing a warning) when there is no text to speak or
        no voice was chosen.

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding``.
    """
    # Whitespace-only input counts as missing text.
    if not text or not text.strip() or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    normalized_text = normalize_text(text)
    # Input made only of punctuation normalizes to nothing; there is nothing
    # to synthesize, so treat it like missing text instead of calling the model.
    if not normalized_text.strip():
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        speech = model.generate(
            input_ids=inputs["input_ids"],
            # Add a leading batch dimension to the stored embedding.
            speaker_embeddings=speaker_embedding.unsqueeze(0),

            # --- Voice quality parameters ---
            # NOTE(review): these are text-generation sampling options; confirm
            # that SpeechT5ForTextToSpeech.generate actually honours them
            # rather than ignoring them.
            do_sample=True,          # Intended to make the voice sound natural
            top_k=50,                # Intended to limit variation so the output does not go wrong
            temperature=0.75,        # Intended to control variation (0.7 - 0.8 suggested)
            repetition_penalty=1.2,  # Intended to prevent repeating the same sound
            max_new_tokens=512       # Intended to give the model enough room to finish the utterance
        )
        # Turn the model output into an audio waveform.
        speech = vocoder(speech)

    return (16000, speech.cpu().numpy())

# --- Gradio Interface (unchanged) ---
# Input 1: free-text box for the Somali text.
_text_input = gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)")

# Input 2: voice selector, preselecting the first configured recording if any.
if VOICE_SAMPLE_FILES:
    _default_voice = VOICE_SAMPLE_FILES[0]
else:
    _default_voice = None
_voice_dropdown = gr.Dropdown(
    VOICE_SAMPLE_FILES,
    label="Select Voice",
    info="Dooro codka aad rabto inaad isticmaasho.",
    value=_default_voice
)

# Output: audio player fed with the tuple returned by text_to_speech.
_audio_output = gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input, _voice_dropdown],
    outputs=_audio_output,
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
)

if __name__ == "__main__":
    # Refuse to start unless every configured reference recording is present.
    if any(not os.path.exists(sample_path) for sample_path in VOICE_SAMPLE_FILES):
        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")

    # Resolve every voice's embedding before the UI opens, so the first
    # request for each voice is served from the cache.
    print("Diyaarinta codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)