Spaces:

Somalitts
/

8aad

Running

File size: 3,548 Bytes

import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run on the GPU when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- MAKE SURE YOU HAVE UPLOADED THESE FILES ---
# These files must be in the Hugging Face Space, in the same folder as "app.py".
VOICE_SAMPLE_FILES = [
    "1.wav",
]

# Folder that holds the saved speaker embedding files; created here if absent.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load Models ---
# Everything is loaded once, at module import time:
#   * processor     - SpeechT5Processor from "microsoft/speecht5_tts"
#   * model         - SpeechT5ForTextToSpeech from "Somalitts/8aad", moved to `device`
#   * vocoder       - SpeechT5HifiGan from "microsoft/speecht5_hifigan", moved to `device`
#   * speaker_model - speechbrain EncoderClassifier from
#                     "speechbrain/spkrec-xvect-voxceleb", stored under
#                     pretrained_models/spkrec-xvect-voxceleb
try:
    print("Loading models... This may take a moment.")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain the original exception so the traceback reports it as the direct
    # cause instead of "during handling of the above exception".
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.") from e

# In-memory cache: wav file path -> speaker embedding.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for ``wav_file_path``, computing it at most once.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
      3. a fresh computation from the audio file with ``speaker_model``; the
         result is saved to ``EMBEDDING_DIR`` and stored in the in-memory cache.

    Args:
        wav_file_path: Path of the voice sample to embed.

    Returns:
        The embedding, loaded onto / moved to ``device``.

    Raises:
        FileNotFoundError: No cached embedding exists and the audio file is missing.
        gr.Error: Loading, resampling, encoding or saving the embedding failed.
    """
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    # NOTE: the on-disk cache is keyed by basename only, so two samples with the
    # same file name in different folders would share one embedding file.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    if not os.path.exists(wav_file_path):
        # This is the error that was being hit. Once the files are uploaded,
        # execution gets past this point.
        raise FileNotFoundError(f"Lama helin faylka codka: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Bring every sample to 16 kHz before encoding
        # (presumably the rate the speaker encoder expects - confirm).
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # More than one row in dim 0: average them into a single row
        # (assumes dim 0 is the channel axis, as returned by torchaudio.load).
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        # Move once and reuse the same tensor for the cache and the return value.
        embedding = embedding.to(device)
    except Exception as e:
        # Chain the original exception so its traceback is kept as the cause.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

    speaker_embeddings_cache[wav_file_path] = embedding
    return embedding

# ... (the rest of the code is correct as it is) ...

# --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Placeholder for the text-to-speech entry point.

    The real implementation is elided in this listing ("same as before");
    as written, the function ignores both arguments and returns None.

    Args:
        text: Not used by this stub.
        voice_choice: Not used by this stub.

    Returns:
        None.
    """
    # ... (same as before) ...
    # Put the rest of the code here.
    return None

# --- Gradio Interface ---
# NOTE(review): the original arguments were elided ("same as before") and a bare
# `pass` statement was left inside the call, which is a SyntaxError and prevents
# the whole module from being parsed. The arguments below are a minimal valid
# wiring for text_to_speech(text, voice_choice); replace them with the original
# components (labels, title, examples, ...) when restoring the full code.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(choices=VOICE_SAMPLE_FILES, label="Voice"),
    ],
    # Assumes text_to_speech returns audio in a form gr.Audio accepts - confirm.
    outputs=gr.Audio(label="Speech"),
)

# --- Launch the web interface ---
if __name__ == "__main__":
    # Step 1: verify that every configured voice sample is present; stop at the
    # first one that is missing.
    print("Hubinta faylasha codadka...")
    for f in VOICE_SAMPLE_FILES:
        if os.path.exists(f):
            continue
        # The error originates here.
        raise FileNotFoundError(f"Mid ka mid ah faylasha lama helin: '{f}'. Fadlan hubi inaad soo gelisay Hugging Face Spaces.")

    # Step 2: compute (or load) each speaker embedding before the UI starts.
    print("Diyaarinta astaamaha codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan codadka waa diyaar. Waxaa la furayaa interface-ka.")

    # Step 3: start the Gradio app with a public share link requested.
    iface.launch(share=True)