File size: 3,548 Bytes
43ec985
b7217f7
 
43ec985
 
e365862
43ec985
 
e365862
2342a7b
b7217f7
 
43ec985
 
 
f47a2b0
43ec985
 
 
2342a7b
43ec985
2342a7b
43ec985
 
 
 
2342a7b
 
 
43ec985
2342a7b
43ec985
2342a7b
43ec985
2342a7b
43ec985
2342a7b
43ec985
 
 
 
2342a7b
43ec985
 
 
2342a7b
43ec985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2342a7b
43ec985
 
2860b2a
43ec985
b7217f7
43ec985
 
b7217f7
 
43ec985
2342a7b
 
43ec985
 
 
 
2342a7b
43ec985
 
 
 
2342a7b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- MAKE SURE YOU HAVE UPLOADED THESE FILES ---
# These files must be present in the Hugging Face Space, in the same folder as "app.py".
VOICE_SAMPLE_FILES = ["1.wav"]

# Directory where computed speaker embeddings are stored as files.
# It is created at import time if it does not exist yet.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load Models ---
# All models are loaded once, at import time, and moved to `device`:
#   processor     - SpeechT5 processor from the "microsoft/speecht5_tts" checkpoint
#   model         - SpeechT5 text-to-speech model from the "Somalitts/8aad" checkpoint
#   vocoder       - SpeechT5 HiFi-GAN from "microsoft/speecht5_hifigan"
#   speaker_model - SpeechBrain x-vector encoder used to compute speaker embeddings
try:
    print("Loading models... This may take a moment.")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb"),
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain the original exception explicitly so the real cause (which may not
    # be a network problem at all) stays attached to the traceback.
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.") from e

# In-memory cache: audio file path (as passed by the caller) -> embedding tensor.
speaker_embeddings_cache = {}

def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference recording.

    The embedding is looked up in this order:

    1. the in-memory ``speaker_embeddings_cache`` (keyed by the path as given);
    2. a ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
    3. a fresh computation from the audio file using ``speaker_model``.

    A freshly computed embedding is saved to ``EMBEDDING_DIR`` as a CPU tensor
    and stored in the in-memory cache before being returned.

    Args:
        wav_file_path: Path to the reference audio file.

    Returns:
        The embedding tensor, mapped to ``device``. When freshly computed it
        is normalized along dim 2 and squeezed.

    Raises:
        FileNotFoundError: If no saved embedding exists and the audio file
            itself is missing.
        gr.Error: If loading, encoding, or saving the embedding fails.
    """
    # 1) Already resolved earlier in this process.
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    # 2) Saved on disk. NOTE(review): the file name is derived from the
    # basename only, so two audio files with the same name in different
    # directories would share one embedding file.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    # 3) Compute from the audio file.
    if not os.path.exists(wav_file_path):
        # This is the error that occurred. Once the voice files have been
        # uploaded, execution gets past this point.
        raise FileNotFoundError(f"Lama helin faylka codka: {wav_file_path}")
    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Resample to 16 kHz when the file uses a different rate.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # Collapse multi-channel audio to one channel by averaging over dim 0.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Save a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding
    except Exception as e:
        # Keep the underlying exception chained for debugging.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

# ... (The rest of the code is correct) ...

# --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Placeholder for the main text-to-speech function.

    The body is unchanged from the previous version of the app and is not
    reproduced in this file; insert the rest of the code here. As written,
    the function ignores both arguments and returns ``None``.
    """
    return None

# --- Gradio Interface ---
iface = gr.Interface(
    # NOTE(review): the original argument list was elided here and replaced by
    # a bare `pass`, which is a SyntaxError inside a call and prevents the
    # whole file from being parsed. The arguments below are a minimal
    # reconstruction based on text_to_speech(text, voice_choice); confirm the
    # component types and labels against the real app before relying on them.
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(
            choices=VOICE_SAMPLE_FILES,
            value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None,
            label="Voice",
        ),
    ],
    # Presumably text_to_speech returns audio - TODO confirm the output component.
    outputs=gr.Audio(label="Speech"),
)

# --- Launch the web interface ---
if __name__ == "__main__":
    # Step 1: verify that every configured voice sample exists before doing
    # any work; stop at the first one that is missing.
    print("Hubinta faylasha codadka...")
    missing = next(
        (path for path in VOICE_SAMPLE_FILES if not os.path.exists(path)),
        None,
    )
    if missing is not None:
        # This is where the error originates.
        raise FileNotFoundError(f"Mid ka mid ah faylasha lama helin: '{missing}'. Fadlan hubi inaad soo gelisay Hugging Face Spaces.")

    # Step 2: resolve the speaker embedding of every voice up front, so the
    # in-memory cache is populated before the interface starts.
    print("Diyaarinta astaamaha codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan codadka waa diyaar. Waxaa la furayaa interface-ka.")

    # Step 3: start the Gradio app.
    iface.launch(share=True)