File size: 6,267 Bytes
fc5325e
 
b074847
fc5325e
4b84ef4
b074847
 
 
a968a0c
4b84ef4
 
0ff498c
b074847
4853a7f
80cd488
a968a0c
b074847
 
 
 
 
4b84ef4
b074847
 
 
 
 
 
 
 
 
 
4b84ef4
b074847
 
 
 
 
 
 
a968a0c
b074847
 
 
 
 
 
 
 
 
 
4853a7f
b074847
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4853a7f
b074847
4853a7f
4b84ef4
b074847
 
 
 
 
 
 
 
4b84ef4
b074847
 
 
 
 
 
 
 
4b84ef4
b074847
 
 
 
8a9cb69
4853a7f
b074847
 
 
 
4b84ef4
b074847
 
 
 
4b84ef4
b074847
 
4853a7f
b074847
4853a7f
 
b074847
4853a7f
 
 
b074847
4853a7f
 
 
8a9cb69
b074847
 
4853a7f
b074847
 
 
 
 
 
 
 
4853a7f
b074847
 
 
 
 
 
 
 
 
 
 
4b84ef4
b074847
 
 
 
 
4853a7f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run on the GPU when CUDA reports itself available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- ADD ALL YOUR VOICE FILES HERE ---
# Make sure these files are in the same folder as this script.
VOICE_SAMPLE_FILES = [
    "1.wav",
    "1005.wav",
    "1060.wav",
    "737.wav",
]

# Folder holding the cached speaker-embedding files; it is created at import
# time when it does not exist yet.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load Models ---
# Everything is loaded once, at import time, and then shared by every request.
# Any failure is re-raised as gr.Error with the original exception chained
# (`from e`) so the real cause stays visible in the traceback.
try:
    print("Loading models... This may take a moment.")
    # Text processor from the base SpeechT5 TTS checkpoint.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    # Text-to-speech model weights from the "Somalitts/8aad" checkpoint.
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    # HiFi-GAN vocoder applied to the model output to obtain the waveform.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    # SpeechBrain x-vector encoder used to compute speaker embeddings; its
    # files are stored under pretrained_models/spkrec-xvect-voxceleb.
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.") from e

# In-process cache: voice sample path -> speaker embedding tensor.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a voice sample, creating it if needed.

    Lookup order:
      1. ``speaker_embeddings_cache`` (in memory, keyed by ``wav_file_path``);
      2. ``<basename of wav_file_path>.pt`` inside ``EMBEDDING_DIR``;
      3. a fresh embedding computed from the audio with ``speaker_model``,
         which is then saved to that ``.pt`` file and cached in memory.

    Args:
        wav_file_path: Path of the reference audio file for the voice.

    Returns:
        The embedding tensor, moved to ``device``.

    Raises:
        gr.Error: If the audio file does not exist or cannot be processed.
    """
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    # NOTE: the on-disk cache is keyed by basename only, so two samples with
    # the same file name in different folders share one cache file.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")

    if os.path.exists(embedding_path):
        print(f"Loading existing embedding for {wav_file_path}")
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    print(f"Creating new speaker embedding for {wav_file_path}...")
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}.")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # The audio is brought to 16 kHz before being fed to the encoder.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # More than one row along dim 0 is averaged down to a single row
        # (assumes dim 0 is the channel axis - TODO confirm).
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            # Normalise along dim 2, then drop all singleton dimensions.
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()

        # Persist a CPU copy; keep and return the copy living on `device`.
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        print(f"Embedding created and saved for {wav_file_path}.")
        return embedding
    except Exception as e:
        # Chain the original exception so its traceback is not lost.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

# --- Text Processing Functions (Remains the same) ---
# Somali words for the numbers that have a dedicated spelling in this table.
number_words = {
    # units
    0: "eber",
    1: "kow",
    2: "labo",
    3: "saddex",
    4: "afar",
    5: "shan",
    6: "lix",
    7: "toddobo",
    8: "siddeed",
    9: "sagaal",
    10: "toban",
    # 11-19
    11: "kow iyo toban",
    12: "labo iyo toban",
    13: "saddex iyo toban",
    14: "afar iyo toban",
    15: "shan iyo toban",
    16: "lix iyo toban",
    17: "toddobo iyo toban",
    18: "siddeed iyo toban",
    19: "sagaal iyo toban",
    # tens
    20: "labaatan",
    30: "soddon",
    40: "afartan",
    50: "konton",
    60: "lixdan",
    70: "toddobaatan",
    80: "siddeetan",
    90: "sagaashan",
    # powers of ten
    100: "boqol",
    1000: "kun",
}


def number_to_words(n):
    """Spell out the integer ``n`` using the entries of ``number_words``.

    Numbers present in the table are returned directly. Other numbers below
    one million are assembled from a leading part (tens, hundreds or
    thousands) followed by " iyo " and the spelled-out remainder, when there
    is a remainder. A multiplier of one is left implicit ("boqol", "kun").
    Numbers of one million and above are returned as their decimal string.
    """
    known = number_words.get(n)
    if known is not None:
        return known

    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        if units:
            words += " iyo " + number_words[units]
        return words

    if n < 1000:
        hundreds, remainder = divmod(n, 100)
        if hundreds > 1:
            words = number_words[hundreds] + " boqol"
        else:
            words = "boqol"
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    if n < 1000000:
        thousands, remainder = divmod(n, 1000)
        if thousands > 1:
            words = number_to_words(thousands) + " kun"
        else:
            words = "kun"
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    return str(n)
def replace_numbers_with_words(text):
    """Return ``text`` with each standalone run of digits spelled out.

    Only digit runs delimited by word boundaries are replaced; digits glued
    to letters (e.g. "abc123") are left as they are. Each match is converted
    with ``int`` and passed to ``number_to_words``.
    """
    def _spell_out(match):
        return number_to_words(int(match.group()))

    digit_run = re.compile(r'\b\d+\b')
    return digit_run.sub(_spell_out, text)
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    Steps, in order: lower-case the text, spell out standalone numbers via
    ``replace_numbers_with_words``, then delete every character that is not
    a word character, whitespace, or an apostrophe.
    """
    spelled_out = replace_numbers_with_words(text.lower())
    return re.sub(r'[^\w\s\']', '', spelled_out)

# --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Synthesise speech for ``text`` in the voice of ``voice_choice``.

    Args:
        text: Text entered by the user.
        voice_choice: Voice sample path selected in the dropdown; it is
            passed to ``get_speaker_embedding``.

    Returns:
        ``(16000, waveform)`` where ``waveform`` is a NumPy array, or ``None``
        when the text is empty/blank or no voice is selected. A single value
        is returned because the interface has a single audio output; a
        ``(None, None)`` tuple would be read as (sample rate, data).
    """
    if not text or not text.strip():
        gr.Warning("Please enter some text.")
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)
    normalized_text = normalize_text(text)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        spectrogram = model.generate(
            input_ids=inputs["input_ids"],
            # A leading dimension is added to the embedding before the call.
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            # Parameters added with the intent of making the voice sound less
            # synthetic.
            # NOTE(review): confirm that this model's generate() actually
            # honours these sampling arguments.
            do_sample=True,
            top_k=50,
        )

        # The vocoder is applied as a separate step on the generated output.
        waveform = vocoder(spectrogram)

    return (16000, waveform.cpu().numpy())

# --- Gradio Interface (Remains the same) ---
# Resolve the voices used by the dropdown default and by the examples once,
# guarding every index so an empty VOICE_SAMPLE_FILES list does not raise
# IndexError while the module is being imported.
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_second_voice = VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else _default_voice

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
        gr.Dropdown(
            VOICE_SAMPLE_FILES,
            label="Select Voice",
            info="Choose the voice you want to use for the speech.",
            value=_default_voice
        )
    ],
    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
    # Examples need a voice to run with, so none are offered when no voice
    # files are configured.
    examples=[
        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", _default_voice],
        ["Nabad gelyo, is arag dambe.", _second_voice],
    ] if VOICE_SAMPLE_FILES else None
)

def main():
    """Pre-load the embedding of every configured voice, then launch the UI."""
    print("Pre-loading all voice embeddings...")
    # Computing (or loading) every embedding up front means the first request
    # for a voice does not have to do it.
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("All voices are ready. Launching interface.")

    iface.launch(share=True)


if __name__ == "__main__":
    main()