File size: 6,902 Bytes
43ec985
b7217f7
 
43ec985
 
e365862
43ec985
 
e365862
2342a7b
b7217f7
e1c9728
b7217f7
f685632
43ec985
 
2342a7b
5a3bbd1
2342a7b
e1c9728
43ec985
 
 
2342a7b
 
 
43ec985
2342a7b
43ec985
2342a7b
e1c9728
2342a7b
43ec985
2342a7b
43ec985
 
 
 
2342a7b
43ec985
 
 
2342a7b
e1c9728
43ec985
 
1229011
 
 
 
43ec985
 
 
 
 
 
 
 
 
5a3bbd1
2e7b63f
 
 
 
 
 
 
 
 
 
 
4aa5331
 
 
 
 
 
 
 
 
 
 
 
 
2e7b63f
 
 
 
 
 
f685632
2e7b63f
43ec985
5a3bbd1
 
 
 
 
 
 
 
 
 
 
 
 
2342a7b
100e7c2
 
 
2e7b63f
100e7c2
 
5a3bbd1
f685632
 
5a3bbd1
 
f685632
1229011
 
 
 
 
 
 
 
 
 
f685632
 
 
5a3bbd1
 
 
 
f685632
 
 
2860b2a
b7217f7
2e7b63f
 
f685632
2e7b63f
 
 
100e7c2
2e7b63f
 
 
 
 
100e7c2
b7217f7
 
2342a7b
100e7c2
 
1229011
100e7c2
43ec985
 
100e7c2
1229011
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np

# --- Configuration ---
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Reference recordings offered as voices in the UI dropdown and checked at startup.
VOICE_SAMPLE_FILES = ["1.wav"]  # Make sure this file is of good quality
# Directory where computed speaker embeddings are stored as ``<basename>.pt`` files.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load models ---
# Everything is loaded once at import time: the text processor, the TTS model,
# the vocoder that turns the model output into audio, and the speaker encoder
# used to compute voice embeddings. Any failure is re-raised as a gr.Error.
try:
    print("Loading models...")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    raise gr.Error(f"Error loading models: {e}.")

# In-memory cache: reference WAV path -> speaker embedding tensor.
speaker_embeddings_cache = {}

def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference recording.

    The lookup order is: the in-memory ``speaker_embeddings_cache``, then a
    saved ``<EMBEDDING_DIR>/<basename>.pt`` file, and finally a fresh
    computation with ``speaker_model``. A freshly computed embedding is saved
    to disk (as a CPU tensor) and cached in memory.

    Note that the on-disk file name is derived from the basename only, so two
    recordings with the same file name in different directories share one
    saved embedding.

    Args:
        wav_file_path: Path to the reference audio file.

    Returns:
        The embedding tensor, placed on ``device``.

    Raises:
        gr.Error: If the audio file does not exist or cannot be processed.
    """
    cached = speaker_embeddings_cache.get(wav_file_path)
    if cached is not None:
        return cached

    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # The encoder is fed 16 kHz audio, so resample anything else.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # Collapse multi-channel recordings by averaging over dim 0.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

# Somali number spelling: values that are looked up directly instead of composed.
number_words = {
    0: "eber",
    1: "kow",
    2: "labo",
    3: "saddex",
    4: "afar",
    5: "shan",
    6: "lix",
    7: "toddobo",
    8: "siddeed",
    9: "sagaal",
    10: "toban",
    11: "kow iyo toban",
    12: "labo iyo toban",
    13: "saddex iyo toban",
    14: "afar iyo toban",
    15: "shan iyo toban",
    16: "lix iyo toban",
    17: "toddobo iyo toban",
    18: "siddeed iyo toban",
    19: "sagaal iyo toban",
    20: "labaatan",
    30: "soddon",
    40: "afartan",
    50: "konton",
    60: "lixdan",
    70: "toddobaatan",
    80: "siddeetan",
    90: "sagaashan",
    100: "boqol",
    1000: "kun",
}


def number_to_words(n):
    """Spell out the integer ``n`` in Somali words.

    Values present in ``number_words`` are returned as-is. Other values below
    one billion are composed from a tens word or from "boqol" (hundreds),
    "kun" (thousands) and "milyan" (millions) groups, with the parts joined
    by " iyo ". A group count of one is written as the bare unit word.
    Anything from one billion upwards is returned as its decimal string.
    """
    direct = number_words.get(n)
    if direct is not None:
        return direct

    if n < 100:
        tens, units = divmod(n, 10)
        spelled = number_words[tens * 10]
        if units:
            spelled += " iyo " + number_words[units]
        return spelled

    # (exclusive upper bound, size of one unit, word for the unit)
    scales = (
        (1000, 100, "boqol"),
        (1_000_000, 1000, "kun"),
        (1_000_000_000, 1_000_000, "milyan"),
    )
    for upper_bound, unit_size, unit_word in scales:
        if n < upper_bound:
            count, remainder = divmod(n, unit_size)
            spelled = f"{number_to_words(count)} {unit_word}" if count > 1 else unit_word
            if remainder:
                spelled += " iyo " + number_to_words(remainder)
            return spelled

    return str(n)
def replace_numbers_with_words(text):
    """Replace each digit run delimited by word boundaries with its spelled-out form."""
    def spell_match(match):
        # Convert the matched digits to an int and spell it out.
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', spell_match, text)
def normalize_text(text):
    """Lower-case ``text``, spell out its numbers, and strip punctuation.

    After the final filter only word characters, whitespace and apostrophes
    remain; every other character is removed without a replacement.
    """
    spelled = replace_numbers_with_words(text.lower())
    return re.sub(r'[^\w\s\']', '', spelled)

# Sentence splitting.
# A boundary is any whitespace run that directly follows '.', '!' or '?';
# the punctuation mark stays attached to the sentence it ends.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def split_into_sentences(text):
    """Split ``text`` into sentences at whitespace following '.', '!' or '?'.

    Text without any such boundary is returned as a single sentence. The
    previous implementation re-split that case with an equivalent pattern,
    which could never produce a different result, so that step is gone.

    Args:
        text: The raw input string.

    Returns:
        A list of non-empty sentences with surrounding whitespace removed;
        an empty list when ``text`` is empty or only whitespace.
    """
    stripped_parts = (part.strip() for part in _SENTENCE_BOUNDARY.split(text))
    return [part for part in stripped_parts if part]

def text_to_speech(text, voice_choice):
    """Synthesize speech for ``text`` in the selected voice.

    The text is split into sentences, each sentence is normalized and
    synthesized separately, and the resulting clips are joined with half a
    second of silence between them.

    Args:
        text: The text to speak.
        voice_choice: Path of the reference recording whose speaker embedding
            is used for the voice.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D NumPy
        array, or ``None`` (after showing a warning) when there is nothing to
        synthesize.

    Raises:
        gr.Error: Propagated from ``get_speaker_embedding`` when the voice
            sample cannot be loaded.
    """
    if not text or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    sample_rate = 16000
    # The embedding is the same for every sentence, so add the batch dim once.
    speaker_batch = get_speaker_embedding(voice_choice).unsqueeze(0)

    clips = []
    for sentence in split_into_sentences(text):
        normalized_text = normalize_text(sentence)
        # A sentence made only of punctuation normalizes to nothing; skip it
        # rather than feeding an empty string to the model.
        if not normalized_text.strip():
            continue
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
        with torch.no_grad():
            # NOTE(review): the sampling-style arguments below look like text
            # generation options; confirm that SpeechT5's generate() actually
            # uses them rather than ignoring them.
            speech = model.generate(
                input_ids=inputs["input_ids"],
                speaker_embeddings=speaker_batch,
                do_sample=True,
                top_k=50,
                temperature=0.75,
                repetition_penalty=1.2,
                max_new_tokens=512
            )
            audio = vocoder(speech).cpu()
        # Flatten to 1-D so clips concatenate regardless of whether the
        # vocoder returned a batched or un-batched waveform.
        clips.append(audio.reshape(-1))

    if not clips:
        # Whitespace-only or punctuation-only input: nothing to concatenate.
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    # 0.5 s of silence between consecutive sentences (none after the last one).
    pause = torch.zeros(int(sample_rate * 0.5), dtype=clips[0].dtype)
    pieces = []
    for index, clip in enumerate(clips):
        if index:
            pieces.append(pause)
        pieces.append(clip)

    final_audio = torch.cat(pieces)
    # Return a 1-D sample array, i.e. a single mono channel.
    return (sample_rate, final_audio.numpy())

# Gradio UI: a text box and a voice dropdown feed text_to_speech, and the
# returned (sample_rate, samples) tuple is played by the audio component.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)", lines=7, placeholder="Qoraalka geli halkan..."),
        gr.Dropdown(
            VOICE_SAMPLE_FILES,
            label="Select Voice",
            info="Dooro codka aad rabto inaad isticmaasho.",
            # Preselect the first voice when the list is not empty.
            value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
        )
    ],
    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
)

if __name__ == "__main__":
    # Refuse to start when any configured voice sample is missing.
    if any(not os.path.exists(sample_path) for sample_path in VOICE_SAMPLE_FILES):
        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")

    # Warm the embedding caches so the first request does not pay for it.
    print("Diyaarinta codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)