File size: 4,599 Bytes
4fb7378
b283a59
 
 
 
 
 
35e3ab8
013ca2c
b283a59
4ab180c
 
b283a59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47fa993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35e3ab8
47fa993
 
 
 
35e3ab8
 
 
 
47fa993
b283a59
 
4fb7378
35e3ab8
47fa993
 
35e3ab8
 
 
47fa993
 
 
 
028a2a8
35e3ab8
 
 
 
 
 
47fa993
 
35e3ab8
 
4fb7378
 
 
 
47fa993
4fb7378
 
47fa993
4fb7378
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
import torch
import numpy as np
import scipy.io.wavfile
from transformers import VitsModel, AutoTokenizer
import re

# Pick the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned VITS checkpoint (Hugging Face Hub id or local path),
# move it to the chosen device and switch it to inference mode.
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model").to(device).eval()

# The tokenizer is loaded from its own repository id, separate from the model's.
tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")

# Somali words for the cardinals that number_to_words looks up directly:
# 0-19, the round tens 20-90, 100 and 1000.
number_words = dict(enumerate((
    "eber", "koow", "labo", "seddex", "afar", "shan",
    "lix", "todobo", "sideed", "sagaal", "toban",
)))
# 11-19 are spelled "toban iyo <unit>".
number_words.update(
    (10 + unit, "toban iyo " + number_words[unit]) for unit in range(1, 10)
)
# Round tens 20, 30, ..., 90.
number_words.update(zip(
    range(20, 100, 10),
    (
        "labaatan", "sodon", "afartan", "konton",
        "lixdan", "todobaatan", "sideetan", "sagaashan",
    ),
))
number_words.update({100: "boqol", 1000: "kun"})

def number_to_words(number):
    """Spell an integer out in Somali words.

    Args:
        number: An int, or anything ``int()`` accepts (e.g. a digit string
            such as a regex match).

    Returns:
        The spelled-out number. Negative values are prefixed with
        "miinas ", the same word ``normalize_text`` substitutes for "-".
        Magnitudes of 1,000,000,000 and above are not spelled out; their
        decimal digits are returned instead.

    Raises:
        ValueError: If ``number`` cannot be converted by ``int()``.
    """
    number = int(number)

    # Without this guard a negative value would fall into the "< 20" branch
    # below and fail with KeyError on the lookup table.
    if number < 0:
        return "miinas " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return (words + " iyo " + number_words[unit]) if unit else words

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # Exactly one hundred is plain "boqol", with no leading count word.
        words = (number_words[hundreds] + " boqol") if hundreds > 1 else "boqol"
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        words = ["kun" if thousands == 1 else number_to_words(thousands) + " kun"]
        if remainder >= 100:
            # The hundreds branch already renders "<n> boqol[ iyo <rest>]",
            # so recurse instead of repeating that logic here.
            words.append(number_to_words(remainder))
        elif remainder:
            # A remainder below one hundred is attached with "iyo".
            words.append("iyo " + number_to_words(remainder))
        return " ".join(words)

    if number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        words = [
            "milyan" if millions == 1 else number_to_words(millions) + " milyan"
        ]
        if remainder:
            words.append(number_to_words(remainder))
        return " ".join(words)

    # Too large to spell out: fall back to the decimal digits.
    return str(number)

def normalize_text(text):
    """Rewrite *text* so that digits and a few symbols become words.

    The steps run in this order: commas inside grouped numbers are removed,
    every "." followed by digits is dropped, each remaining run of digits is
    replaced by the result of ``number_to_words``, the symbols ``$ = + -``
    are replaced by space-padded words, and a fixed list of uppercase letter
    sequences is substituted.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    # Join digit groups such as 1,000,000 into a single run of digits.
    text = re.sub(
        r'(\d{1,3})(,\d{3})+',
        lambda grouped: grouped.group(0).replace(",", ""),
        text,
    )

    # Drop fractional parts (e.g. .00); only the integer part is spoken.
    text = re.sub(r'\.\d+', '', text)

    # Spell out every remaining run of digits.
    text = re.sub(r'\d+', lambda digits: number_to_words(digits.group()), text)

    # Spell out special symbols, padding each word with spaces. None of the
    # words contains a mapped symbol, so one translate pass gives the same
    # result as replacing the symbols one after another.
    symbol_map = {
        '$': 'doolar',
        '=': 'egwal',
        '+': 'balaas',
        '-': 'miinas'
    }
    padded = {sym: ' ' + word + ' ' for sym, word in symbol_map.items()}
    text = text.translate(str.maketrans(padded))

    # Character-level substitutions, applied strictly in this order: an
    # earlier rule can feed a later one (e.g. "Z" -> "S" runs before "SH").
    substitutions = (
        ("KH", "qa"),
        ("Z", "S"),
        ("SH", "SHa'a"),
        ("DH", "Dha'a"),
        ("ZamZam", "SamSam"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    return text

def tts(text):
    """Synthesize speech for *text* and write it to a WAV file.

    The text is split on newlines. Each non-blank paragraph is normalized
    with ``normalize_text``, tokenized and run through the model, and the
    resulting waveforms are joined with 0.8 seconds of silence between
    consecutive paragraphs.

    Args:
        text: The text to speak; one paragraph per line.

    Returns:
        The path of the written 16-bit WAV file ("output.wav"), or ``None``
        when *text* is empty or contains only whitespace.
    """
    # Keep only paragraphs with visible content. Checking this before any
    # model access avoids calling np.concatenate on an empty list, which
    # raises ValueError.
    paragraphs = [
        para for para in (text or "").strip().split("\n") if para.strip()
    ]
    if not paragraphs:
        return None

    sampling_rate = model.config.sampling_rate
    pause = np.zeros(int(sampling_rate * 0.8))  # 0.8 seconds pause

    segments = []
    for para in paragraphs:
        inputs = tokenizer(normalize_text(para), return_tensors="pt").to(device)
        with torch.no_grad():
            waveform = model(**inputs).waveform.squeeze().cpu().numpy()
        # Insert the pause between paragraphs only, never after the last one.
        if segments:
            segments.append(pause)
        segments.append(waveform)

    final_audio = np.concatenate(segments)

    # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
    # around in the int16 cast.
    # NOTE(review): assumes the model emits float audio nominally in [-1, 1].
    pcm = (np.clip(final_audio, -1.0, 1.0) * 32767).astype(np.int16)

    filename = "output.wav"
    scipy.io.wavfile.write(filename, rate=sampling_rate, data=pcm)
    return filename

# Multi-line text input and audio output for the web UI.
text_input = gr.Textbox(
    label="Geli qoraal Soomaali ah",
    lines=10,
    placeholder="Ku qor 1 ama in ka badan paragraph...",
)
audio_output = gr.Audio(label="Codka TTS")

# Wire the tts function into a Gradio interface and start serving it.
demo = gr.Interface(
    fn=tts,
    inputs=text_input,
    outputs=audio_output,
    title="Somali TTS",
    description="Ku qor qoraal Soomaaliyeed si aad u maqasho cod dabiici ah.",
)
demo.launch()