jellecali8 committed on
Commit
164b72b
·
verified ·
1 Parent(s): b657362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -21
app.py CHANGED
@@ -1,30 +1,130 @@
1
  import gradio as gr
2
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
3
  import torch
4
- import soundfile as sf
5
- import tempfile
 
 
6
 
7
- model_id = "jellecali8/somali_tts_model"
 
 
 
 
 
8
 
9
- processor = AutoProcessor.from_pretrained(model_id)
10
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def tts(text):
13
- inputs = processor(text, return_tensors="pt")
14
- with torch.no_grad():
15
- outputs = model.generate(**inputs)
16
- audio = outputs[0].cpu().numpy()
17
 
18
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
19
- sf.write(f.name, audio, samplerate=16000)
20
- return f.name
 
21
 
22
- iface = gr.Interface(
23
- fn=tts,
24
- inputs=gr.Textbox(lines=2, placeholder="Ku qor qoraalka Somali halkan...", label="Qoraalka Somali"),
25
- outputs=gr.Audio(label="Codka la soo saaray"),
26
- title="Somali TTS Demo",
27
- description="Qoraal ku qor af-Soomaali kadib dhageyso codka."
28
- )
 
 
 
 
29
 
30
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import torch
3
+ import numpy as np
4
+ import scipy.io.wavfile
5
+ from transformers import VitsModel, AutoTokenizer
6
+ import re
7
 
8
# Pick the compute device first, then load the tokenizer and the VITS model,
# move the model onto that device and put it in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model").to(device)
model.eval()
14
 
15
# Numbers in Somali: the base words every larger number is composed from.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}

# Values at or above this bound (and negative values) are not spelled out.
_NUMBER_WORDS_LIMIT = 1_000_000_000


def number_to_words(number):
    """Spell out an integer using the Somali words in ``number_words``.

    Args:
        number: An ``int`` or any value ``int()`` accepts, such as a string
            of digits (leading zeros are ignored by ``int()``).

    Returns:
        The spelled-out number as a string. Negative values and values of
        1,000,000,000 or more are returned as their decimal digit string
        instead of being spelled out.

    Raises:
        ValueError: If ``number`` cannot be converted by ``int()``.
    """
    number = int(number)

    # Out-of-range values fall back to digits rather than raising KeyError.
    if number < 0 or number >= _NUMBER_WORDS_LIMIT:
        return str(number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} iyo {number_words[unit]}" if unit else words

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # Exactly one hundred is plain "boqol"; other multiples carry a count.
        words = "boqol" if hundreds == 1 else number_words[hundreds] + " boqol"
        if remainder:
            words += " iyo " + number_to_words(remainder)
        return words

    # Thousands and millions share one shape: "<count> <scale> <remainder>".
    if number < 1000000:
        scale, scale_word = 1000, "kun"
    else:
        scale, scale_word = 1000000, "milyan"
    count, remainder = divmod(number, scale)
    # A count of one is expressed by the bare scale word ("kun", "milyan").
    words = scale_word if count == 1 else number_to_words(count) + " " + scale_word
    if not remainder:
        return words

    rest = number_to_words(remainder)
    # Remainders below one hundred are joined with "iyo"; larger remainders
    # follow the scale word directly and bring their own internal "iyo".
    if remainder < 100:
        rest = "iyo " + rest
    return words + " " + rest
69
+
70
def normalize_text(text):
    """Rewrite digits, symbols and a few letter groups before tokenization.

    Steps, in order:
      1. Remove the commas from comma-grouped numbers ("1,250" -> "1250").
      2. Delete every "." that is followed by digits, together with those
         digits, so fractional parts are dropped.
      3. Replace each remaining digit run with ``number_to_words`` output.
      4. Replace ``$``, ``=``, ``+`` and ``#`` with a spoken word padded by
         spaces.
      5. Apply the upper-case letter substitutions listed below.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
    text = re.sub(r'\.\d+', '', text)
    text = re.sub(r'\d+', lambda m: number_to_words(m.group()), text)

    spoken_symbols = (
        ('$', 'doolar'),
        ('=', 'egwal'),
        ('+', 'balaas'),
        ('#', 'haash'),
    )
    for symbol, word in spoken_symbols:
        text = text.replace(symbol, ' ' + word + ' ')

    # Order matters: "Z" -> "S" runs before the "SH" rule, so "ZH" also ends
    # up as "SHa'a". Because every "Z" is already rewritten here, a separate
    # "ZamZam" -> "SamSam" rule could never match and is not needed.
    letter_rules = (
        ("KH", "qa"),
        ("Z", "S"),
        ("SH", "SHa'a"),
        ("DH", "Dha'a"),
    )
    for old, new in letter_rules:
        text = text.replace(old, new)
    return text
88
 
89
def tts(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    The input is split into paragraphs on newlines and blank paragraphs are
    skipped. A paragraph longer than ``max_chars`` is broken into chunks at
    whitespace so that words and numbers are not cut in the middle. Each
    chunk is normalized with ``normalize_text``, tokenized, and run through
    the model; 0.8 s of silence is appended after every chunk.

    Args:
        text: The text to speak. ``None`` is treated like an empty string.

    Returns:
        The path of a newly created 16-bit PCM WAV file, or ``None`` when
        ``text`` contains nothing but whitespace. The file is not deleted by
        this function.
    """
    # Local imports keep this function self-contained.
    import tempfile
    import textwrap

    max_chars = 500  # Per-chunk limit; the author estimates this as roughly 2 minutes.
    warnings = []
    waveforms = []

    for i, para in enumerate((text or "").strip().split("\n")):
        para = para.strip()
        if not para:
            continue

        if len(para) > max_chars:
            warnings.append(f"❗ Qaybta {i+1} aad ayaa ka badan 2 daqiiqo. Waan kala jaray.\n")
            # Break at whitespace; only a single word longer than max_chars
            # is ever split inside the word.
            sub_parts = textwrap.wrap(para, width=max_chars)
        else:
            sub_parts = [para]

        for part in sub_parts:
            inputs = tokenizer(normalize_text(part), return_tensors="pt").to(device)
            with torch.no_grad():
                output = model(**inputs)
            waveforms.append(output.waveform.squeeze().cpu().numpy())

    if warnings:
        print("".join(warnings))

    # Nothing to speak: np.concatenate would raise on an empty list.
    if not waveforms:
        return None

    sampling_rate = model.config.sampling_rate
    pause = np.zeros(int(sampling_rate * 0.8))  # 0.8s pause after each chunk
    final_audio = np.concatenate([seg for wave in waveforms for seg in (wave, pause)])

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around in the int16 conversion.
    pcm = (np.clip(final_audio, -1.0, 1.0) * 32767).astype(np.int16)

    # A unique file per call, so concurrent requests cannot overwrite each
    # other's audio. The handle is closed before the WAV data is written.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        filename = handle.name
    scipy.io.wavfile.write(filename, rate=sampling_rate, data=pcm)
    return filename
122
+
123
# Gradio interface: a multi-line text box in, synthesized audio out.
text_input = gr.Textbox(
    lines=10,
    label="Geli qoraal Soomaali ah",
    placeholder="Ku qor 1 ama in ka badan paragraph...",
)
audio_output = gr.Audio(label="Codka TTS")

demo = gr.Interface(
    fn=tts,
    inputs=text_input,
    outputs=audio_output,
    title="Somali TTS",
    description="Ku qor qoraal Soomaaliyeed si aad u maqasho cod dabiici ah. Qoraalka ha ka badnaan 2 daqiiqo per jumlad.",
)
demo.launch()