Somalitts committed on
Commit
0ff498c
·
verified ·
1 Parent(s): 51d9ba5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -102
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
@@ -5,128 +8,66 @@ import scipy.io.wavfile
5
  from transformers import VitsModel, AutoTokenizer
6
  import re
7
 
8
# Load the TTS model and tokenizer.
# NOTE(review): weights come from "Somali-tts/somali_tts_model" while the
# tokenizer comes from a different repo ("saleolow/somali-mms-tts") —
# presumably they share a vocabulary; confirm the pairing is intentional.
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")

# Prefer GPU when available; inference only, so switch to eval mode
# (disables dropout etc.).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
14
 
15
# Somali words for the number building blocks: 0-19, the tens, and the
# two scale words used literally by the converter ("boqol" = hundred,
# "kun" = thousand).
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}

def number_to_words(number):
    """Spell a non-negative integer (or numeric string) in Somali words.

    Handles values below one billion; anything larger is returned as the
    plain decimal string. "iyo" joins a larger unit to its remainder,
    mirroring Somali number phrasing.
    """
    n = int(number)
    if n < 20:
        return number_words[n]
    if n < 100:
        tens, ones = divmod(n, 10)
        base = number_words[tens * 10]
        return base + " iyo " + number_words[ones] if ones else base
    if n < 1000:
        hundreds, rest = divmod(n, 100)
        # "boqol" stands alone for exactly one hundred.
        phrase = "boqol" if hundreds == 1 else number_words[hundreds] + " boqol"
        return phrase + " iyo " + number_to_words(rest) if rest else phrase
    if n < 1_000_000:
        thousands, rest = divmod(n, 1000)
        pieces = ["kun" if thousands == 1 else number_to_words(thousands) + " kun"]
        if rest >= 100:
            hundreds, tail = divmod(rest, 100)
            pieces.append("boqol" if hundreds == 1 else number_words[hundreds] + " boqol")
            if tail:
                pieces.append("iyo " + number_to_words(tail))
        elif rest:
            pieces.append("iyo " + number_to_words(rest))
        return " ".join(pieces)
    if n < 1_000_000_000:
        millions, rest = divmod(n, 1_000_000)
        pieces = ["milyan" if millions == 1 else number_to_words(millions) + " milyan"]
        if rest:
            pieces.append(number_to_words(rest))
        return " ".join(pieces)
    # Out of supported range: fall back to the digits themselves.
    return str(n)
69
-
70
def normalize_text(text):
    """Prepare raw Somali text for the TTS model.

    Steps: strip thousands separators, drop decimal fractions, spell out
    integers via number_to_words, replace spoken symbols, then apply
    pronunciation respellings for digraphs the model mishandles.

    Requires number_to_words() to be defined in this module (only invoked
    when the text contains digits).
    """
    # "1,234,567" -> "1234567" so the digit regex sees one number.
    text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
    # Drop fractional parts: "3.14" -> "3".
    text = re.sub(r'\.\d+', '', text)

    def replace_num(match):
        return number_to_words(match.group())

    text = re.sub(r'\d+', replace_num, text)

    # Symbols read out as words (padded with spaces to keep tokens apart).
    symbol_map = {
        '$': 'doolar',
        '=': 'egwal',
        '+': 'balaas',
        '#': 'haash'
    }
    for sym, word in symbol_map.items():
        text = text.replace(sym, ' ' + word + ' ')

    # BUG FIX: these replacements previously ran AFTER the "Z" -> "S"
    # transform below, so "ZamZam"/"Zamzam" had already become
    # "SamSam"/"Samsam" and could never match (dead code). They must run
    # while the capital Z is still present.
    text = text.replace("ZamZam", "samsam")
    text = text.replace("Zamzam", "samsam")
    text = text.replace("zamzam", "samsam")

    # Pronunciation respellings for the model.
    text = text.replace("KH", "qa").replace("Z", "S")
    text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
    return text
90
 
91
def tts(text):
    """Synthesize Somali speech for *text* and return the path to a WAV file.

    Each newline-separated paragraph is normalized, synthesized, and
    followed by a 0.8 s pause. Paragraphs longer than ~500 characters
    (roughly two minutes of speech) are split into chunks.

    Relies on module-level model, tokenizer, device, and normalize_text.

    Raises:
        ValueError: if the input contains no speakable text.
    """
    paragraphs = text.strip().split("\n")
    audio_list = []
    max_chars = 500  # rough cap equivalent to ~2 minutes of speech per chunk
    warn_msg = ""

    for i, para in enumerate(paragraphs):
        para = para.strip()
        if not para:
            continue

        if len(para) > max_chars:
            warn_msg += f"❗ Qaybta {i+1} aad ayaa ka badan 2 daqiiqo. Waan kala jaray.\n"
            sub_parts = [para[j:j+max_chars] for j in range(0, len(para), max_chars)]
        else:
            sub_parts = [para]

        for part in sub_parts:
            norm_para = normalize_text(part)
            inputs = tokenizer(norm_para, return_tensors="pt").to(device)
            with torch.no_grad():
                waveform = model(**inputs).waveform.squeeze().cpu().numpy()

            pause = np.zeros(int(model.config.sampling_rate * 0.8))  # 0.8 s inter-chunk pause
            audio_list.append(np.concatenate((waveform, pause)))

    # BUG FIX: with empty/blank input, audio_list stays empty and
    # np.concatenate([]) raised an opaque ValueError; fail with a clear
    # message instead (still ValueError, so existing handlers keep working).
    if not audio_list:
        raise ValueError("No speakable text provided.")

    final_audio = np.concatenate(audio_list)
    filename = "output.wav"
    # Scale float waveform to 16-bit PCM for the WAV container.
    scipy.io.wavfile.write(filename, rate=model.config.sampling_rate, data=(final_audio * 32767).astype(np.int16))

    if warn_msg:
        print(warn_msg)
    return filename
124
 
125
# Build the Gradio UI and start the app.
demo = gr.Interface(
    fn=tts,
    inputs=gr.Textbox(label="Geli qoraal Soomaali ah", lines=10, placeholder="Ku qor 1 ama in ka badan paragraph..."),
    outputs=gr.Audio(label="Codka TTS"),
    title="Somali TTS",
    description="Ku qor qoraal Soomaaliyeed si aad u maqasho cod dabiici ah. Qoraalka ha ka badnaan 2 daqiiqo per jumlad.",
)
demo.launch()
 
1
+ # WARNING: THIS CODE IS FOR ILLUSTRATION ONLY AND WILL NOT WORK.
+ # The 'Somali-tts/somali_tts_model' loaded below does not support voice cloning.
+
4
  import gradio as gr
5
  import torch
6
  import numpy as np
 
8
  from transformers import VitsModel, AutoTokenizer
9
  import re
10
 
11
# --- The problem starts here ---
# NOTE(review): this is a single-speaker VITS model — it has no speaker
# embedding input and therefore CANNOT clone voices. A cloning-capable
# model would be needed for that feature.
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
# ---

# Prefer GPU when available; inference only, so eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Reference recordings for the (non-functional) cloning idea. These files
# would have to be uploaded to the Space; the list is never read by the
# code below.
YOUR_VOICE_SAMPLE_PATH = ["46.wav", "90.wav", "150.wav", "355.wav"]
 
 
 
 
 
 
 
 
25
 
26
+ # [Your other functions like number_to_words and normalize_text would remain here]
27
+ # ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
def tts(text):
    """Synthesize Somali speech for *text* and return the path to a WAV file.

    NOTE(review): despite the surrounding commentary about voice cloning,
    this VITS model is single-speaker and accepts no speaker embedding;
    output is always the model's built-in voice.

    Requires normalize_text() to be defined in this module — the current
    file only has a placeholder comment for it; restore the helper or the
    first non-blank paragraph will raise NameError.

    Raises:
        ValueError: if the input contains no speakable text.
    """
    paragraphs = text.strip().split("\n")
    audio_list = []

    for para in paragraphs:
        para = para.strip()
        # BUG FIX: blank paragraphs previously reached the tokenizer;
        # skip them as the earlier version of this app did.
        if not para:
            continue

        norm_para = normalize_text(para)
        inputs = tokenizer(norm_para, return_tensors="pt").to(device)

        with torch.no_grad():
            waveform = model(**inputs).waveform.squeeze().cpu().numpy()

        pause = np.zeros(int(model.config.sampling_rate * 0.8))  # 0.8 s inter-paragraph pause
        audio_list.append(np.concatenate((waveform, pause)))

    # BUG FIX: with empty input, np.concatenate([]) raised an opaque
    # ValueError; fail with a clear message instead.
    if not audio_list:
        raise ValueError("No speakable text provided.")

    final_audio = np.concatenate(audio_list)
    filename = "output.wav"
    # Scale float waveform to 16-bit PCM for the WAV container.
    scipy.io.wavfile.write(filename, rate=model.config.sampling_rate, data=(final_audio * 32767).astype(np.int16))
    return filename
66
 
67
# Build the Gradio UI and start the app.
# (A real cloning workflow would also need an audio-file input here.)
demo = gr.Interface(
    fn=tts,
    inputs=gr.Textbox(label="Geli qoraal Soomaali ah"),
    outputs=gr.Audio(label="Codka TTS"),
    title="Somali TTS (Non-Cloning)",
)
demo.launch()