Somalitts committed on
Commit
4b84ef4
·
verified ·
1 Parent(s): 0ff498c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -60
app.py CHANGED
@@ -1,73 +1,142 @@
1
- # WARNING: THIS CODE IS FOR ILLUSTRATION ONLY AND WILL NOT WORK.
2
- # The 'Somalitts/somali_tts_model' does not support voice cloning.
3
-
4
  import gradio as gr
5
  import torch
6
- import numpy as np
7
- import scipy.io.wavfile
8
- from transformers import VitsModel, AutoTokenizer
9
  import re
 
 
10
 
11
- # --- The problem starts here ---
12
- # This model is a single-speaker model. It CANNOT clone voices.
13
- # To make this work, you would need a different model designed for cloning.
14
- model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
15
- tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
16
- # ---
17
 
18
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
- model.to(device)
20
- model.eval()
21
 
22
- # For this to work, you would need to upload your voice files to your Space
23
- # and provide the path here.
24
- YOUR_VOICE_SAMPLE_PATH = ["46.wav", "90.wav", "150.wav", "355.wav"]
 
 
 
 
 
 
25
 
26
- # [Your other functions like number_to_words and normalize_text would remain here]
27
- # ...
 
 
 
 
 
 
 
 
 
 
28
 
29
- def tts(text):
30
- # --- The core logic would need to change entirely ---
31
-
32
- # 1. THIS IS THE MISSING STEP:
33
- # A real voice cloning model would need to extract voice characteristics
34
- # from your audio file. The VitsModel you are using has NO such function.
35
- #
36
- # PSEUDO-CODE (DOES NOT EXIST FOR THIS MODEL):
37
- # voice_characteristics = model.extract_speaker_embedding(YOUR_VOICE_SAMPLE_PATH)
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- paragraphs = text.strip().split("\n")
40
- audio_list = []
41
-
42
- for para in paragraphs:
43
- # [Text processing would be the same]
44
- # ...
45
- norm_para = normalize_text(para)
46
- inputs = tokenizer(norm_para, return_tensors="pt").to(device)
 
 
 
47
 
48
- with torch.no_grad():
49
- # 2. THIS IS THE SECOND MISSING STEP:
50
- # You would need to pass your voice characteristics to the model.
51
- # The current model does not accept a 'speaker_embedding' or similar argument.
52
- #
53
- # PSEUDO-CODE (DOES NOT EXIST FOR THIS MODEL):
54
- # waveform = model(**inputs, speaker_embedding=voice_characteristics).waveform
55
-
56
- # The actual line of code below does not and cannot use your voice:
57
- waveform = model(**inputs).waveform.squeeze().cpu().numpy()
 
 
 
58
 
59
- pause = np.zeros(int(model.config.sampling_rate * 0.8))
60
- audio_list.append(np.concatenate((waveform, pause)))
61
 
62
- final_audio = np.concatenate(audio_list)
63
- filename = "output.wav"
64
- scipy.io.wavfile.write(filename, rate=model.config.sampling_rate, data=(final_audio * 32767).astype(np.int16))
65
- return filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # The interface would also need an input for the audio file.
68
- gr.Interface(
69
- fn=tts,
70
- inputs=gr.Textbox(label="Geli qoraal Soomaali ah"),
71
- outputs=gr.Audio(label="Codka TTS"),
72
- title="Somali TTS (Non-Cloning)"
73
- ).launch()
 
 
 
 
1
  import gradio as gr
2
  import torch
 
 
 
3
  import re
4
+ import os
5
+ from TTS.api import TTS
6
 
7
+ # --- Configuration ---
8
+ # Set the device for computation
9
+ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
10
 
11
+ # --- Add Your Voice Files Here ---
12
+ # A dropdown menu will be created using this list.
13
+ VOICE_SAMPLE_FILES = ["46.wav", "90.wav", "150.wav", "355.wav"]
14
 
15
+ # --- Load the XTTS-v2 Voice Cloning Model ---
16
+ # This uses the powerful XTTS-v2 model from Coqui TTS, which is designed for this task.
17
+ # It will be downloaded on the first run.
18
# Load the XTTS-v2 checkpoint once at start-up and move it to the selected
# device. Any failure is re-raised as a gr.Error that carries the original
# message, explicitly chained so the root-cause traceback stays attached.
try:
    print("Loading VITS-based voice cloning model (XTTS-v2)...")
    tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("Model loaded successfully.")
except Exception as e:
    raise gr.Error(f"Error loading the TTS model: {e}. Check your internet connection.") from e
24
 
25
+ # --- Somali Text Processing Functions (From Your Original Script) ---
26
+ # This logic is preserved exactly as you provided it.
27
# Somali words for the base numerals; every other value below one million is
# composed from these entries by number_to_words().
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}


def number_to_words(number):
    """Spell out an integer in Somali words.

    Args:
        number: An int, or anything ``int()`` accepts (e.g. a digit string
            such as ``"007"``).

    Returns:
        The Somali wording for values from 0 up to 999,999. Values outside
        that range (negatives, one million and above) are returned as their
        decimal string. Input that cannot be converted to an int at all is
        returned unchanged as ``str(number)``.
    """
    try:
        number = int(number)
        if number < 20:
            # Covers 0-19 from the table; negatives fall through to digits.
            return number_words.get(number, str(number))
        if number < 100:
            tens, unit = divmod(number, 10)
            return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
        if number < 1000:
            hundreds, remainder = divmod(number, 100)
            # Exactly one hundred is just "boqol", without a leading "koow".
            part = (number_words.get(hundreds) + " boqol" if hundreds > 1 else "boqol")
            if remainder:
                part += " iyo " + number_to_words(remainder)
            return part
        if number < 1_000_000:
            thousands, remainder = divmod(number, 1000)
            # Exactly one thousand is just "kun", without a leading "koow".
            part = (number_to_words(thousands) + " kun") if thousands > 1 else "kun"
            if remainder:
                part += " " + number_to_words(remainder)
            return part
        return str(number)
    except (TypeError, ValueError, KeyError):
        # TypeError: int(None) and similar non-numeric types must fall back
        # to the plain string instead of crashing the caller.
        return str(number)
58
 
59
def normalize_text(text):
    """Prepare raw input text for speech synthesis.

    Steps, in order:
      1. Every run of digits is replaced by ``number_to_words`` output.
      2. The symbols ``$ = + #`` are replaced by their spoken words, padded
         with spaces.
      3. "zamzam" in any letter case becomes "samsam".
      4. The upper-case sequences "KH", "Z", "SH" and "DH" are rewritten.
      5. The result is lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        The normalized, lower-cased string.
    """
    text = re.sub(r'(\d+)', lambda m: number_to_words(m.group(1)), text)
    symbol_map = {
        '$': 'doolar', '=': 'egwal', '+': 'balaas', '#': 'haash'
    }
    for sym, word in symbol_map.items():
        text = text.replace(sym, f' {word} ')
    # Handle "zamzam" before the generic "Z" rule and ignore case: otherwise
    # "Zamzam" has only its first Z rewritten and comes out as "samzam".
    text = re.sub(r'zamzam', 'samsam', text, flags=re.IGNORECASE)
    text = text.replace("KH", "qa").replace("Z", "s")
    text = text.replace("SH", "sha'a").replace("DH", "dha'a")
    return text.lower()
70
 
71
+ # --- Main Text-to-Speech Function ---
72
def generate_cloned_speech(text, voice_choice):
    """Synthesize ``text`` in the voice of the selected reference recording.

    Args:
        text: The text to speak; it is passed through ``normalize_text``
            before synthesis.
        voice_choice: Path of the reference ``.wav`` file whose voice is
            cloned (one of the dropdown choices).

    Returns:
        Path of the generated ``.wav`` file, or ``None`` when the text is
        empty or no voice is selected (a Gradio warning is shown instead).

    Raises:
        gr.Error: If the selected reference file does not exist.
    """
    # Local import keeps this block self-contained.
    import tempfile

    if not text or not text.strip():
        gr.Warning("Qoraalka geli, fadlan (Please enter some text).")
        return None
    if not voice_choice:
        gr.Warning("Cod dooro, fadlan (Please select a voice).")
        return None
    if not os.path.exists(voice_choice):
        raise gr.Error(f"File-ka codka lama helin: {voice_choice}. Hubi inuu ku jiro galka saxda ah.")

    print(f"Generating speech for text: '{text}' using voice: '{voice_choice}'")

    # Apply the Somali number/symbol normalization before synthesis.
    normalized_text = normalize_text(text)

    # Write to a unique file per request. A fixed "output.wav" would let
    # concurrent requests overwrite each other's audio. The file is left on
    # disk so the caller can read it back from the returned path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    # NOTE(review): "so" does not appear in the language list published for
    # XTTS-v2; confirm the loaded model accepts it, otherwise this call fails.
    tts_model.tts_to_file(
        text=normalized_text,
        speaker_wav=voice_choice,  # reference recording that is cloned
        language="so",
        file_path=output_path
    )

    print("Speech generated successfully.")
    return output_path
102
+
103
+ # --- Gradio User Interface ---
104
# Gradio UI: text box and voice selector on the left, generated audio on the
# right, plus clickable examples when at least one voice is configured.
with gr.Blocks(theme=gr.themes.Base()) as app:
    gr.Markdown(
        "# Somali TTS with VITS Voice Cloning\n"
        "Ku qor qoraal Soomaaliyeed, dooro mid ka mid ah codadkaaga, oo riix 'Soo Saar Codka' si aad u maqasho qoraalka oo codkaas ku hadlaya."
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Qoraalka Geli (Enter Text)",
                lines=5,
                placeholder="Ku qor qoraalkaaga halkan..."
            )
            voice_dropdown = gr.Dropdown(
                choices=VOICE_SAMPLE_FILES,
                label="Codka Dooro (Select Your Voice)",
                info="Dooro codka aad rabto inaad ku hadasho.",
                value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
            )
            submit_button = gr.Button("Soo Saar Codka (Generate Speech)", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Codka La Abuuray (Generated Audio)", type="filepath")

    submit_button.click(
        fn=generate_cloned_speech,
        inputs=[text_input, voice_dropdown],
        outputs=audio_output
    )

    # The examples index into VOICE_SAMPLE_FILES, so build them only when a
    # voice is configured; this matches the empty-list guard on the dropdown
    # and avoids an IndexError at start-up.
    if VOICE_SAMPLE_FILES:
        gr.Examples(
            examples=[
                ["Waa imisa qiimaha badeecadan? waa 1500 oo shilin.", VOICE_SAMPLE_FILES[0]],
                ["Bari waxaan aadayaa magaalada Muqdisho.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else VOICE_SAMPLE_FILES[0]],
            ],
            inputs=[text_input, voice_dropdown]
        )

# --- Launch the Application ---
if __name__ == "__main__":
    app.launch()