Spaces:

Somalitts
/

orph

Build error

App Files Files Community

Somalitts committed on Jul 13, 2025

Commit

4b84ef4

verified ·

1 Parent(s): 0ff498c

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -60

app.py CHANGED Viewed

@@ -1,73 +1,142 @@
-# WARNING: THIS CODE IS FOR ILLUSTRATION ONLY AND WILL NOT WORK.
-# The 'Somalitts/somali_tts_model' does not support voice cloning.
 import gradio as gr
 import torch
-import numpy as np
-import scipy.io.wavfile
-from transformers import VitsModel, AutoTokenizer
 import re
-# --- The problem starts here ---
-# This model is a single-speaker model. It CANNOT clone voices.
-# To make this work, you would need a different model designed for cloning.
-model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
-tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
-# ---
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-model.eval()
-# For this to work, you would need to upload your voice files to your Space
-# and provide the path here.
-YOUR_VOICE_SAMPLE_PATH = ["46.wav", "90.wav", "150.wav", "355.wav"]
-# [Your other functions like number_to_words and normalize_text would remain here]
-# ...
-def tts(text):
-    # --- The core logic would need to change entirely ---
-    # 1. THIS IS THE MISSING STEP:
-    # A real voice cloning model would need to extract voice characteristics
-    # from your audio file. The VitsModel you are using has NO such function.
-    #
-    # PSEUDO-CODE (DOES NOT EXIST FOR THIS MODEL):
-    # voice_characteristics = model.extract_speaker_embedding(YOUR_VOICE_SAMPLE_PATH)
-    paragraphs = text.strip().split("\n")
-    audio_list = []
-    for para in paragraphs:
-        # [Text processing would be the same]
-        # ...
-        norm_para = normalize_text(para)
-        inputs = tokenizer(norm_para, return_tensors="pt").to(device)
-        with torch.no_grad():
-            # 2. THIS IS THE SECOND MISSING STEP:
-            # You would need to pass your voice characteristics to the model.
-            # The current model does not accept a 'speaker_embedding' or similar argument.
-            #
-            # PSEUDO-CODE (DOES NOT EXIST FOR THIS MODEL):
-            # waveform = model(**inputs, speaker_embedding=voice_characteristics).waveform
-            # The actual line of code below does not and cannot use your voice:
-            waveform = model(**inputs).waveform.squeeze().cpu().numpy()
-        pause = np.zeros(int(model.config.sampling_rate * 0.8))
-        audio_list.append(np.concatenate((waveform, pause)))
-    final_audio = np.concatenate(audio_list)
-    filename = "output.wav"
-    scipy.io.wavfile.write(filename, rate=model.config.sampling_rate, data=(final_audio * 32767).astype(np.int16))
-    return filename
-# The interface would also need an input for the audio file.
-gr.Interface(
-    fn=tts,
-    inputs=gr.Textbox(label="Geli qoraal Soomaali ah"),
-    outputs=gr.Audio(label="Codka TTS"),
-    title="Somali TTS (Non-Cloning)"
-).launch()

 import gradio as gr
 import torch
 import re
+import os
+from TTS.api import TTS
+# --- Configuration ---
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Add Your Voice Files Here ---
+# Reference recordings offered in the voice dropdown. Each entry is later used
+# as a file path (existence check + `speaker_wav`), so it is resolved relative
+# to the working directory.
+VOICE_SAMPLE_FILES = ["46.wav", "90.wav", "150.wav", "355.wav"]
+# --- Load the Voice Cloning Model ---
+# Loads Coqui TTS's XTTS-v2 checkpoint by model id and moves it to `device`.
+# NOTE(review): the log line below calls the model "VITS-based"; XTTS-v2 is
+# presumably a different architecture from VITS — confirm against the Coqui docs.
+# Presumably the checkpoint is downloaded on first use — TODO confirm.
+# This runs at import time, so a failure here stops the app from starting.
+try:
+    print("Loading VITS-based voice cloning model (XTTS-v2)...")
+    tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+    print("Model loaded successfully.")
+except Exception as e:
+    # Any load failure is re-raised with the underlying error text in the message.
+    raise gr.Error(f"Error loading the TTS model: {e}. Check your internet connection.")
+# --- Somali Text Processing Functions (From Your Original Script) ---
+# This logic is preserved exactly as you provided it.
+# Somali words for the numbers that cannot be composed from smaller ones:
+# 0-19, the tens up to 90, plus "hundred" and "thousand".
+number_words = {
+    0: "eber",
+    1: "koow",
+    2: "labo",
+    3: "seddex",
+    4: "afar",
+    5: "shan",
+    6: "lix",
+    7: "todobo",
+    8: "sideed",
+    9: "sagaal",
+    10: "toban",
+    11: "toban iyo koow",
+    12: "toban iyo labo",
+    13: "toban iyo seddex",
+    14: "toban iyo afar",
+    15: "toban iyo shan",
+    16: "toban iyo lix",
+    17: "toban iyo todobo",
+    18: "toban iyo sideed",
+    19: "toban iyo sagaal",
+    20: "labaatan",
+    30: "sodon",
+    40: "afartan",
+    50: "konton",
+    60: "lixdan",
+    70: "todobaatan",
+    80: "sideetan",
+    90: "sagaashan",
+    100: "boqol",
+    1000: "kun",
+}
+def number_to_words(number):
+    """Spell out an integer below one million in Somali words.
+
+    Args:
+        number: An int, or anything `int()` accepts (e.g. a digit string).
+
+    Returns:
+        The Somali wording, or `str(number)` when the value cannot be parsed
+        as an integer, has no table entry (e.g. negatives), or is >= 1,000,000.
+    """
+    try:
+        value = int(number)
+    except ValueError:
+        # Not an integer at all: hand the input back as text.
+        return str(number)
+    try:
+        if value < 20:
+            return number_words.get(value, str(value))
+        if value < 100:
+            tens, unit = divmod(value, 10)
+            words = number_words[tens * 10]
+            if unit:
+                words = words + " iyo " + number_words[unit]
+            return words
+        if value < 1000:
+            hundreds, rest = divmod(value, 100)
+            # Exactly one hundred is just "boqol", without a leading "koow".
+            if hundreds > 1:
+                words = number_words.get(hundreds) + " boqol"
+            else:
+                words = "boqol"
+            if rest:
+                words += " iyo " + number_to_words(rest)
+            return words
+        if value < 1_000_000:
+            thousands, rest = divmod(value, 1000)
+            # Same rule for one thousand: plain "kun".
+            if thousands > 1:
+                words = number_to_words(thousands) + " kun"
+            else:
+                words = "kun"
+            if rest:
+                words += " " + number_to_words(rest)
+            return words
+    except KeyError:
+        # A missing table entry falls back to the digits.
+        return str(value)
+    return str(value)
+def normalize_text(text):
+    """Prepare raw input text for the speech model.
+
+    Steps, in order: digit runs are spelled out with `number_to_words`; the
+    symbols $, =, +, # are replaced by spoken words padded with spaces;
+    "zamzam" in any capitalisation becomes "samsam"; the upper-case sequences
+    KH, Z, SH, DH are rewritten; finally the text is lower-cased.
+
+    Args:
+        text: The raw input string.
+
+    Returns:
+        The normalized, lower-cased string.
+    """
+    text = re.sub(r'(\d+)', lambda m: number_to_words(m.group(1)), text)
+    symbol_map = {
+        '$': 'doolar', '=': 'egwal', '+': 'balaas', '#': 'haash'
+    }
+    for sym, word in symbol_map.items():
+        text = text.replace(sym, f' {word} ')
+    # Must run before the "Z" -> "s" rule below: otherwise "Zamzam" is first
+    # turned into "samzam", which no longer matches and is left half-converted.
+    text = re.sub(r'zamzam', 'samsam', text, flags=re.IGNORECASE)
+    text = text.replace("KH", "qa").replace("Z", "s")
+    text = text.replace("SH", "sha'a").replace("DH", "dha'a")
+    return text.lower()
+# --- Main Text-to-Speech Function ---
+def generate_cloned_speech(text, voice_choice):
+    """Synthesize `text` in the voice of the selected reference recording.
+
+    The text is passed through `normalize_text` and handed to the loaded
+    XTTS-v2 model together with the reference file given as `speaker_wav`.
+
+    Args:
+        text: Text typed by the user.
+        voice_choice: Path of the reference recording picked in the dropdown.
+
+    Returns:
+        Path of the generated WAV file, or None when the text or the voice is
+        missing (a warning is shown in the UI in that case).
+
+    Raises:
+        gr.Error: If the reference file does not exist or synthesis fails.
+    """
+    if not text or not text.strip():
+        gr.Warning("Qoraalka geli, fadlan (Please enter some text).")
+        return None
+    if not voice_choice:
+        gr.Warning("Cod dooro, fadlan (Please select a voice).")
+        return None
+    # isfile rather than exists: a directory of that name is not a recording.
+    if not os.path.isfile(voice_choice):
+        raise gr.Error(f"File-ka codka lama helin: {voice_choice}. Hubi inuu ku jiro galka saxda ah.")
+    print(f"Generating speech for text: '{text}' using voice: '{voice_choice}'")
+    # Spell out numbers and symbols before synthesis.
+    normalized_text = normalize_text(text)
+    output_path = "output.wav"
+    # NOTE(review): XTTS-v2's published language list does not appear to
+    # include Somali ("so") — confirm against the Coqui docs. If the model
+    # rejects the request, the handler below reports the reason in the UI
+    # instead of leaving only a traceback in the server log.
+    try:
+        tts_model.tts_to_file(
+            text=normalized_text,
+            speaker_wav=voice_choice,  # Reference recording whose voice is cloned
+            language="so",             # Requested language code (Somali)
+            file_path=output_path
+        )
+    except Exception as exc:
+        print(f"Speech generation failed: {exc}")
+        raise gr.Error(f"Codka lama soo saari karin (Speech generation failed): {exc}") from exc
+    print("Speech generated successfully.")
+    return output_path
+# --- Gradio User Interface ---
+# Layout: text box, voice dropdown and button on the left, audio player on the
+# right. The button sends (text, voice) to `generate_cloned_speech`.
+with gr.Blocks(theme=gr.themes.Base()) as app:
+    gr.Markdown(
+        "# Somali TTS with VITS Voice Cloning\n"
+        "Ku qor qoraal Soomaaliyeed, dooro mid ka mid ah codadkaaga, oo riix 'Soo Saar Codka' si aad u maqasho qoraalka oo codkaas ku hadlaya."
+    )
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Qoraalka Geli (Enter Text)",
+                lines=5,
+                placeholder="Ku qor qoraalkaaga halkan..."
+            )
+            voice_dropdown = gr.Dropdown(
+                choices=VOICE_SAMPLE_FILES,
+                label="Codka Dooro (Select Your Voice)",
+                info="Dooro codka aad rabto inaad ku hadasho.",
+                value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
+            )
+            submit_button = gr.Button("Soo Saar Codka (Generate Speech)", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Codka La Abuuray (Generated Audio)", type="filepath")
+    submit_button.click(
+        fn=generate_cloned_speech,
+        inputs=[text_input, voice_dropdown],
+        outputs=audio_output
+    )
+    # Examples reference concrete voice files, so only build them when at least
+    # one voice is configured; indexing an empty list would crash at import,
+    # even though the dropdown above already tolerates an empty list.
+    if VOICE_SAMPLE_FILES:
+        gr.Examples(
+            examples=[
+                ["Waa imisa qiimaha badeecadan? waa 1500 oo shilin.", VOICE_SAMPLE_FILES[0]],
+                ["Bari waxaan aadayaa magaalada Muqdisho.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else VOICE_SAMPLE_FILES[0]],
+            ],
+            inputs=[text_input, voice_dropdown]
+        )
+# --- Launch the Application ---
+# Start the server only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+    app.launch()