Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 16, 2025

Commit

2860b2a

verified ·

1 Parent(s): 767e58a

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -35

app.py CHANGED Viewed

@@ -10,71 +10,177 @@ from speechbrain.pretrained import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
-model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
-# Speaker encoder
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
-    savedir="/tmp/spk_model"
 )
-# Load female embedding only
-def get_embedding(wav_path, pt_path):
-    if os.path.exists(pt_path):
-        return torch.load(pt_path).to(device)
-    audio, sr = torchaudio.load(wav_path)
     audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
         emb = speaker_model.encode_batch(audio)
         emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
-    torch.save(emb.cpu(), pt_path)
-    return emb
-embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
-# Text normalization
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
     20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
     60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
     100: "boqol", 1000: "kun"
 }
-def number_to_words(n):
-    if n < 20:
-        return number_words.get(n, str(n))
-    elif n < 100:
-        tens, unit = divmod(n, 10)
-        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
-    elif n < 1000:
-        hundreds, rem = divmod(n, 100)
-        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" " + number_to_words(rem) if rem else "")
-    elif n < 1_000_000:
-        th, rem = divmod(n, 1000)
-        return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
     else:
-        return str(n)
 def replace_numbers_with_words(text):
-    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
     text = re.sub(r'[^\w\s]', '', text)
     return text
-# Gradio interface
 iface = gr.Interface(
-    fn=tts,
-    inputs=gr.Textbox(label="Geli qoraalka af-soomaali", lines=10, placeholder="Ku qor qoraalka..."),
-    outputs=gr.Audio(label="Codka la abuuray", type="filepath"),
-    title="Somali TTS - Qaybo Dheer & Cod Gaar ah",
-    description="Qoraal dheer ayaad gali kartaa oo lagu kala jarayo paragraphs. Waxaa lagu abuurayaa cod TTS af Soomaali ah."
 )
 iface.launch()

+# Run on the GPU when CUDA is available, otherwise on the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load models: the SpeechT5 processor, the "Somalitts/8aad" text-to-speech
+# model and the HiFi-GAN vocoder. The model and vocoder are moved to `device`.
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+# SpeechBrain speaker encoder; its encode_batch() output is used to build the
+# speaker embedding. Its files are stored under ./spk_model.
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
+    savedir="./spk_model"
 )
+# Speaker embedding: reuse the tensor cached in EMB_PATH when the file exists;
+# otherwise compute it from "1.wav" and write it to EMB_PATH.
+EMB_PATH = "speaker_embedding.pt"
+if os.path.exists(EMB_PATH):
+    speaker_embedding = torch.load(EMB_PATH).to(device)
+else:
+    audio, sr = torchaudio.load("1.wav")
+    # Resample to 16000 Hz, average over dim 0 (presumably the channel axis -
+    # TODO confirm) and add a leading dimension of size 1.
     audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
+    # Inference only: no gradients are needed for the encoder pass.
     with torch.no_grad():
         emb = speaker_model.encode_batch(audio)
+        # Normalize along dim 2, then drop all singleton dimensions.
         emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
+    # The cached copy is saved from the CPU; the in-memory one stays on `device`.
+    torch.save(emb.cpu(), EMB_PATH)
+    speaker_embedding = emb
+# Number conversion (Somali)
+# Building blocks used by number_to_words: the words for 0-19, the tens
+# from 20 to 90, 100 ("boqol") and 1000 ("kun").
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
+    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
+    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
+    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
     20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
     60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
     100: "boqol", 1000: "kun"
 }
+# Abbreviations and the full phrases normalize_text expands them to.
+# Keys are lowercase because normalize_text lowercases the matched word
+# before looking it up here.
+shortcut_map = {
+    "asc": "asalaamu caleykum",
+    "wcs": "wacaleykum salaam",
+    "fcn": "fiican",
+    "xld": "xaaladda ka waran",
+    "kwrn": "kawaran",
+    "scw": "salalaahu caleyhi wa salam",
+    "alx": "alxamdu lilaahi",
+    "m.a": "maasha allah",
+    "sthy": "side tahey",
+    "sxp": "saaxiib"
+}
+# English names mapped to the Somali spelling normalize_text substitutes for
+# them. Keys must be lowercase: normalize_text lowercases both the text and the
+# matched word before the lookup, so a capitalised key can never be found.
+country_map = {
+    "somalia": "Soomaaliya",
+    "ethiopia": "Itoobiya",
+    "kenya": "Kenya",
+    "djibouti": "Jabuuti",
+    "sudan": "Suudaan",
+    "yemen": "yemaan",
+    "uganda": "Ugaandha",
+    "tanzania": "Tansaaniya",
+    "egypt": "Masar",
+    "libya": "Liibiya",
+    "algeria": "Aljeeriya",
+    "morocco": "Morooko",
+    "tunisia": "Tuniisiya",
+    "eritrea": "Eriteriya",
+    "malawi": "Malaawi",
+    "english": "ingiriis",
+    "spain": "isbeen",
+    "brazil": "baraasiil",
+    "niger": "Niyjer",
+    "italy": "itaaliya",
+    "united states": "Maraykanka",
+    "china": "Shiinaha",
+    "india": "Hindiya",
+    "russia": "Ruushka",
+    "saudi arabia": "Sucuudi Carabiya",
+    "germany": "Jarmalka",
+    "france": "Faransiiska",
+    "japan": "Jabaan",
+    "canada": "Kanada",
+    "australia": "Australia",
+}
+def number_to_words(number):
+    """Spell out an integer from 0 to 999,999,999 in Somali.
+
+    Args:
+        number: An int, or any value ``int()`` accepts.
+
+    Returns:
+        The wording assembled from ``number_words``. Every group (tens,
+        hundreds, thousands, millions) is joined to its remainder with
+        "iyo", e.g. 21 -> "labaatan iyo koow" and
+        1000001 -> "milyan iyo koow". Values outside the supported range
+        are returned as their decimal digits.
+    """
+    number = int(number)
+    # Out-of-range values fall back to plain digits instead of raising.
+    if number < 0 or number >= 1000000000:
+        return str(number)
+    if number < 20:
+        return number_words[number]
+    if number < 100:
+        tens, unit = divmod(number, 10)
+        words = number_words[tens * 10]
+        if unit:
+            words += " iyo " + number_words[unit]
+        return words
+    # Hundreds, thousands and millions share one shape: an optional count,
+    # the group word, then "iyo" plus the spelled-out remainder. A count of
+    # one is left implicit ("boqol", "kun", "milyan").
+    for size, name in ((1000000, "milyan"), (1000, "kun"), (100, "boqol")):
+        if number >= size:
+            count, remainder = divmod(number, size)
+            words = name if count == 1 else number_to_words(count) + " " + name
+            if remainder:
+                words += " iyo " + number_to_words(remainder)
+            return words
 def replace_numbers_with_words(text):
+    """Return ``text`` with every word-bounded run of digits replaced by ``number_to_words``."""
+    digit_run = re.compile(r'\b\d+\b')
+    return digit_run.sub(lambda found: number_to_words(int(found.group())), text)
 def normalize_text(text):
+    """Prepare raw input text for the TTS processor.
+
+    Steps, in order: lowercase the text; strip thousands separators and
+    decimal parts from numbers; spell the numbers out; expand the
+    abbreviations in ``shortcut_map``; substitute the names in
+    ``country_map``; replace ``$ = + #`` with words; drop every remaining
+    character that is neither a word character nor whitespace.
+
+    Args:
+        text: The user-supplied string.
+
+    Returns:
+        The normalized string.
+    """
     text = text.lower()
+    # These two passes must run while the digits are still digits. After the
+    # numbers are spelled out there is nothing left for them to match, and
+    # "1,000" would already have been read as "1" and "000".
+    text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
+    text = re.sub(r'\.\d+', '', text)
     text = replace_numbers_with_words(text)
+    def substitute(table, value):
+        # Compare keys in lowercase: the text was lowercased above, so a key
+        # written as "Spain" would otherwise never be found.
+        lookup = {key.lower(): phrase for key, phrase in table.items()}
+        pattern = re.compile(
+            r'\b(' + '|'.join(re.escape(key) for key in lookup) + r')\b',
+            re.IGNORECASE,
+        )
+        def replace(match):
+            word = match.group(0).lower()
+            return lookup.get(word, word)
+        return pattern.sub(replace, value)
+    text = substitute(shortcut_map, text)
+    text = substitute(country_map, text)
+    symbol_map = {
+        '$': 'doolar',
+        '=': 'egwal',
+        '+': 'balaas',
+        '#': 'haash'
+    }
+    # Pad each replacement with spaces so it does not fuse with its neighbours.
+    for sym, word in symbol_map.items():
+        text = text.replace(sym, ' ' + word + ' ')
     text = re.sub(r'[^\w\s]', '', text)
     return text
+def text_to_speech(text):
+    """Synthesize speech for ``text`` and return ``(16000, samples)``.
+
+    The text is passed through ``normalize_text``, encoded by ``processor``
+    and handed to ``model.generate_speech`` together with the module-level
+    ``speaker_embedding`` and ``vocoder``. The generated tensor is moved to
+    the CPU and converted to a NumPy array.
+    """
+    cleaned = normalize_text(text)
+    encoded = processor(text=cleaned, return_tensors="pt").to(device)
+    with torch.no_grad():
+        speaker = speaker_embedding.unsqueeze(0)
+        waveform = model.generate_speech(encoded["input_ids"], speaker, vocoder=vocoder)
+    samples = waveform.cpu().numpy()
+    return (16000, samples)
+# Gradio UI: a single text box as input and an audio player as output, both
+# wired to text_to_speech. The audio component is declared with type="numpy".
 iface = gr.Interface(
+    fn=text_to_speech,
+    inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
+    outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
+    title="Somali TTS",
+    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (1.wav)"
 )
+# Start the app.
 iface.launch()