Update app.py
app.py CHANGED
@@ -1,153 +1,210 @@
 import os
 import re
-import
 import torch
-import numpy as np
 import torchaudio
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.inference import EncoderClassifier

-# ─── Setup ─────────────────────────────────────────────────────────────────

-app = FastAPI(
 device = "cuda" if torch.cuda.is_available() else "cpu"

-# Use Hugging Face writable directory
-EMBEDDING_DIR = "/tmp/speaker_embeddings"
-os.makedirs(EMBEDDING_DIR, exist_ok=True)

-#
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
-    savedir="/tmp/
 )

-speaker_embeddings_cache = {}

-# ─── Embedding Function ────────────────────────────────────────────────────

-def get_speaker_embedding(wav_file_path):
-    if wav_file_path in speaker_embeddings_cache:
-        return speaker_embeddings_cache[wav_file_path]

-    embedding_path = os.path.join(EMBEDDING_DIR, os.path.basename(wav_file_path) + ".pt")
-    if os.path.exists(embedding_path):
-        embedding = torch.load(embedding_path, map_location=device)
-        speaker_embeddings_cache[wav_file_path] = embedding
-        return embedding

-    if not os.path.exists(wav_file_path):
-        raise HTTPException(status_code=404, detail=f"Voice file not found: {wav_file_path}")

-    audio, sr = torchaudio.load(wav_file_path)
-    if sr != 16000:
-        audio = torchaudio.functional.resample(audio, sr, 16000)
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)

     with torch.no_grad():

-    speaker_embeddings_cache[wav_file_path] = embedding.to(device)
-    return embedding.to(device)

-# ─── Text Normalization Functions ──────────────────────────────────────────

 number_words = {
-    0: "eber",
 }

 def number_to_words(n):

 def replace_numbers_with_words(text):
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)

 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
     text = re.sub(r'[^\w\s\']', '', text)
     return text

-def split_long_text_into_chunks(text, max_words=18):
-    words = text.split()
-    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

-@app.get("/")
-async def root():
-    return {"message": "Welcome to Somali Multi-Voice TTS API"}

-    try:
-        speaker_embedding = get_speaker_embedding(voice_file)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))

     audio_chunks = []

             continue

             audio_chunks.append(audio)

     final_audio = np.concatenate(audio_chunks)

-    return StreamingResponse(buffer, media_type="audio/wav", headers={"Content-Disposition": "inline; filename=tts_output.wav"})
 import os
 import re
+import uuid
 import torch
 import torchaudio
+import soundfile as sf
+import numpy as np
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from speechbrain.inference.speaker import EncoderClassifier

+app = FastAPI()
 device = "cuda" if torch.cuda.is_available() else "cpu"
+CACHE_DIR = "/tmp/hf-cache"

+# Load models (female only)
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
+model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)

+# Speaker encoder
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
+    savedir="/tmp/spk_model"
 )

+# Load female embedding only
+def get_embedding(wav_path, pt_path):
+    if os.path.exists(pt_path):
+        return torch.load(pt_path).to(device)
+    audio, sr = torchaudio.load(wav_path)
+    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
+        emb = speaker_model.encode_batch(audio)
+    emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
+    torch.save(emb.cpu(), pt_path)
+    return emb

+embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
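
For reference, the x-vector encoder used above produces a 512-dimensional speaker embedding, which is why the code later calls .unsqueeze(0) to form the (1, 512) batch shape SpeechT5 expects. An illustrative sanity check (not part of the commit):

# Illustrative check: the cached x-vector should be a flat 512-dim tensor.
assert embedding_female.shape == (512,)
assert embedding_female.unsqueeze(0).shape == (1, 512)  # batch shape passed to generate/generate_speech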

+# Number words dictionary (Somali)
 number_words = {
+    0: "eber",
+    1: "kow",
+    2: "laba",
+    3: "saddex",
+    4: "afar",
+    5: "shan",
+    6: "lix",
+    7: "toddoba",
+    8: "siddeed",
+    9: "sagaal",
+    10: "toban",
+    11: "kow iyo toban",
+    12: "laba iyo toban",
+    13: "saddex iyo toban",
+    14: "afar iyo toban",
+    15: "shan iyo toban",
+    16: "lix iyo toban",
+    17: "toddoba iyo toban",
+    18: "siddeed iyo toban",
+    19: "sagaal iyo toban",
+    20: "labaatan",
+    30: "soddon",
+    40: "afaratan",
+    50: "konton",
+    60: "lixdan",
+    70: "toddobaatan",
+    80: "siddeetan",
+    90: "sagaashan",
+    100: "boqol",
+    1000: "kun"
 }

 def number_to_words(n):
+    try:
+        if n in number_words:
+            return number_words[n]
+        if n < 100:
+            return number_words[n // 10 * 10] + (" iyo " + number_words[n % 10] if n % 10 else "")
+        if n < 1000:
+            return (number_words[n // 100] + " boqol" if n // 100 > 1 else "boqol") + (
+                " iyo " + number_to_words(n % 100) if n % 100 else "")
+        if n < 1_000_000:
+            return (number_to_words(n // 1000) + " kun" if n // 1000 > 1 else "kun") + (
+                " iyo " + number_to_words(n % 1000) if n % 1000 else "")
+        if n < 1_000_000_000:
+            return (number_to_words(n // 1_000_000) + " milyan" if n // 1_000_000 > 1 else "milyan") + (
+                " iyo " + number_to_words(n % 1_000_000) if n % 1_000_000 else "")
+        return str(n)
+    except Exception as e:
+        print(f"Error converting number {n}: {e}")
+        return str(n)
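
A few outputs implied by the mapping and branches above, traced by hand as illustrative checks:

# Hand-traced expectations for number_to_words (illustrative):
assert number_to_words(25) == "labaatan iyo shan"
assert number_to_words(142) == "boqol iyo afaratan iyo laba"
assert number_to_words(2024) == "laba kun iyo labaatan iyo afar"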

 def replace_numbers_with_words(text):
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)

 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
     text = re.sub(r'[^\w\s\']', '', text)
     return text

+def split_into_sentences(text):
+    sentence_endings = re.compile(r'(?<=[.!?])\s+')
+    sentences = sentence_endings.split(text)
+    return [s.strip() for s in sentences if s.strip()]
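
Hand-traced examples of the two helpers above (illustrative only): numbers are spelled out, punctuation is stripped, and sentences split on whitespace after ., ! or ?:

# Illustrative expectations:
assert normalize_text("Waxaan arkay 3 shimbirood!") == "waxaan arkay saddex shimbirood"
assert split_into_sentences("Si fiican. Mahadsanid!") == ["Si fiican.", "Mahadsanid!"]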

+def get_speaker_embedding(voice_choice):
+    # For now we only have female embedding loaded
+    # If you have male embedding, load it and select here based on voice_choice
+    return embedding_female

+def text_to_speech(text, voice_choice):
+    if not text or not voice_choice:
+        # gr.Warning() is undefined in this context - replace or remove as needed
+        print("Fadlan geli qoraal oo dooro cod.")
+        return None
+
+    speaker_embedding = get_speaker_embedding(voice_choice)
+
+    paragraphs = text.strip().split("\n")
     audio_chunks = []
+
+    for para_idx, para in enumerate(paragraphs):
+        para = para.strip()
+        if not para:
             continue
+
+        sentences = split_into_sentences(para)
+
+        for sent_idx, sentence in enumerate(sentences):
+            norm_sentence = normalize_text(sentence)
+            inputs = processor(text=norm_sentence, return_tensors="pt").to(device)
+
+            with torch.no_grad():
+                speech = model_female.generate(
+                    input_ids=inputs["input_ids"],
+                    speaker_embeddings=speaker_embedding.unsqueeze(0),
+                    do_sample=True,
+                    top_k=50,
+                    temperature=0.75,
+                    repetition_penalty=1.2,
+                    max_new_tokens=512
+                )
+            audio = vocoder(speech).cpu().squeeze().numpy()
+
             audio_chunks.append(audio)
+
+            # Pause 0.5s after each sentence except last
+            if sent_idx < len(sentences) - 1:
+                pause = np.zeros(int(16000 * 0.5))
+                audio_chunks.append(pause)
+
+        # Pause 0.8s after each paragraph except last
+        if para_idx < len(paragraphs) - 1:
+            para_pause = np.zeros(int(16000 * 0.8))
+            audio_chunks.append(para_pause)

     final_audio = np.concatenate(audio_chunks)
+    return (16000, final_audio)
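
text_to_speech returns a (sample_rate, waveform) pair rather than writing a file. A minimal local usage sketch, assuming app.py is importable as app with its models and "caasho.wav" available; the "female" argument is a placeholder, since only its truthiness is checked:

# Illustrative local usage (hypothetical import path):
from app import text_to_speech
import soundfile as sf

result = text_to_speech("Salaan. Waxaan rabaa 2 shaah.", "female")
if result is not None:
    sr, wav = result              # sr == 16000, wav is a NumPy float array
    sf.write("demo.wav", wav, sr)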
+
+class TTSRequest(BaseModel):
+    text: str
+
+@app.post("/speak")
+def speak(payload: TTSRequest):
+    clean_text = normalize_text(payload.text)
+    inputs = processor(text=clean_text, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        waveform = model_female.generate_speech(
+            input_ids=inputs["input_ids"],
+            speaker_embeddings=embedding_female.unsqueeze(0),
+            vocoder=vocoder
+        )
+
+    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
+    sf.write(out_path, waveform.cpu().numpy(), 16000)
+    return FileResponse(out_path, media_type="audio/wav", filename="voice.wav")
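
With the app running, /speak accepts a JSON body matching TTSRequest and returns a WAV file. A minimal client sketch, with a placeholder base URL:

# Hypothetical client for the /speak endpoint; BASE_URL is a placeholder.
import requests

BASE_URL = "http://localhost:7860"  # replace with the Space's own URL

resp = requests.post(f"{BASE_URL}/speak", json={"text": "Subax wanaagsan. Waxaan rabaa 3 shaah."})
resp.raise_for_status()
with open("voice.wav", "wb") as f:
    f.write(resp.content)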