Spaces:

Somalitts
/

caash_api

No application file

App Files Files Community

Somalitts committed on Jul 19, 2025

Commit

9d4c38e

verified ·

1 Parent(s): acbd995

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -66

app.py CHANGED Viewed

@@ -4,92 +4,180 @@ import uuid
 import torch
 import torchaudio
 import soundfile as sf
-from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.inference.speaker import EncoderClassifier
-app = FastAPI()
 device = "cuda" if torch.cuda.is_available() else "cpu"
-CACHE_DIR = "/tmp/hf-cache"
-# Load models
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
-model_male = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad", cache_dir=CACHE_DIR).to(device)
-model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
-# Speaker encoder
-speaker_model = EncoderClassifier.from_hparams(
-    source="speechbrain/spkrec-xvect-voxceleb",
-    run_opts={"device": device},
-    savedir="/tmp/spk_model"
-)
-# Load speaker embeddings
-def get_embedding(wav_path, pt_path):
-    if os.path.exists(pt_path):
-        return torch.load(pt_path).to(device)
-    audio, sr = torchaudio.load(wav_path)
-    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
-    with torch.no_grad():
-        emb = speaker_model.encode_batch(audio)
-        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
-    torch.save(emb.cpu(), pt_path)
-    return emb
-embedding_male = get_embedding("Hussein.wav", "/tmp/male_embedding.pt")
-embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
-# Text normalization
 number_words = {
-    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
-    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
-    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
-    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
-    100: "boqol", 1000: "kun"
 }
-def number_to_words(n):
-    if n < 20:
-        return number_words.get(n, str(n))
-    elif n < 100:
-        tens, unit = divmod(n, 10)
-        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
-    elif n < 1000:
-        hundreds, rem = divmod(n, 100)
-        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" " + number_to_words(rem) if rem else "")
-    elif n < 1_000_000:
-        th, rem = divmod(n, 1000)
-        return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
-    else:
-        return str(n)
 def replace_numbers_with_words(text):
-    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
-    text = re.sub(r'[^\w\s]', '', text)
     return text
-# API request schema
 class TTSRequest(BaseModel):
     text: str
-    voice: str  # "Male" or "Female"
-@app.post("/speak")
-def speak(payload: TTSRequest):
-    clean_text = normalize_text(payload.text)
-    inputs = processor(text=clean_text, return_tensors="pt").to(device)
-    model = model_male if payload.voice.lower() == "male" else model_female
-    embedding = embedding_male if payload.voice.lower() == "male" else embedding_female
-    with torch.no_grad():
-        waveform = model.generate_speech(inputs["input_ids"], embedding.unsqueeze(0), vocoder=vocoder)
-    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
-    sf.write(out_path, waveform.cpu().numpy(), 16000)
-    return FileResponse(out_path, media_type="audio/wav", filename="voice.wav")

 import torch
 import torchaudio
 import soundfile as sf
+from fastapi import FastAPI, HTTPException, BackgroundTasks
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
+import logging
+import tempfile
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from speechbrain.pretrained import EncoderClassifier
+# --- Setup and configuration ---
+logging.basicConfig(level=logging.INFO)
+app = FastAPI(title="Multi-Voice Somali Text-to-Speech API")
+# Select the compute device (GPU when available, otherwise CPU)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+logging.info(f"Using device: {device}")
+# Reference voice files (add your .wav files here).
+# Make sure these files sit in the same directory as this code.
+VOICE_SAMPLE_FILES = ["1.wav"]
+# Directory where computed speaker embeddings are saved as .pt files.
+EMBEDDING_DIR = "speaker_embeddings"
+os.makedirs(EMBEDDING_DIR, exist_ok=True)
+# --- Model handles (module-level globals, assigned in startup_event) ---
+processor = None
+model = None
+vocoder = None
+speaker_model = None
+# In-memory cache: reference wav path -> speaker embedding tensor.
+speaker_embeddings_cache = {}
+@app.on_event("startup")
+async def startup_event():
+    """Load the models once at application start and pre-cache voice embeddings.
+
+    Assigns the module-level ``processor``, ``model``, ``vocoder`` and
+    ``speaker_model`` handles, then computes (or loads) an embedding for
+    every file listed in ``VOICE_SAMPLE_FILES``.
+
+    Raises:
+        RuntimeError: if any model fails to load; the original error is
+            chained as the cause.
+        FileNotFoundError: if a reference audio file is missing.
+    """
+    global processor, model, vocoder, speaker_model
+    logging.info("Loading models...")
+    try:
+        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+        model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
+        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+        speaker_model = EncoderClassifier.from_hparams(
+            source="speechbrain/spkrec-xvect-voxceleb",
+            run_opts={"device": device},
+            savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
+        )
+        logging.info("Models loaded successfully.")
+    except Exception as e:
+        logging.error(f"Error loading models: {e}")
+        # Chain the cause so the startup traceback shows the real failure.
+        raise RuntimeError(f"Could not load models: {e}") from e
+    logging.info("Pre-caching speaker embeddings...")
+    for voice_file in VOICE_SAMPLE_FILES:
+        # Fail fast at startup rather than on the first request for this voice.
+        if not os.path.exists(voice_file):
+            raise FileNotFoundError(f"Reference audio file not found: {voice_file}. Make sure it's in the same directory.")
+        get_speaker_embedding(voice_file)
+    logging.info("Embeddings cached. Application is ready to serve requests.")
+def get_speaker_embedding(wav_file_path):
+    """Return the speaker embedding for a reference wav file, computing it if needed.
+
+    Lookup order: the in-memory cache, then ``<EMBEDDING_DIR>/<basename>.pt``
+    on disk, then a fresh computation with ``speaker_model``. A freshly
+    computed embedding is saved to disk and stored in the in-memory cache.
+
+    Args:
+        wav_file_path: Path of the reference audio file.
+
+    Returns:
+        The embedding tensor, placed on ``device``.
+
+    Raises:
+        FileNotFoundError: if no cached embedding exists and the wav file
+            is missing.
+        HTTPException: status 500 if the audio cannot be loaded or encoded.
+    """
+    if wav_file_path in speaker_embeddings_cache:
+        return speaker_embeddings_cache[wav_file_path]
+    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
+    if os.path.exists(embedding_path):
+        embedding = torch.load(embedding_path, map_location=device)
+        speaker_embeddings_cache[wav_file_path] = embedding
+        logging.info(f"Loaded cached embedding for {wav_file_path}")
+        return embedding
+    # Report a missing file as FileNotFoundError so callers can answer 404;
+    # otherwise it would be folded into the generic 500 below.
+    if not os.path.exists(wav_file_path):
+        raise FileNotFoundError(f"Reference audio file not found: {wav_file_path}")
+    try:
+        audio, sr = torchaudio.load(wav_file_path)
+        if sr != 16000:
+            audio = torchaudio.functional.resample(audio, sr, 16000)
+        # Down-mix multi-channel audio to a single channel.
+        if audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True)
+        with torch.no_grad():
+            embedding = speaker_model.encode_batch(audio.to(device))
+            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
+        # Persist a CPU copy so the file can be loaded onto any device later.
+        torch.save(embedding.cpu(), embedding_path)
+        embedding = embedding.to(device)
+        speaker_embeddings_cache[wav_file_path] = embedding
+        logging.info(f"Generated and cached new embedding for {wav_file_path}")
+        return embedding
+    except Exception as e:
+        logging.error(f"Could not process audio file {wav_file_path}. Error: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to process reference audio: {wav_file_path}") from e
+# --- Text normalization helpers (Text Processing) ---
+# (These work well as they were.)
 number_words = {
+    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
+    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
+    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
+    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
+    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
+    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
+    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
+    100: "boqol", 1000: "kun",
 }
+def number_to_words_recursive(n):
+    """Spell out an integer below one million using the ``number_words`` table.
+
+    Components are joined with " iyo ". A leading single hundred or single
+    thousand is written without a multiplier ("boqol", "kun"). Values of
+    1,000,000 or more are returned as their decimal digit string.
+    """
+    if n in number_words:
+        return number_words[n]
+    if n < 100:
+        tens, unit = divmod(n, 10)
+        return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
+    if n < 1000:
+        hundreds, rest = divmod(n, 100)
+        head = (number_to_words_recursive(hundreds) + " boqol") if hundreds > 1 else "boqol"
+        return head + (" iyo " + number_to_words_recursive(rest) if rest else "")
+    if n < 1000000:
+        thousands, rest = divmod(n, 1000)
+        # Mirror the hundreds branch and the table entry for 1000: a single
+        # thousand is "kun", not "kow kun".
+        head = (number_to_words_recursive(thousands) + " kun") if thousands > 1 else "kun"
+        return head + (" iyo " + number_to_words_recursive(rest) if rest else "")
+    return str(n)
 def replace_numbers_with_words(text):
+    """Replace each word-boundary-delimited run of digits in ``text`` with its spelled-out form."""
+    return re.sub(r'\b\d+\b', lambda m: number_to_words_recursive(int(m.group())), text)
 def normalize_text(text):
+    """Lower-case ``text``, spell out numbers, strip punctuation and collapse whitespace."""
     text = text.lower()
     text = replace_numbers_with_words(text)
+    # Drop everything except word characters, whitespace and apostrophes.
+    text = re.sub(r'[^\w\s\']', '', text)
+    # Collapse whitespace runs to a single space and trim both ends.
+    text = re.sub(r'\s+', ' ', text).strip()
     return text
+# --- API request schema (Pydantic model) ---
 class TTSRequest(BaseModel):
+    # Text to convert to speech.
     text: str
+    voice_choice: str = "1.wav"  # Default value used when the field is not sent
+# --- API endpoints ---
+@app.get("/voices", summary="Soo Hel Codadka La Heli Karo")
+async def get_available_voices():
+    """Return the list of reference voice files that are available."""
+    voices = VOICE_SAMPLE_FILES
+    return {"available_voices": voices}
+@app.post("/speak", summary="Abuur Cod Qoraal ka timid")
+async def text_to_speech_endpoint(payload: TTSRequest, background_tasks: BackgroundTasks):
+    """
+    Convert text into a .wav audio response.
+    - **text**: The text to turn into speech.
+    - **voice_choice**: The reference voice file to use (for example, "1.wav").
+    """
+    if not payload.text or not payload.text.strip():
+        raise HTTPException(status_code=400, detail="Qoraalku ma bannaanaan karo (Text cannot be empty).")
+    if payload.voice_choice not in VOICE_SAMPLE_FILES:
+        raise HTTPException(status_code=400, detail=f"Codka la doortay '{payload.voice_choice}' lama helin.")
+    try:
+        speaker_embedding = get_speaker_embedding(payload.voice_choice)
+    except FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=f"Faylka codka ee '{payload.voice_choice}' lama helin.") from e
+    normalized_text = normalize_text(payload.text)
+    # Input made only of punctuation normalizes to "", leaving nothing to speak.
+    if not normalized_text:
+        raise HTTPException(status_code=400, detail="Qoraalku ma bannaanaan karo (Text cannot be empty).")
+    logging.info(f"Generating speech for: '{normalized_text}' with voice '{payload.voice_choice}'")
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        speech = model.generate_speech(
+            inputs["input_ids"],
+            speaker_embedding.unsqueeze(0),
+            vocoder=vocoder
+        )
+    # Reserve a temporary path and close the handle before soundfile writes
+    # to it by name.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+        tmp_path = tmp_file.name
+    try:
+        sf.write(tmp_path, speech.cpu().numpy(), 16000)
+    except Exception:
+        # Do not leave an orphaned temp file behind when the write fails.
+        os.remove(tmp_path)
+        raise
+    # Delete the temp file after the response has been sent.
+    background_tasks.add_task(os.remove, tmp_path)
+    return FileResponse(
+        path=tmp_path,
+        media_type="audio/wav",
+        filename=f"{uuid.uuid4()}.wav"
+    )