Add voice chat feature with audio storage
main.py
CHANGED
@@ -15,6 +15,10 @@ import requests
 import soundfile as sf
 import subprocess
 import imageio_ffmpeg
+import uuid
+import time
+import threading
+from pathlib import Path
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -42,6 +46,51 @@ for _p in ["/tmp/huggingface", "/tmp/models", "/tmp/hf_asr"]:
 
 ASK_URL = os.getenv("ASK_URL", "https://remostart-farmlingua-ai-conversational.hf.space/ask")
 
+AUDIO_STORAGE_DIR = Path("/tmp/voice_chat_audio")
+AUDIO_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
+AUDIO_EXPIRY_SECONDS = 3600
+
+audio_registry: Dict[str, Dict[str, Any]] = {}
+audio_registry_lock = threading.Lock()
+
+
+def cleanup_expired_audio():
+    now = time.time()
+    expired_ids = []
+    with audio_registry_lock:
+        for audio_id, info in audio_registry.items():
+            if now - info["created_at"] > AUDIO_EXPIRY_SECONDS:
+                expired_ids.append(audio_id)
+        for audio_id in expired_ids:
+            info = audio_registry.pop(audio_id, None)
+            if info and os.path.exists(info["path"]):
+                try:
+                    os.unlink(info["path"])
+                except Exception:
+                    pass
+
+
+def store_audio(audio_data: bytes, suffix: str = ".wav") -> str:
+    cleanup_expired_audio()
+    audio_id = str(uuid.uuid4())
+    file_path = AUDIO_STORAGE_DIR / f"{audio_id}{suffix}"
+    with open(file_path, "wb") as f:
+        f.write(audio_data)
+    with audio_registry_lock:
+        audio_registry[audio_id] = {
+            "path": str(file_path),
+            "created_at": time.time()
+        }
+    return audio_id
+
+
+def get_audio_path(audio_id: str) -> Optional[str]:
+    with audio_registry_lock:
+        info = audio_registry.get(audio_id)
+        if info and os.path.exists(info["path"]):
+            return info["path"]
+    return None
+
 asr_models = {
     "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
     "yo": {"repo": "NCAIR1/Yoruba-ASR", "model": None, "proc": None},
@@ -76,6 +125,13 @@ class SpeakRequest(BaseModel):
     repetition_penalty: float | None = 1.1
     max_length: int | None = 4000
 
+
+class VoiceChatResponse(BaseModel):
+    user_transcription: str
+    user_audio_id: str
+    ai_response: str
+    ai_audio_id: str
+
 def load_audio_tokenizer():
     global audio_tokenizer
 
@@ -367,6 +423,147 @@ def _map_lang_code(code: str) -> str:
     m = {"yo": "yoruba", "ha": "hausa", "ig": "igbo", "en": "english"}
     return m.get(code.lower(), "english")
 
+
+@app.get("/audio/{audio_id}")
+async def get_audio(audio_id: str):
+    file_path = get_audio_path(audio_id)
+    if not file_path:
+        raise HTTPException(status_code=404, detail="Audio not found or expired")
+    return FileResponse(
+        file_path,
+        media_type="audio/webm" if file_path.endswith(".webm") else "audio/wav",
+        filename=os.path.basename(file_path)
+    )
+
+
+@app.post("/voice-chat", response_model=VoiceChatResponse)
+async def voice_chat(audio_file: UploadFile = File(...), language: str = Form(...)):
+    global model, audio_tokenizer
+
+    if language not in ["yo", "ha", "ig", "en"]:
+        raise HTTPException(status_code=400, detail="Language must be one of: yo, ha, ig, en")
+
+    audio_bytes = await audio_file.read()
+    user_audio_id = store_audio(audio_bytes, suffix=".webm")
+
+    audio_array = _preprocess_audio_ffmpeg(audio_bytes)
+    model_asr, proc = _get_asr(language)
+    if model_asr is None or proc is None:
+        raise HTTPException(status_code=500, detail="ASR model not available")
+
+    try:
+        device_t = next(model_asr.parameters()).device
+        inputs = proc(audio_array, sampling_rate=16000, return_tensors="pt")
+        input_features = inputs.input_features.to(device_t)
+        with torch.no_grad():
+            pred_ids = model_asr.generate(input_features)
+        text_list = proc.batch_decode(pred_ids, skip_special_tokens=True)
+        user_transcription = text_list[0].strip() if text_list else ""
+    except Exception as e:
+        logger.error(f"ASR inference failed: {e}")
+        raise HTTPException(status_code=500, detail="ASR inference failed")
+
+    if not user_transcription:
+        raise HTTPException(status_code=400, detail="Could not transcribe audio")
+
+    try:
+        ans = requests.post(ASK_URL, json={"query": user_transcription}, timeout=30)
+        ans.raise_for_status()
+        ai_response = ans.json().get("answer", "")
+        if not ai_response:
+            ai_response = "I'm sorry, I couldn't generate a response."
+    except Exception as e:
+        logger.warning(f"Ask failed ({e}); using fallback response")
+        ai_response = "I'm sorry, I'm having trouble connecting. Please try again."
+
+    if model is None:
+        logger.info("Loading YarnGPT2 model (lazy loading)...")
+        load_model()
+    if audio_tokenizer is None:
+        logger.info("Loading audio tokenizer (lazy loading)...")
+        load_audio_tokenizer()
+
+    if model is None or audio_tokenizer is None:
+        raise HTTPException(status_code=503, detail="TTS model loading failed")
+
+    tts_language = _map_lang_code(language)
+    default_speakers = {
+        "english": "idera",
+        "yoruba": "yoruba_male2",
+        "igbo": "igbo_male2",
+        "hausa": "hausa_female1",
+    }
+    speaker = default_speakers.get(tts_language, "idera")
+
+    try:
+        prompt = audio_tokenizer.create_prompt(
+            ai_response,
+            lang=tts_language,
+            speaker_name=speaker,
+        )
+        tokenized = audio_tokenizer.tokenize_prompt(prompt)
+        if isinstance(tokenized, torch.Tensor):
+            input_ids = tokenized
+            attention_mask = None
+        else:
+            input_ids = tokenized.get("input_ids", tokenized)
+            attention_mask = tokenized.get("attention_mask", None)
+
+        if hasattr(audio_tokenizer, 'tokenizer') and audio_tokenizer.tokenizer.pad_token is None:
+            audio_tokenizer.tokenizer.pad_token = audio_tokenizer.tokenizer.eos_token
+
+        with torch.no_grad():
+            gen_kwargs = {
+                "input_ids": input_ids,
+                "repetition_penalty": 1.1,
+                "max_length": 4000,
+            }
+            if attention_mask is not None:
+                gen_kwargs["attention_mask"] = attention_mask
+
+            use_beams = tts_language in ["yoruba", "igbo", "hausa"]
+            if use_beams:
+                gen_kwargs["num_beams"] = 5
+                gen_kwargs["early_stopping"] = False
+            else:
+                gen_kwargs["do_sample"] = True
+                gen_kwargs["temperature"] = 0.1
+
+            output = model.generate(**gen_kwargs)
+
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+
+        if isinstance(audio, torch.Tensor):
+            audio_tensor = audio.detach()
+        else:
+            audio_tensor = torch.tensor(np.asarray(audio))
+        audio_tensor = audio_tensor.to(torch.float32).cpu()
+        if audio_tensor.ndim > 1:
+            audio_tensor = audio_tensor.squeeze()
+        peak = audio_tensor.abs().max()
+        if peak > 1.0:
+            audio_tensor = audio_tensor / peak
+
+        buffer = io.BytesIO()
+        torchaudio.save(buffer, audio_tensor.unsqueeze(0), 24000, format="wav")
+        buffer.seek(0)
+        ai_audio_bytes = buffer.read()
+
+        ai_audio_id = store_audio(ai_audio_bytes, suffix=".wav")
+
+    except Exception as e:
+        logger.error(f"TTS generation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"TTS generation failed: {e}")
+
+    return VoiceChatResponse(
+        user_transcription=user_transcription,
+        user_audio_id=user_audio_id,
+        ai_response=ai_response,
+        ai_audio_id=ai_audio_id
+    )
+
+
 @app.post("/tts")
 async def text_to_speech(request: TTSRequest):
     global model, audio_tokenizer
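
For reviewers, a minimal sketch of how a client could exercise the two new endpoints end to end. The base URL, port, file name, and language code below are illustrative assumptions, not part of this change; the multipart field names (audio_file, language) and the response keys match the diff above.

import requests

BASE = "http://localhost:7860"  # assumed local dev address; adjust to your deployment

# POST a recorded clip to /voice-chat; language must be one of yo, ha, ig, en.
with open("question.webm", "rb") as f:
    resp = requests.post(
        f"{BASE}/voice-chat",
        files={"audio_file": ("question.webm", f, "audio/webm")},
        data={"language": "yo"},
        timeout=120,  # ASR + TTS can be slow on the first call due to lazy model loading
    )
resp.raise_for_status()
body = resp.json()
print("Transcription:", body["user_transcription"])
print("AI response:", body["ai_response"])

# GET the synthesized reply by id; ids expire after AUDIO_EXPIRY_SECONDS (one hour).
audio = requests.get(f"{BASE}/audio/{body['ai_audio_id']}", timeout=30)
audio.raise_for_status()
with open("reply.wav", "wb") as out:
    out.write(audio.content)

One design note on the storage added here: audio_registry is an in-process dict guarded by a threading.Lock, and the files live under /tmp/voice_chat_audio, so an audio id resolves only on the worker that created it and does not survive a restart. Clients should fetch audio promptly rather than persisting ids.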