Spaces:

nexusbert
/

milestone3

Sleeping

App Files Files Community

nexusbert committed on Oct 10, 2025

Commit

32ad752

1 Parent(s): 63703a0

push

Browse files

Files changed (1) hide show

app.py +107 -34

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import requests
 import torch
 import numpy as np
 import soundfile as sf
 from fastapi import FastAPI, File, UploadFile, HTTPException, Form
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -164,6 +165,41 @@ def _run_mms(model: Wav2Vec2ForCTC, proc: Wav2Vec2Processor, audio_array: np.nda
         logging.exception("MMS ASR inference failed")
         return ""
 def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.ndarray:
     try:
         with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as in_file:
@@ -194,40 +230,77 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
 def speech_to_text(audio_data: bytes) -> str:
-    audio_array = preprocess_audio_ffmpeg(audio_data)
-    candidates = []
-    mms_result = _get_mms()
-    if mms_result and mms_result[0] is not None and mms_result[1] is not None:
-        mms_model, mms_proc = mms_result
-        mms_text = _run_mms(mms_model, mms_proc, audio_array)
-        if mms_text:
-            candidates.append(("mms", mms_text))
-            logger.info(f"MMS result: '{mms_text}'")
-    igbo_result = _get_igbo_asr()
-    if igbo_result[0] is not None and igbo_result[1] is not None:
-        igbo_model, igbo_proc = igbo_result
-        igbo_text = _run_whisper(igbo_model, igbo_proc, audio_array, language="igbo")
-        if igbo_text:
-            candidates.append(("igbo", igbo_text))
-            logger.info(f"Igbo ASR result: '{igbo_text}'")
-    for model_name, text in candidates:
-        detected_lang = detect_language(text)
-        if detected_lang == "ig" and model_name == "igbo":
-            logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
-            return text
-        elif detected_lang in ["ha", "yo", "en"] and model_name == "mms":
-            logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
-            return text
-    if candidates:
-        best_text = max((t for _, t in candidates), key=lambda s: len(s or ""))
-        logger.info(f"Using best result by length: '{best_text}'")
-        return best_text
-    return ""
 def get_ai_response(text: str, response_language: str = None) -> str:

 import torch
 import numpy as np
 import soundfile as sf
+import torchaudio
 from fastapi import FastAPI, File, UploadFile, HTTPException, Form
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
         logging.exception("MMS ASR inference failed")
         return ""
+def chunk_audio(audio_data: bytes, chunk_len: int = 15) -> list:
+    """Split audio into fixed-length 16 kHz mono WAV chunk files.
+
+    The bytes are written to a temporary file, decoded with torchaudio,
+    resampled to 16 kHz when the source rate differs, averaged down to a
+    single channel, and cut into consecutive pieces of ``chunk_len`` seconds
+    (the final piece may be shorter).
+
+    Args:
+        audio_data: Encoded audio file contents.
+        chunk_len: Length of each chunk in seconds.
+
+    Returns:
+        Paths of the temporary ``.wav`` chunk files, in order. The caller is
+        responsible for deleting them.
+
+    Raises:
+        HTTPException: Status 400 if any step of decoding or chunking fails.
+    """
+    target_sr = 16000
+    in_path = None
+    chunks = []
+    try:
+        with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as in_file:
+            # Record the path before writing so a failed write is still cleaned up.
+            in_path = in_file.name
+            in_file.write(audio_data)
+        waveform, sr = torchaudio.load(in_path)
+        if sr != target_sr:
+            waveform = torchaudio.functional.resample(waveform, sr, target_sr)
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+        num_samples = waveform.size(1)
+        chunk_size = target_sr * chunk_len
+        for index, start in enumerate(range(0, num_samples, chunk_size)):
+            # mkstemp creates the file atomically; the deprecated mktemp only
+            # returns a name and is open to a race with other processes.
+            fd, chunk_path = tempfile.mkstemp(suffix=f"_chunk_{index}.wav")
+            os.close(fd)
+            # Track the path before saving so a failed save is cleaned up too.
+            chunks.append(chunk_path)
+            torchaudio.save(chunk_path, waveform[:, start:start + chunk_size], target_sr)
+        return chunks
+    except Exception as e:
+        # Best-effort removal of chunks written before the failure; the
+        # original error is what gets reported to the caller.
+        for chunk_path in chunks:
+            try:
+                os.unlink(chunk_path)
+            except OSError:
+                pass
+        logger.error(f"Audio chunking failed: {e}")
+        raise HTTPException(status_code=400, detail="Audio chunking failed.") from e
+    finally:
+        # The input copy is no longer needed whether chunking succeeded or not.
+        if in_path is not None:
+            try:
+                os.unlink(in_path)
+            except OSError as cleanup_err:
+                logger.warning(f"Could not remove temp file {in_path}: {cleanup_err}")
 def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.ndarray:
     try:
         with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as in_file:
 def speech_to_text(audio_data: bytes) -> str:
+    """Transcribe audio by running the ASR models over 15-second chunks.
+
+    The audio is split with ``chunk_audio``; each available model (MMS and
+    the Igbo Whisper model) transcribes every chunk and its per-chunk texts
+    are joined with spaces. A model's transcript is returned when the
+    language detected in it matches that model ("ig" for the Igbo model;
+    "ha", "yo" or "en" for MMS); otherwise the longest transcript wins.
+
+    Args:
+        audio_data: Encoded audio file contents.
+
+    Returns:
+        The selected transcript, or "" when nothing could be transcribed or
+        any step raised.
+    """
+    chunks = []
+    try:
+        chunks = chunk_audio(audio_data, chunk_len=15)
+        logger.info(f"Split audio into {len(chunks)} chunks")
+        candidates = []
+        mms_result = _get_mms()
+        if mms_result and mms_result[0] is not None and mms_result[1] is not None:
+            mms_model, mms_proc = mms_result
+            mms_text = _transcribe_chunks(
+                chunks,
+                lambda audio: _run_mms(mms_model, mms_proc, audio),
+                "MMS",
+            )
+            if mms_text:
+                candidates.append(("mms", mms_text))
+                logger.info(f"MMS result: '{mms_text}'")
+        igbo_result = _get_igbo_asr()
+        if igbo_result[0] is not None and igbo_result[1] is not None:
+            igbo_model, igbo_proc = igbo_result
+            igbo_text = _transcribe_chunks(
+                chunks,
+                lambda audio: _run_whisper(igbo_model, igbo_proc, audio, language="igbo"),
+                "Igbo ASR",
+            )
+            if igbo_text:
+                candidates.append(("igbo", igbo_text))
+                logger.info(f"Igbo ASR result: '{igbo_text}'")
+        for model_name, text in candidates:
+            detected_lang = detect_language(text)
+            if detected_lang == "ig" and model_name == "igbo":
+                logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
+                return text
+            elif detected_lang in ["ha", "yo", "en"] and model_name == "mms":
+                logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
+                return text
+        if candidates:
+            best_text = max((t for _, t in candidates), key=lambda s: len(s or ""))
+            logger.info(f"Using best result by length: '{best_text}'")
+            return best_text
+        return ""
+    except Exception as e:
+        logger.error(f"Speech-to-text chunking failed: {e}")
+        return ""
+    finally:
+        # Remove the chunk files on every exit path, including failures, so
+        # temporary files do not accumulate on disk.
+        for chunk_path in chunks:
+            try:
+                os.unlink(chunk_path)
+            except OSError as cleanup_err:
+                logger.warning(f"Could not remove chunk file {chunk_path}: {cleanup_err}")
+def _transcribe_chunks(chunk_paths, transcribe, label):
+    """Transcribe each chunk file and join the non-empty texts with spaces.
+
+    Args:
+        chunk_paths: Paths of the audio chunk files, in order.
+        transcribe: Callable taking a 1-D numpy audio array and returning text.
+        label: Model name used in the warning logged when a chunk fails.
+
+    Returns:
+        The joined transcript, stripped; "" when no chunk produced text.
+    """
+    pieces = []
+    for chunk_path in chunk_paths:
+        try:
+            waveform, _ = torchaudio.load(chunk_path)
+            # chunk_audio writes single-channel files; squeeze(0) drops only
+            # that channel axis, so a one-sample chunk stays 1-D.
+            chunk_text = transcribe(waveform.squeeze(0).numpy())
+        except Exception as e:
+            # A bad chunk is skipped so the remaining chunks still contribute.
+            logger.warning(f"{label} chunk processing failed: {e}")
+            continue
+        if chunk_text:
+            pieces.append(chunk_text)
+    return " ".join(pieces).strip()
 def get_ai_response(text: str, response_language: str = None) -> str: