Spaces:
Sleeping
Sleeping
push igbo model
Browse files
- Dockerfile: +2 −0
- app.py: +51 −8
Dockerfile
CHANGED
|
@@ -38,10 +38,12 @@ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(
|
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 39 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 40 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
|
|
|
| 41 |
&& find /models/huggingface -name '*.lock' -delete
|
| 42 |
|
| 43 |
# Preload tokenizers (avoid runtime delays)
|
| 44 |
RUN python -c "from transformers import Wav2Vec2Processor; Wav2Vec2Processor.from_pretrained('facebook/mms-1b-all')" \
|
|
|
|
| 45 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
| 46 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
|
| 47 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
|
|
|
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 39 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 40 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
| 41 |
+
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='NCAIR1/Igbo-ASR')" \
|
| 42 |
&& find /models/huggingface -name '*.lock' -delete
|
| 43 |
|
| 44 |
# Preload tokenizers (avoid runtime delays)
|
| 45 |
RUN python -c "from transformers import Wav2Vec2Processor; Wav2Vec2Processor.from_pretrained('facebook/mms-1b-all')" \
|
| 46 |
+
&& python -c "from transformers import WhisperProcessor; WhisperProcessor.from_pretrained('NCAIR1/Igbo-ASR')" \
|
| 47 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
| 48 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
|
| 49 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
|
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import soundfile as sf
|
|
| 9 |
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 10 |
from fastapi.responses import FileResponse
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
-
from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC
|
| 13 |
from langdetect import detect
|
| 14 |
import imageio_ffmpeg
|
| 15 |
import logging
|
|
@@ -43,6 +43,8 @@ tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
|
|
| 43 |
|
| 44 |
mms_model = None
|
| 45 |
mms_processor = None
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def load_models():
|
| 48 |
global tts_ha, tts_en, tts_yo, tts_ig
|
|
@@ -74,7 +76,7 @@ def load_models():
|
|
| 74 |
logger.info("Igbo TTS disabled: will fallback to text response")
|
| 75 |
|
| 76 |
|
| 77 |
-
logger.info("Deferred MMS model
|
| 78 |
|
| 79 |
def _get_mms():
|
| 80 |
global mms_model, mms_processor
|
|
@@ -94,7 +96,36 @@ def _get_mms():
|
|
| 94 |
except Exception:
|
| 95 |
logger.exception("Failed to load MMS ASR model")
|
| 96 |
mms_model, mms_processor = None, None
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
def _run_mms(model: Wav2Vec2ForCTC, proc: Wav2Vec2Processor, audio_array: np.ndarray) -> str:
|
| 100 |
try:
|
|
@@ -140,12 +171,24 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
|
|
| 140 |
|
| 141 |
def speech_to_text(audio_data: bytes) -> str:
|
| 142 |
audio_array = preprocess_audio_ffmpeg(audio_data)
|
| 143 |
-
model, proc = _get_mms()
|
| 144 |
-
if model is None or proc is None:
|
| 145 |
-
return ""
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
def get_ai_response(text: str, response_language: str = None) -> str:
|
|
|
|
| 9 |
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 10 |
from fastapi.responses import FileResponse
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration
|
| 13 |
from langdetect import detect
|
| 14 |
import imageio_ffmpeg
|
| 15 |
import logging
|
|
|
|
| 43 |
|
| 44 |
mms_model = None
|
| 45 |
mms_processor = None
|
| 46 |
+
igbo_model = None
|
| 47 |
+
igbo_processor = None
|
| 48 |
|
| 49 |
def load_models():
|
| 50 |
global tts_ha, tts_en, tts_yo, tts_ig
|
|
|
|
| 76 |
logger.info("Igbo TTS disabled: will fallback to text response")
|
| 77 |
|
| 78 |
|
| 79 |
+
logger.info("Deferred MMS and Igbo ASR model loads: will lazy-load on first use")
|
| 80 |
|
| 81 |
def _get_mms():
|
| 82 |
global mms_model, mms_processor
|
|
|
|
| 96 |
except Exception:
|
| 97 |
logger.exception("Failed to load MMS ASR model")
|
| 98 |
mms_model, mms_processor = None, None
|
| 99 |
+
def _get_igbo_asr():
    """Lazy-load and cache the NCAIR1/Igbo-ASR Whisper model and processor.

    Returns:
        ``(model, processor)`` on success, or ``(None, None)`` if loading
        fails. Results are memoized in the module-level ``igbo_model`` /
        ``igbo_processor`` globals so the download/initialization cost is
        paid at most once per process.
    """
    global igbo_model, igbo_processor
    # Already loaded: reuse the cached pair.
    if igbo_model is not None and igbo_processor is not None:
        return igbo_model, igbo_processor

    # Token for gated/private repos; None is fine for public models.
    hf_token = os.getenv("HF_TOKEN")
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info("Lazy-loading Igbo ASR model...")
        igbo_processor = WhisperProcessor.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
        igbo_model = WhisperForConditionalGeneration.from_pretrained("NCAIR1/Igbo-ASR", token=hf_token)
        igbo_model.to(device)
        igbo_model.eval()
        logger.info("Loaded Igbo ASR model")
        return igbo_model, igbo_processor
    except Exception:
        logger.exception("Failed to load Igbo ASR model")
        igbo_model, igbo_processor = None, None
        # BUG FIX: the original fell through here and implicitly returned
        # None, so callers unpacking the result (``m, p = _get_igbo_asr()``)
        # raised TypeError on every load failure.
        return None, None
| 117 |
+
def _run_whisper(model: WhisperForConditionalGeneration, proc: WhisperProcessor, audio_array: np.ndarray) -> str:
|
| 118 |
+
try:
|
| 119 |
+
device = next(model.parameters()).device
|
| 120 |
+
inputs = proc(audio_array, sampling_rate=16000, return_tensors="pt")
|
| 121 |
+
input_features = inputs.input_features.to(device)
|
| 122 |
+
with torch.no_grad():
|
| 123 |
+
predicted_ids = model.generate(input_features)
|
| 124 |
+
text_list = proc.batch_decode(predicted_ids, skip_special_tokens=True)
|
| 125 |
+
return text_list[0] if text_list else ""
|
| 126 |
+
except Exception:
|
| 127 |
+
logging.exception("Whisper ASR inference failed")
|
| 128 |
+
return ""
|
| 129 |
|
| 130 |
def _run_mms(model: Wav2Vec2ForCTC, proc: Wav2Vec2Processor, audio_array: np.ndarray) -> str:
|
| 131 |
try:
|
|
|
|
| 171 |
|
| 172 |
def speech_to_text(audio_data: bytes) -> str:
    """Transcribe raw audio bytes to text.

    Decodes/resamples the input via ffmpeg, then tries the Igbo Whisper
    ASR first and falls back to the multilingual MMS model.

    Args:
        audio_data: Encoded audio bytes (any container ffmpeg can read).

    Returns:
        The transcript, or ``""`` if no model is available or none
        produced non-empty text.
    """
    audio_array = preprocess_audio_ffmpeg(audio_data)

    # Try Igbo ASR first for better Igbo detection.
    # NOTE(review): Whisper emits *some* text for most audio, so any
    # non-empty Igbo result short-circuits the MMS fallback even for
    # non-Igbo speech — confirm this ordering is intended.
    # Locals renamed from ``igbo_model``/``mms_model`` to stop shadowing
    # the module-level cache globals of the same names.
    igbo_pair = _get_igbo_asr()
    # Defensive unpack: the loader's failure path may return None rather
    # than a tuple, which would otherwise raise TypeError here.
    ig_model, ig_proc = igbo_pair if igbo_pair is not None else (None, None)
    if ig_model is not None and ig_proc is not None:
        igbo_text = _run_whisper(ig_model, ig_proc, audio_array)
        if igbo_text and igbo_text.strip():
            logger.info("Using Igbo ASR result")
            return igbo_text

    # Fallback to MMS for other languages.
    mms_pair = _get_mms()
    fb_model, fb_proc = mms_pair if mms_pair is not None else (None, None)
    if fb_model is not None and fb_proc is not None:
        mms_text = _run_mms(fb_model, fb_proc, audio_array)
        if mms_text and mms_text.strip():
            logger.info("Using MMS ASR result")
            return mms_text

    return ""
|
| 192 |
|
| 193 |
|
| 194 |
def get_ai_response(text: str, response_language: str = None) -> str:
|