Spaces:

nexusbert
/

milestone3

Sleeping

App Files Files Community

nexusbert committed on Oct 10, 2025

Commit

8e783bb

1 Parent(s): 6e6f14a

push

Browse files

Files changed (2) hide show

Dockerfile +0 -4
app.py +29 -32

Dockerfile CHANGED Viewed

@@ -33,13 +33,11 @@ ENV HF_HOME=/models/huggingface \
 # Create cache dir and set permissions
 RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
-# Pre-download TTS models at build time (only non-gated models)
 RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
  && find /models/huggingface -name '*.lock' -delete
-# Preload TTS pipelines (avoid runtime delays) - ASR models will be lazy-loaded
 RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
@@ -47,8 +45,6 @@ RUN python -c "from transformers import pipeline; pipeline('text-to-speech', mod
 # Copy project files
 COPY . .
-# Expose FastAPI port
 EXPOSE 7860
-# Run FastAPI app with uvicorn (1 workers for concurrency)
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

 # Create cache dir and set permissions
 RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
 RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
  && find /models/huggingface -name '*.lock' -delete
 RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
 # Copy project files
 COPY . .
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ app.add_middleware(
 ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
-tts_ha, tts_en, tts_yo = None, None, None
 asr_models = {
     "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
@@ -49,11 +49,14 @@ asr_models = {
 }
 def load_models():
-    global tts_ha, tts_en, tts_yo
     device = 0 if torch.cuda.is_available() else -1
     hf_token = os.getenv("HF_TOKEN")
     if not hf_token:
-        logger.info("HF_TOKEN not set; gated repos may fail to load. Set HF_TOKEN to access restricted models.")
     logger.info("Loading TTS models...")
     try:
         tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
@@ -74,7 +77,9 @@ def load_models():
         logger.exception("Failed to load TTS (Yoruba)")
         tts_yo = None
-    logger.info("Igbo TTS disabled: will fallback to text response")
     logger.info("Deferred ASR model loads: will lazy-load per language on first use")
@@ -86,8 +91,6 @@ def _get_asr(lang_code: str):
         return entry["model"], entry["proc"]
     repo_id = entry["repo"]
     hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        hf_token = hf_token.strip()
     try:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
@@ -211,9 +214,12 @@ def text_to_speech_file(text: str) -> str:
     print(f"Detected language: {lang}")
     if lang == "ig":
-        logger.warning("Igbo TTS not available - returning text response")
-        raise Exception("Igbo TTS not available - returning text response")
     if lang == "ha":
         tts_model = tts_ha
     elif lang == "yo":
@@ -222,7 +228,8 @@ def text_to_speech_file(text: str) -> str:
         tts_model = tts_en
     if tts_model is None:
-        raise Exception(f"TTS model not available for language '{lang}'")
     speech_output = tts_model(text)
     audio_raw = speech_output["audio"]
@@ -265,17 +272,12 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
         raise HTTPException(status_code=400, detail="Text cannot be empty")
     final_text = text if raw else get_ai_response(text)
     if speak:
-        try:
-            audio_path = text_to_speech_file(final_text)
-            return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
-        except Exception as e:
-            logger.warning(f"TTS failed for chat endpoint: {e}")
-            return {
-                "question": text,
-                "answer": final_text,
-                "tts_available": False,
-                "message": f"TTS not available: {str(e)}"
-            }
     return {"question": text, "answer": final_text}
 @app.post("/speak")
@@ -286,17 +288,12 @@ async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
     transcription = speech_to_text(audio_data)
     ai_response = get_ai_response(transcription)
     if speak:
-        try:
-            audio_path = text_to_speech_file(ai_response)
-            return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
-        except Exception as e:
-            logger.warning(f"TTS failed for speak endpoint: {e}")
-            return {
-                "transcription": transcription,
-                "ai_response": ai_response,
-                "tts_available": False,
-                "message": f"TTS not available: {str(e)}"
-            }
     return {"transcription": transcription, "ai_response": ai_response}
 if __name__ == "__main__":

 ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
+tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
 asr_models = {
     "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
 }
 def load_models():
+    global tts_ha, tts_en, tts_yo, tts_ig
     device = 0 if torch.cuda.is_available() else -1
     hf_token = os.getenv("HF_TOKEN")
     if not hf_token:
+        logger.warning("HF_TOKEN not set! This may cause authentication failures for gated repositories.")
+        logger.warning("Please set HF_TOKEN environment variable to access restricted models.")
+    else:
+        logger.info("HF_TOKEN is set and ready for authenticated model access.")
     logger.info("Loading TTS models...")
     try:
         tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
         logger.exception("Failed to load TTS (Yoruba)")
         tts_yo = None
+    tts_ig = None
+    logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
     logger.info("Deferred ASR model loads: will lazy-load per language on first use")
         return entry["model"], entry["proc"]
     repo_id = entry["repo"]
     hf_token = os.getenv("HF_TOKEN")
     try:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
     print(f"Detected language: {lang}")
     if lang == "ig":
+        logger.info("Igbo language detected - returning text response instead of audio")
+        fd, path = tempfile.mkstemp(suffix=".txt")
+        os.close(fd)
+        with open(path, 'w', encoding='utf-8') as f:
+            f.write(text)
+        return path
     if lang == "ha":
         tts_model = tts_ha
     elif lang == "yo":
         tts_model = tts_en
     if tts_model is None:
+        logger.error(f"TTS model for {lang} is not available")
+        raise HTTPException(status_code=500, detail=f"TTS model for {lang} is not available")
     speech_output = tts_model(text)
     audio_raw = speech_output["audio"]
         raise HTTPException(status_code=400, detail="Text cannot be empty")
     final_text = text if raw else get_ai_response(text)
     if speak:
+        output_path = text_to_speech_file(final_text)
+        lang = detect_language(final_text)
+        if lang == "ig":
+            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        else:
+            return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
     return {"question": text, "answer": final_text}
 @app.post("/speak")
     transcription = speech_to_text(audio_data)
     ai_response = get_ai_response(transcription)
     if speak:
+        output_path = text_to_speech_file(ai_response)
+        lang = detect_language(ai_response)
+        if lang == "ig":
+            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        else:
+            return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
     return {"transcription": transcription, "ai_response": ai_response}
 if __name__ == "__main__":