nexusbert committed on
Commit
8e783bb
·
1 Parent(s): 6e6f14a
Files changed (2) hide show
  1. Dockerfile +0 -4
  2. app.py +29 -32
Dockerfile CHANGED
@@ -33,13 +33,11 @@ ENV HF_HOME=/models/huggingface \
33
  # Create cache dir and set permissions
34
  RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
35
 
36
- # Pre-download TTS models at build time (only non-gated models)
37
  RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
38
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
39
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
40
  && find /models/huggingface -name '*.lock' -delete
41
 
42
- # Preload TTS pipelines (avoid runtime delays) - ASR models will be lazy-loaded
43
  RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
44
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
45
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
@@ -47,8 +45,6 @@ RUN python -c "from transformers import pipeline; pipeline('text-to-speech', mod
47
  # Copy project files
48
  COPY . .
49
 
50
- # Expose FastAPI port
51
  EXPOSE 7860
52
 
53
- # Run FastAPI app with uvicorn (1 worker for concurrency)
54
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
33
  # Create cache dir and set permissions
34
  RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
35
 
 
36
  RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
37
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
38
  && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
39
  && find /models/huggingface -name '*.lock' -delete
40
 
 
41
  RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
42
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
43
  && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
 
45
  # Copy project files
46
  COPY . .
47
 
 
48
  EXPOSE 7860
49
 
 
50
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py CHANGED
@@ -39,7 +39,7 @@ app.add_middleware(
39
 
40
 
41
  ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
42
- tts_ha, tts_en, tts_yo = None, None, None
43
 
44
  asr_models = {
45
  "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
@@ -49,11 +49,14 @@ asr_models = {
49
  }
50
 
51
  def load_models():
52
- global tts_ha, tts_en, tts_yo
53
  device = 0 if torch.cuda.is_available() else -1
54
  hf_token = os.getenv("HF_TOKEN")
55
  if not hf_token:
56
- logger.info("HF_TOKEN not set; gated repos may fail to load. Set HF_TOKEN to access restricted models.")
 
 
 
57
  logger.info("Loading TTS models...")
58
  try:
59
  tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
@@ -74,7 +77,9 @@ def load_models():
74
  logger.exception("Failed to load TTS (Yoruba)")
75
  tts_yo = None
76
 
77
- logger.info("Igbo TTS disabled: will fallback to text response")
 
 
78
 
79
  logger.info("Deferred ASR model loads: will lazy-load per language on first use")
80
 
@@ -86,8 +91,6 @@ def _get_asr(lang_code: str):
86
  return entry["model"], entry["proc"]
87
  repo_id = entry["repo"]
88
  hf_token = os.getenv("HF_TOKEN")
89
- if hf_token:
90
- hf_token = hf_token.strip()
91
  try:
92
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
93
  logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
@@ -211,9 +214,12 @@ def text_to_speech_file(text: str) -> str:
211
  print(f"Detected language: {lang}")
212
 
213
  if lang == "ig":
214
- logger.warning("Igbo TTS not available - returning text response")
215
- raise Exception("Igbo TTS not available - returning text response")
216
-
 
 
 
217
  if lang == "ha":
218
  tts_model = tts_ha
219
  elif lang == "yo":
@@ -222,7 +228,8 @@ def text_to_speech_file(text: str) -> str:
222
  tts_model = tts_en
223
 
224
  if tts_model is None:
225
- raise Exception(f"TTS model not available for language '{lang}'")
 
226
 
227
  speech_output = tts_model(text)
228
  audio_raw = speech_output["audio"]
@@ -265,17 +272,12 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
265
  raise HTTPException(status_code=400, detail="Text cannot be empty")
266
  final_text = text if raw else get_ai_response(text)
267
  if speak:
268
- try:
269
- audio_path = text_to_speech_file(final_text)
270
- return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
271
- except Exception as e:
272
- logger.warning(f"TTS failed for chat endpoint: {e}")
273
- return {
274
- "question": text,
275
- "answer": final_text,
276
- "tts_available": False,
277
- "message": f"TTS not available: {str(e)}"
278
- }
279
  return {"question": text, "answer": final_text}
280
 
281
  @app.post("/speak")
@@ -286,17 +288,12 @@ async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
286
  transcription = speech_to_text(audio_data)
287
  ai_response = get_ai_response(transcription)
288
  if speak:
289
- try:
290
- audio_path = text_to_speech_file(ai_response)
291
- return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
292
- except Exception as e:
293
- logger.warning(f"TTS failed for speak endpoint: {e}")
294
- return {
295
- "transcription": transcription,
296
- "ai_response": ai_response,
297
- "tts_available": False,
298
- "message": f"TTS not available: {str(e)}"
299
- }
300
  return {"transcription": transcription, "ai_response": ai_response}
301
 
302
  if __name__ == "__main__":
 
39
 
40
 
41
  ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
42
+ tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
43
 
44
  asr_models = {
45
  "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
 
49
  }
50
 
51
  def load_models():
52
+ global tts_ha, tts_en, tts_yo, tts_ig
53
  device = 0 if torch.cuda.is_available() else -1
54
  hf_token = os.getenv("HF_TOKEN")
55
  if not hf_token:
56
+ logger.warning("HF_TOKEN not set! This may cause authentication failures for gated repositories.")
57
+ logger.warning("Please set HF_TOKEN environment variable to access restricted models.")
58
+ else:
59
+ logger.info("HF_TOKEN is set and ready for authenticated model access.")
60
  logger.info("Loading TTS models...")
61
  try:
62
  tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
 
77
  logger.exception("Failed to load TTS (Yoruba)")
78
  tts_yo = None
79
 
80
+ tts_ig = None
81
+ logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
82
+
83
 
84
  logger.info("Deferred ASR model loads: will lazy-load per language on first use")
85
 
 
91
  return entry["model"], entry["proc"]
92
  repo_id = entry["repo"]
93
  hf_token = os.getenv("HF_TOKEN")
 
 
94
  try:
95
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
96
  logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
 
214
  print(f"Detected language: {lang}")
215
 
216
  if lang == "ig":
217
+ logger.info("Igbo language detected - returning text response instead of audio")
218
+ fd, path = tempfile.mkstemp(suffix=".txt")
219
+ os.close(fd)
220
+ with open(path, 'w', encoding='utf-8') as f:
221
+ f.write(text)
222
+ return path
223
  if lang == "ha":
224
  tts_model = tts_ha
225
  elif lang == "yo":
 
228
  tts_model = tts_en
229
 
230
  if tts_model is None:
231
+ logger.error(f"TTS model for {lang} is not available")
232
+ raise HTTPException(status_code=500, detail=f"TTS model for {lang} is not available")
233
 
234
  speech_output = tts_model(text)
235
  audio_raw = speech_output["audio"]
 
272
  raise HTTPException(status_code=400, detail="Text cannot be empty")
273
  final_text = text if raw else get_ai_response(text)
274
  if speak:
275
+ output_path = text_to_speech_file(final_text)
276
+ lang = detect_language(final_text)
277
+ if lang == "ig":
278
+ return FileResponse(output_path, media_type="text/plain", filename="response.txt")
279
+ else:
280
+ return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
 
 
 
 
 
281
  return {"question": text, "answer": final_text}
282
 
283
  @app.post("/speak")
 
288
  transcription = speech_to_text(audio_data)
289
  ai_response = get_ai_response(transcription)
290
  if speak:
291
+ output_path = text_to_speech_file(ai_response)
292
+ lang = detect_language(ai_response)
293
+ if lang == "ig":
294
+ return FileResponse(output_path, media_type="text/plain", filename="response.txt")
295
+ else:
296
+ return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
 
 
 
 
 
297
  return {"transcription": transcription, "ai_response": ai_response}
298
 
299
  if __name__ == "__main__":