Spaces:
Sleeping
Sleeping
push
Browse files
- Dockerfile +0 -4
- app.py +29 -32
Dockerfile
CHANGED
|
@@ -33,13 +33,11 @@ ENV HF_HOME=/models/huggingface \
|
|
| 33 |
# Created cache dir and set permissions
|
| 34 |
RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
|
| 35 |
|
| 36 |
-
# Pre-download TTS models at build time (only non-gated models)
|
| 37 |
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 39 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
| 40 |
&& find /models/huggingface -name '*.lock' -delete
|
| 41 |
|
| 42 |
-
# Preload TTS pipelines (avoid runtime delays) - ASR models will be lazy-loaded
|
| 43 |
RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
| 44 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
|
| 45 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
|
|
@@ -47,8 +45,6 @@ RUN python -c "from transformers import pipeline; pipeline('text-to-speech', mod
|
|
| 47 |
# Copy project files
|
| 48 |
COPY . .
|
| 49 |
|
| 50 |
-
# Expose FastAPI port
|
| 51 |
EXPOSE 7860
|
| 52 |
|
| 53 |
-
# Run FastAPI app with uvicorn (1 workers for concurrency)
|
| 54 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
|
|
|
| 33 |
# Created cache dir and set permissions
|
| 34 |
RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
|
| 35 |
|
|
|
|
| 36 |
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 37 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
| 39 |
&& find /models/huggingface -name '*.lock' -delete
|
| 40 |
|
|
|
|
| 41 |
RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
| 42 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
|
| 43 |
&& python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
|
|
|
|
| 45 |
# Copy project files
|
| 46 |
COPY . .
|
| 47 |
|
|
|
|
| 48 |
EXPOSE 7860
|
| 49 |
|
|
|
|
| 50 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
app.py
CHANGED
|
@@ -39,7 +39,7 @@ app.add_middleware(
|
|
| 39 |
|
| 40 |
|
| 41 |
ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
|
| 42 |
-
tts_ha, tts_en, tts_yo = None, None, None
|
| 43 |
|
| 44 |
asr_models = {
|
| 45 |
"ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
|
|
@@ -49,11 +49,14 @@ asr_models = {
|
|
| 49 |
}
|
| 50 |
|
| 51 |
def load_models():
|
| 52 |
-
global tts_ha, tts_en, tts_yo
|
| 53 |
device = 0 if torch.cuda.is_available() else -1
|
| 54 |
hf_token = os.getenv("HF_TOKEN")
|
| 55 |
if not hf_token:
|
| 56 |
-
logger.
|
|
|
|
|
|
|
|
|
|
| 57 |
logger.info("Loading TTS models...")
|
| 58 |
try:
|
| 59 |
tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
|
|
@@ -74,7 +77,9 @@ def load_models():
|
|
| 74 |
logger.exception("Failed to load TTS (Yoruba)")
|
| 75 |
tts_yo = None
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
|
| 79 |
logger.info("Deferred ASR model loads: will lazy-load per language on first use")
|
| 80 |
|
|
@@ -86,8 +91,6 @@ def _get_asr(lang_code: str):
|
|
| 86 |
return entry["model"], entry["proc"]
|
| 87 |
repo_id = entry["repo"]
|
| 88 |
hf_token = os.getenv("HF_TOKEN")
|
| 89 |
-
if hf_token:
|
| 90 |
-
hf_token = hf_token.strip()
|
| 91 |
try:
|
| 92 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 93 |
logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
|
|
@@ -211,9 +214,12 @@ def text_to_speech_file(text: str) -> str:
|
|
| 211 |
print(f"Detected language: {lang}")
|
| 212 |
|
| 213 |
if lang == "ig":
|
| 214 |
-
logger.
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
| 217 |
if lang == "ha":
|
| 218 |
tts_model = tts_ha
|
| 219 |
elif lang == "yo":
|
|
@@ -222,7 +228,8 @@ def text_to_speech_file(text: str) -> str:
|
|
| 222 |
tts_model = tts_en
|
| 223 |
|
| 224 |
if tts_model is None:
|
| 225 |
-
|
|
|
|
| 226 |
|
| 227 |
speech_output = tts_model(text)
|
| 228 |
audio_raw = speech_output["audio"]
|
|
@@ -265,17 +272,12 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
|
|
| 265 |
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
| 266 |
final_text = text if raw else get_ai_response(text)
|
| 267 |
if speak:
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
return
|
| 274 |
-
"question": text,
|
| 275 |
-
"answer": final_text,
|
| 276 |
-
"tts_available": False,
|
| 277 |
-
"message": f"TTS not available: {str(e)}"
|
| 278 |
-
}
|
| 279 |
return {"question": text, "answer": final_text}
|
| 280 |
|
| 281 |
@app.post("/speak")
|
|
@@ -286,17 +288,12 @@ async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
|
|
| 286 |
transcription = speech_to_text(audio_data)
|
| 287 |
ai_response = get_ai_response(transcription)
|
| 288 |
if speak:
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
return
|
| 295 |
-
"transcription": transcription,
|
| 296 |
-
"ai_response": ai_response,
|
| 297 |
-
"tts_available": False,
|
| 298 |
-
"message": f"TTS not available: {str(e)}"
|
| 299 |
-
}
|
| 300 |
return {"transcription": transcription, "ai_response": ai_response}
|
| 301 |
|
| 302 |
if __name__ == "__main__":
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
|
| 42 |
+
tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
|
| 43 |
|
| 44 |
asr_models = {
|
| 45 |
"ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
def load_models():
|
| 52 |
+
global tts_ha, tts_en, tts_yo, tts_ig
|
| 53 |
device = 0 if torch.cuda.is_available() else -1
|
| 54 |
hf_token = os.getenv("HF_TOKEN")
|
| 55 |
if not hf_token:
|
| 56 |
+
logger.warning("HF_TOKEN not set! This may cause authentication failures for gated repositories.")
|
| 57 |
+
logger.warning("Please set HF_TOKEN environment variable to access restricted models.")
|
| 58 |
+
else:
|
| 59 |
+
logger.info("HF_TOKEN is set and ready for authenticated model access.")
|
| 60 |
logger.info("Loading TTS models...")
|
| 61 |
try:
|
| 62 |
tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
|
|
|
|
| 77 |
logger.exception("Failed to load TTS (Yoruba)")
|
| 78 |
tts_yo = None
|
| 79 |
|
| 80 |
+
tts_ig = None
|
| 81 |
+
logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
|
| 82 |
+
|
| 83 |
|
| 84 |
logger.info("Deferred ASR model loads: will lazy-load per language on first use")
|
| 85 |
|
|
|
|
| 91 |
return entry["model"], entry["proc"]
|
| 92 |
repo_id = entry["repo"]
|
| 93 |
hf_token = os.getenv("HF_TOKEN")
|
|
|
|
|
|
|
| 94 |
try:
|
| 95 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 96 |
logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
|
|
|
|
| 214 |
print(f"Detected language: {lang}")
|
| 215 |
|
| 216 |
if lang == "ig":
|
| 217 |
+
logger.info("Igbo language detected - returning text response instead of audio")
|
| 218 |
+
fd, path = tempfile.mkstemp(suffix=".txt")
|
| 219 |
+
os.close(fd)
|
| 220 |
+
with open(path, 'w', encoding='utf-8') as f:
|
| 221 |
+
f.write(text)
|
| 222 |
+
return path
|
| 223 |
if lang == "ha":
|
| 224 |
tts_model = tts_ha
|
| 225 |
elif lang == "yo":
|
|
|
|
| 228 |
tts_model = tts_en
|
| 229 |
|
| 230 |
if tts_model is None:
|
| 231 |
+
logger.error(f"TTS model for {lang} is not available")
|
| 232 |
+
raise HTTPException(status_code=500, detail=f"TTS model for {lang} is not available")
|
| 233 |
|
| 234 |
speech_output = tts_model(text)
|
| 235 |
audio_raw = speech_output["audio"]
|
|
|
|
| 272 |
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
| 273 |
final_text = text if raw else get_ai_response(text)
|
| 274 |
if speak:
|
| 275 |
+
output_path = text_to_speech_file(final_text)
|
| 276 |
+
lang = detect_language(final_text)
|
| 277 |
+
if lang == "ig":
|
| 278 |
+
return FileResponse(output_path, media_type="text/plain", filename="response.txt")
|
| 279 |
+
else:
|
| 280 |
+
return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
return {"question": text, "answer": final_text}
|
| 282 |
|
| 283 |
@app.post("/speak")
|
|
|
|
| 288 |
transcription = speech_to_text(audio_data)
|
| 289 |
ai_response = get_ai_response(transcription)
|
| 290 |
if speak:
|
| 291 |
+
output_path = text_to_speech_file(ai_response)
|
| 292 |
+
lang = detect_language(ai_response)
|
| 293 |
+
if lang == "ig":
|
| 294 |
+
return FileResponse(output_path, media_type="text/plain", filename="response.txt")
|
| 295 |
+
else:
|
| 296 |
+
return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
return {"transcription": transcription, "ai_response": ai_response}
|
| 298 |
|
| 299 |
if __name__ == "__main__":
|