nexusbert commited on
Commit
c3c0d65
·
1 Parent(s): 06b99eb

push milestone 3

Browse files
Files changed (3) hide show
  1. Dockerfile +56 -0
  2. app.py +427 -0
  3. requirements.txt +24 -0
Dockerfile ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base Image
2
+ FROM python:3.10-slim
3
+
4
+ ENV DEBIAN_FRONTEND=noninteractive \
5
+ PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1
7
+
8
+ WORKDIR /code
9
+
10
+ # System Dependencies
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ build-essential \
13
+ git \
14
+ curl \
15
+ libopenblas-dev \
16
+ libomp-dev \
17
+ ffmpeg \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ # Copy requirements and install Python dependencies
21
+ COPY requirements.txt .
22
+ RUN pip install --no-cache-dir -r requirements.txt
23
+
24
+ # Hugging Face + model tools
25
+ RUN pip install --no-cache-dir huggingface-hub sentencepiece accelerate fasttext
26
+
27
+ # Hugging Face cache environment
28
+ ENV HF_HOME=/models/huggingface \
29
+ TRANSFORMERS_CACHE=/models/huggingface \
30
+ HUGGINGFACE_HUB_CACHE=/models/huggingface \
31
+ HF_HUB_CACHE=/models/huggingface
32
+
33
+ # Created cache dir and set permissions
34
+ RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
35
+
36
+ # Pre-download models at build time
37
+ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-1b-all')" \
38
+ && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
39
+ && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
40
+ && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
41
+ && find /models/huggingface -name '*.lock' -delete
42
+
43
+ # Preload tokenizers (avoid runtime delays)
44
+ RUN python -c "from transformers import Wav2Vec2Processor; Wav2Vec2Processor.from_pretrained('facebook/mms-1b-all')" \
45
+ && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
46
+ && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
47
+ && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"
48
+
49
+ # Copy project files
50
+ COPY . .
51
+
52
+ # Expose FastAPI port
53
+ EXPOSE 7860
54
+
55
+ # Run FastAPI app with uvicorn (1 workers for concurrency)
56
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ import subprocess
5
+ import requests
6
+ import torch
7
+ import numpy as np
8
+ import soundfile as sf
9
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
10
+ from fastapi.responses import FileResponse
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC
13
+ from langdetect import detect
14
+ import imageio_ffmpeg
15
+ import logging
16
+ from contextlib import asynccontextmanager
17
+ import uvicorn
18
+ import nest_asyncio
19
+
20
+ nest_asyncio.apply()
21
+
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+ @asynccontextmanager
26
+ async def lifespan(app: FastAPI):
27
+ load_models()
28
+ yield
29
+
30
+ app = FastAPI(title="Farmlingua AI Speech Interface", version="1.0.0", lifespan=lifespan)
31
+
32
+ app.add_middleware(
33
+ CORSMiddleware,
34
+ allow_origins=["*"],
35
+ allow_credentials=True,
36
+ allow_methods=["*"],
37
+ allow_headers=["*"],
38
+ )
39
+
40
+
41
+ ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
42
+ tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
43
+
44
+ mms_model = None
45
+ mms_processor = None
46
+
47
+ def load_models():
48
+ global tts_ha, tts_en, tts_yo, tts_ig
49
+ device = 0 if torch.cuda.is_available() else -1
50
+ hf_token = os.getenv("HF_TOKEN")
51
+ if not hf_token:
52
+ logger.info("HF_TOKEN not set; gated repos may fail to load. Set HF_TOKEN to access restricted models.")
53
+ logger.info("Loading TTS models...")
54
+ try:
55
+ tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
56
+ logger.info("Loaded TTS (Hausa)")
57
+ except Exception as e:
58
+ logger.exception("Failed to load TTS (Hausa)")
59
+ tts_ha = None
60
+ try:
61
+ tts_en = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)
62
+ logger.info("Loaded TTS (English)")
63
+ except Exception:
64
+ logger.exception("Failed to load TTS (English)")
65
+ tts_en = None
66
+ try:
67
+ tts_yo = pipeline("text-to-speech", model="facebook/mms-tts-yor", device=device)
68
+ logger.info("Loaded TTS (Yoruba)")
69
+ except Exception:
70
+ logger.exception("Failed to load TTS (Yoruba)")
71
+ tts_yo = None
72
+
73
+ tts_ig = None
74
+ logger.info("Igbo TTS disabled: will fallback to text response")
75
+
76
+
77
+ logger.info("Deferred MMS model load: will lazy-load on first use")
78
+
79
+ def _get_mms():
80
+ global mms_model, mms_processor
81
+ if mms_model is not None and mms_processor is not None:
82
+ return mms_model, mms_processor
83
+
84
+ hf_token = os.getenv("HF_TOKEN")
85
+ try:
86
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
87
+ logger.info("Lazy-loading MMS ASR model...")
88
+ mms_processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all", token=hf_token)
89
+ mms_model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all", token=hf_token)
90
+ mms_model.to(device)
91
+ mms_model.eval()
92
+ logger.info("Loaded MMS ASR model")
93
+ return mms_model, mms_processor
94
+ except Exception:
95
+ logger.exception("Failed to load MMS ASR model")
96
+ mms_model, mms_processor = None, None
97
+ return None, None
98
+
99
+ def _run_mms(model: Wav2Vec2ForCTC, proc: Wav2Vec2Processor, audio_array: np.ndarray) -> str:
100
+ try:
101
+ device = next(model.parameters()).device
102
+ inputs = proc(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
103
+ input_values = inputs.input_values.to(device)
104
+ with torch.no_grad():
105
+ logits = model(input_values).logits
106
+ predicted_ids = torch.argmax(logits, dim=-1)
107
+ text = proc.batch_decode(predicted_ids)[0]
108
+ return text.strip() if text else ""
109
+ except Exception:
110
+ logging.exception("MMS ASR inference failed")
111
+ return ""
112
+
113
+ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.ndarray:
114
+ try:
115
+ with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as in_file:
116
+ in_file.write(audio_data)
117
+ in_path = in_file.name
118
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as out_file:
119
+ out_path = out_file.name
120
+
121
+ ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
122
+ subprocess.run([
123
+ ffmpeg_exe, '-y', '-i', in_path,
124
+ '-ac', '1', '-ar', str(target_sr), out_path
125
+ ], check=True, capture_output=True)
126
+
127
+ with open(out_path, 'rb') as f:
128
+ wav_data = f.read()
129
+
130
+ os.unlink(in_path)
131
+ os.unlink(out_path)
132
+
133
+ audio_array, sr = sf.read(io.BytesIO(wav_data))
134
+ if len(audio_array.shape) > 1:
135
+ audio_array = np.mean(audio_array, axis=1)
136
+ return audio_array.astype(np.float32)
137
+ except Exception as e:
138
+ logger.error(f"FFmpeg preprocessing failed: {e}")
139
+ raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
140
+
141
+ def speech_to_text(audio_data: bytes) -> str:
142
+ audio_array = preprocess_audio_ffmpeg(audio_data)
143
+ model, proc = _get_mms()
144
+ if model is None or proc is None:
145
+ return ""
146
+
147
+ text = _run_mms(model, proc, audio_array)
148
+ return text
149
+
150
+
151
+ def get_ai_response(text: str, response_language: str = None) -> str:
152
+ try:
153
+ if response_language and response_language != "en":
154
+ language_instructions = {
155
+ "ha": "Please respond in Hausa language.",
156
+ "yo": "Please respond in Yoruba language.",
157
+ "ig": "Please respond in Igbo language.",
158
+ "en": "Please respond in English."
159
+ }
160
+ language_instruction = language_instructions.get(response_language, "")
161
+ enhanced_query = f"{text}. {language_instruction}" if language_instruction else text
162
+ else:
163
+ enhanced_query = text
164
+
165
+ response = requests.post(ASK_URL, json={"query": enhanced_query}, timeout=30)
166
+ response.raise_for_status()
167
+ result = response.json()
168
+ return result.get("answer", "Sorry, no answer returned.")
169
+ except Exception as e:
170
+ logger.error(f"AI request error: {e}")
171
+ return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
172
+
173
+ HAUSA_WORDS = [
174
+ "aikin","manoma","gona","amfanin","yanayi","tsaba","fasaha","bisa","noman","shuka",
175
+ "daji","rani","damina","amfani","bidi'a","noma","bashi","manure","tsiro","gishiri"
176
+ ]
177
+
178
+ YORUBA_WORDS = [
179
+ "ilé","ọmọ","òun","awọn","agbẹ","oko","ọgbà","irugbin","àkọsílẹ","omi","ojo","àgbàlá","irọlẹ"
180
+ ]
181
+
182
+ IGBO_WORDS = [
183
+ "ugbo","akụkọ","mmiri","ala","ọrụ","ncheta","ọhụrụ","ugwu","nri","ahụhụ"
184
+ ]
185
+
186
+ def detect_language(text: str) -> str:
187
+ text_lower = text.lower()
188
+ if any(word in text_lower for word in HAUSA_WORDS):
189
+ return "ha"
190
+ elif any(word in text_lower for word in YORUBA_WORDS):
191
+ return "yo"
192
+ elif any(word in text_lower for word in IGBO_WORDS):
193
+ return "ig"
194
+ lang = detect(text)
195
+ if lang.startswith("ha"):
196
+ return "ha"
197
+ elif lang.startswith("yo"):
198
+ return "yo"
199
+ elif lang.startswith("ig"):
200
+ return "ig"
201
+ else:
202
+ return "en"
203
+
204
+ def text_to_speech_file(text: str) -> str:
205
+ lang = detect_language(text)
206
+ print(f"Detected language: {lang}")
207
+
208
+ supported_tts_languages = ["ha", "yo", "en"]
209
+ if lang not in supported_tts_languages:
210
+ logger.warning(f"Language '{lang}' not supported for TTS, falling back to English")
211
+ lang = "en"
212
+
213
+ global tts_ig
214
+ if lang == "ha":
215
+ tts_model = tts_ha
216
+ elif lang == "yo":
217
+ tts_model = tts_yo
218
+ elif lang == "ig":
219
+ logger.warning("Igbo TTS not available, raising exception for text fallback")
220
+ raise Exception("Igbo TTS not available - returning text response")
221
+ else:
222
+ tts_model = tts_en
223
+
224
+ if tts_model is None:
225
+ raise Exception(f"TTS model not available for language '{lang}'")
226
+
227
+ speech_output = tts_model(text)
228
+ audio_raw = speech_output["audio"]
229
+ sampling_rate = int(speech_output["sampling_rate"])
230
+
231
+
232
+ if isinstance(audio_raw, torch.Tensor):
233
+ audio_np = audio_raw.detach().cpu().numpy()
234
+ else:
235
+ audio_np = np.asarray(audio_raw)
236
+
237
+ if audio_np.ndim > 1:
238
+ audio_np = audio_np.reshape(-1)
239
+ audio_np = audio_np.astype(np.float32, copy=False)
240
+
241
+
242
+ audio_clipped = np.clip(audio_np, -1.0, 1.0)
243
+ audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
244
+
245
+
246
+ fd, path = tempfile.mkstemp(suffix=".wav")
247
+ os.close(fd)
248
+
249
+
250
+ sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
251
+ return path
252
+
253
+
254
+ @app.get("/")
255
+ async def root():
256
+ return {"status": "ok", "message": "System ready"}
257
+
258
+ @app.get("/health")
259
+ async def health():
260
+ return {"message": "Farmlingua AI Speech Interface is running!"}
261
+
262
+ @app.post("/chat")
263
+ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
264
+ if not text.strip():
265
+ raise HTTPException(status_code=400, detail="Text cannot be empty")
266
+
267
+ input_language = detect_language(text)
268
+ final_text = text if raw else get_ai_response(text, response_language=input_language)
269
+
270
+ if speak:
271
+ try:
272
+ audio_path = text_to_speech_file(final_text)
273
+ return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
274
+ except Exception as e:
275
+ logger.warning(f"TTS failed for chat endpoint: {e}")
276
+ return {
277
+ "question": text,
278
+ "answer": final_text,
279
+ "input_language": input_language,
280
+ "tts_available": False,
281
+ "message": f"TTS not available: {str(e)}"
282
+ }
283
+ return {
284
+ "question": text,
285
+ "answer": final_text,
286
+ "input_language": input_language
287
+ }
288
+
289
+ @app.post("/speak")
290
+ async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
291
+ if not audio_file.content_type.startswith('audio/'):
292
+ raise HTTPException(status_code=400, detail="File must be an audio file")
293
+ audio_data = await audio_file.read()
294
+ transcription = speech_to_text(audio_data)
295
+
296
+ input_language = detect_language(transcription)
297
+ ai_response = get_ai_response(transcription, response_language=input_language)
298
+
299
+ if speak:
300
+ try:
301
+ audio_path = text_to_speech_file(ai_response)
302
+ return FileResponse(audio_path, media_type="audio/wav", filename="response.wav")
303
+ except Exception as e:
304
+ logger.warning(f"TTS failed for speak endpoint: {e}")
305
+ return {
306
+ "transcription": transcription,
307
+ "ai_response": ai_response,
308
+ "input_language": input_language,
309
+ "tts_available": False,
310
+ "message": f"TTS not available: {str(e)}"
311
+ }
312
+ return {
313
+ "transcription": transcription,
314
+ "ai_response": ai_response,
315
+ "input_language": input_language
316
+ }
317
+
318
+ @app.post("/stt")
319
+ async def speech_to_text_endpoint(audio_file: UploadFile = File(...)):
320
+ if not audio_file.content_type.startswith('audio/'):
321
+ raise HTTPException(status_code=400, detail="File must be an audio file")
322
+
323
+ try:
324
+ audio_data = await audio_file.read()
325
+ transcription = speech_to_text(audio_data)
326
+
327
+ if not transcription.strip():
328
+ return {"transcription": "", "error": "No speech detected or transcription failed"}
329
+
330
+ return {
331
+ "transcription": transcription,
332
+ "language_detected": detect_language(transcription),
333
+ "success": True
334
+ }
335
+ except Exception as e:
336
+ logger.error(f"STT endpoint error: {e}")
337
+ raise HTTPException(status_code=500, detail=f"Speech-to-text conversion failed: {str(e)}")
338
+
339
+ @app.post("/tts")
340
+ async def text_to_speech_endpoint(text: str = Form(...), language: str = Form(None)):
341
+ if not text.strip():
342
+ raise HTTPException(status_code=400, detail="Text cannot be empty")
343
+
344
+ try:
345
+ if language and language in ["ha", "yo", "ig", "en"]:
346
+ lang = language
347
+ else:
348
+ lang = detect_language(text)
349
+
350
+ logger.info(f"TTS using language: {lang}")
351
+
352
+ supported_tts_languages = ["ha", "yo", "en"]
353
+ if lang not in supported_tts_languages:
354
+ logger.warning(f"Language '{lang}' not supported for TTS, returning text-only response")
355
+ return {
356
+ "text": text,
357
+ "language_detected": lang,
358
+ "tts_available": False,
359
+ "message": f"TTS not available for language '{lang}'. Supported languages: {', '.join(supported_tts_languages)}",
360
+ "note": "AI response is already in the detected language"
361
+ }
362
+
363
+ global tts_ig
364
+ if lang == "ha":
365
+ tts_model = tts_ha
366
+ elif lang == "yo":
367
+ tts_model = tts_yo
368
+ elif lang == "ig":
369
+ logger.warning("Igbo TTS not available, returning text-only response")
370
+ return {
371
+ "text": text,
372
+ "language_detected": lang,
373
+ "tts_available": False,
374
+ "message": "Igbo TTS not available - returning text response",
375
+ "note": "AI response is already in Igbo language"
376
+ }
377
+ else:
378
+ tts_model = tts_en
379
+
380
+ if tts_model is None:
381
+ logger.warning(f"TTS model not available for language '{lang}', returning text-only response")
382
+ return {
383
+ "text": text,
384
+ "language_detected": lang,
385
+ "tts_available": False,
386
+ "message": f"TTS model not available for language '{lang}'"
387
+ }
388
+
389
+ speech_output = tts_model(text)
390
+ audio_raw = speech_output["audio"]
391
+ sampling_rate = int(speech_output["sampling_rate"])
392
+
393
+ if isinstance(audio_raw, torch.Tensor):
394
+ audio_np = audio_raw.detach().cpu().numpy()
395
+ else:
396
+ audio_np = np.asarray(audio_raw)
397
+
398
+ if audio_np.ndim > 1:
399
+ audio_np = audio_np.reshape(-1)
400
+ audio_np = audio_np.astype(np.float32, copy=False)
401
+
402
+ audio_clipped = np.clip(audio_np, -1.0, 1.0)
403
+ audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
404
+
405
+ fd, path = tempfile.mkstemp(suffix=".wav")
406
+ os.close(fd)
407
+
408
+ sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
409
+
410
+ return FileResponse(
411
+ path,
412
+ media_type="audio/wav",
413
+ filename=f"tts_{lang}_{hash(text) % 10000}.wav"
414
+ )
415
+
416
+ except Exception as e:
417
+ logger.error(f"TTS endpoint error: {e}")
418
+ return {
419
+ "text": text,
420
+ "language_detected": lang if 'lang' in locals() else "unknown",
421
+ "tts_available": False,
422
+ "message": f"TTS conversion failed: {str(e)}"
423
+ }
424
+
425
+ if __name__ == "__main__":
426
+ import uvicorn
427
+ uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "8000")))
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ websockets
4
+ torch
5
+ torchaudio
6
+ transformers
7
+ soundfile
8
+ requests
9
+ numpy
10
+ scipy
11
+ librosa
12
+ imageio-ffmpeg
13
+ python-multipart
14
+ aiofiles
15
+ accelerate
16
+ sentencepiece
17
+ protobuf
18
+ langdetect
19
+ nest-asyncio
20
+
21
+
22
+
23
+
24
+