Spaces:

MohitG012
/

Backend_BI_Assistant_with_Voice

Sleeping

App Files Files Community

MohitGupta41 committed on Sep 9, 2025

Commit

91b6d82

1 Parent(s): 8148bd1

Initial Commit

Browse files

Files changed (3) hide show

Dockerfile_e +0 -77
app/main_e.py +0 -414
requirements.txt +0 -1

Dockerfile_e DELETED Viewed

@@ -1,77 +0,0 @@
-# --------------------------
-# Hugging Face Space (Docker) - Backend (CPU)
-# --------------------------
-FROM python:3.12-slim
-# =============== System deps ===============
-# - build-essential et al. for any wheels that need to compile
-# - libgl1 / libglib2.0-0 / libsm6 / libxext6 / libxrender1 / libgomp1: shared
-#   libraries commonly needed at runtime by OpenCV and onnxruntime
-# - ffmpeg for audio resampling
-# - libsndfile1 for python-soundfile
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential gcc g++ make cmake pkg-config \
-    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 libgomp1 \
-    libsndfile1 ffmpeg git curl ca-certificates \
- && rm -rf /var/lib/apt/lists/*
-# =============== Workspace ===============
-ENV APP_HOME=/workspace
-# World-writable so the app can write data/cache regardless of the runtime user.
-RUN mkdir -p $APP_HOME/app $APP_HOME/data $APP_HOME/cache && chmod -R 777 $APP_HOME
-WORKDIR $APP_HOME
-# Optional caches for various libs
-ENV CC=gcc CXX=g++
-ENV INSIGHTFACE_HOME=/workspace/cache/insightface
-ENV MPLCONFIGDIR=/workspace/cache/matplotlib
-# =============== Python deps ===============
-COPY requirements.txt ./requirements.txt
-# requirements.txt pins numpy==1.26.4, so pre-install exactly that version.
-# Pre-installing numpy>=2.0 here would only be downgraded again by the
-# requirements step that follows.
-RUN python -m pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir "numpy==1.26.4" && \
-    pip install --no-cache-dir -r requirements.txt
-# Audio packages used by the /stt and /tts endpoints
-RUN pip install --no-cache-dir soundfile faster-whisper==1.0.0
-# =============== Piper (offline TTS) ===============
-# Piper releases: https://github.com/rhasspy/piper/releases
-# -f makes curl exit non-zero on an HTTP error, so a missing release asset fails
-# the build instead of silently saving an HTML error page as the "binary".
-# NOTE(review): confirm this asset name exists for the chosen release tag.
-RUN curl -fL -o /usr/local/bin/piper \
-      https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_linux_x86_64 && \
-    chmod +x /usr/local/bin/piper
-# Voice (~50–80MB each). Swap to another voice if you need Indian English/Hindi.
-# See https://github.com/rhasspy/piper#voices for alternatives.
-# NOTE(review): Piper normally expects a matching "<voice>.onnx.json" config next
-# to the model; it is not downloaded here — confirm and add it if required.
-RUN mkdir -p /models/piper/en_US && \
-    curl -fL -o /models/piper/en_US/libri_tts_en_US-medium.onnx \
-      https://github.com/rhasspy/piper/releases/download/v1.2.0/libri_tts_en_US-medium.onnx
-# =============== faster-whisper model (offline STT) ===============
-# Pre-download the "small" model (~460 MB) so no runtime fetch is needed.
-RUN mkdir -p /models/faster-whisper
-RUN python - <<'PY'
-from faster_whisper import WhisperModel
-WhisperModel("small", download_root="/models/faster-whisper")
-print("Downloaded faster-whisper 'small' to /models/faster-whisper")
-PY
-# =============== App ===============
-COPY app ./app
-COPY run.sh ./run.sh
-RUN chmod +x ./run.sh
-# =============== Runtime ENV ===============
-# Voice/STT providers default to OFFLINE so the Space does not need internet.
-# These names are the ones read with os.getenv() by the backend's voice config.
-ENV STT_PROVIDER=offline \
-    TTS_PROVIDER=offline \
-    FW_MODEL_DIR=/models/faster-whisper \
-    FW_MODEL_SIZE=small \
-    PIPER_BIN=/usr/local/bin/piper \
-    PIPER_VOICE=/models/piper/en_US/libri_tts_en_US-medium.onnx \
-    PIPER_SAMPLE_RATE=22050 \
-    PORT=7860
-# run.sh starts the server (presumably on $PORT — confirm against run.sh)
-CMD ["./run.sh"]

app/main_e.py DELETED Viewed

@@ -1,414 +0,0 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException, Query, Depends, Body
-from fastapi.middleware.cors import CORSMiddleware
-from .settings import settings
-from .deps import index, face, get_hf_token, build_agent_with_token
-from .models.face import (
-    EnrollResp, IdentifyReq, IdentifyResp, IdentifyHit,
-    IdentifyManyReq, IdentifyManyResp, FaceDet,
-)
-from .models.query import QueryReq, QueryResp
-from .services.aggregator import aggregate_by_user
-from .services.face_service import imdecode
-import numpy as np, uuid, cv2, os, io, zipfile, glob, shutil
-import base64, tempfile, subprocess, json
-from typing import Optional, Tuple
-from fastapi.responses import Response, StreamingResponse
-from pydantic import BaseModel
-import soundfile as sf
-import numpy as np
-from pathlib import Path
-app = FastAPI(title="Realtime BI Assistant")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], allow_credentials=True,
-    allow_methods=["*"], allow_headers=["*"],
-)
-@app.get("/")
-def root():
-    return {"ok": True, "msg": "Backend alive"}
-# ---------- Voice config ----------
-STT_PROVIDER = os.getenv("STT_PROVIDER", "offline")  # "offline" | "hf"
-TTS_PROVIDER = os.getenv("TTS_PROVIDER", "offline")  # "offline" | "hf"
-# Faster-Whisper (offline STT)
-FW_MODEL_DIR = os.getenv("FW_MODEL_DIR", "/models/faster-whisper")
-FW_MODEL_SIZE = os.getenv("FW_MODEL_SIZE", "small")  # tiny|base|small|medium|large-v3 etc.
-# Piper (offline TTS)
-PIPER_BIN = os.getenv("PIPER_BIN", "/usr/local/bin/piper")
-PIPER_VOICE = os.getenv("PIPER_VOICE", "/models/piper/en_US/libri_tts_en_US-medium.onnx")  # change to your voice
-PIPER_SAMPLE_RATE = int(os.getenv("PIPER_SAMPLE_RATE", "22050"))
-# Hugging Face (online STT/TTS)
-HF_STT_MODEL = os.getenv("HF_STT_MODEL", "openai/whisper-small")  # any STT model with audio-to-text
-HF_TTS_MODEL = os.getenv("HF_TTS_MODEL", "espnet/kan-bayashi_ljspeech_vits")  # any TTS wav output model
-def _ensure_wav_16k_mono(in_bytes: bytes, in_mime: str = "audio/wav") -> Tuple[np.ndarray, int]:
-    """
-    Convert arbitrary audio to mono 16k PCM via ffmpeg, return (float32 PCM, sr=16000).
-    """
-    # Write temp input
-    with tempfile.NamedTemporaryFile(suffix=".input", delete=False) as f_in:
-        f_in.write(in_bytes)
-        in_path = f_in.name
-    out_path = in_path + ".wav"
-    # ffmpeg -y -i in -ac 1 -ar 16000 -f wav out
-    cmd = [
-        "ffmpeg", "-y", "-i", in_path,
-        "-ac", "1", "-ar", "16000",
-        "-f", "wav", out_path
-    ]
-    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
-    # Load wav
-    data, sr = sf.read(out_path, dtype="float32", always_2d=False)
-    if sr != 16000:
-        raise RuntimeError("ffmpeg resample failed")
-    try:
-        os.remove(in_path)
-        # keep out_path for debug if needed
-    except Exception:
-        pass
-    return data.astype(np.float32), 16000
-def _bytes_to_wav_stream(pcm: np.ndarray, sr: int = 22050) -> bytes:
-    """Encode float32 PCM to WAV bytes."""
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f_out:
-        sf.write(f_out.name, pcm, sr, subtype="PCM_16")
-        with open(f_out.name, "rb") as fr:
-            wav_bytes = fr.read()
-    try:
-        os.remove(f_out.name)
-    except Exception:
-        pass
-    return wav_bytes
-# ---------- STT ----------
-_fw_model = None
-def _stt_offline(audio_bytes: bytes, mime: str, hf_token: Optional[str]) -> str:
-    global _fw_model
-    try:
-        from faster_whisper import WhisperModel
-    except Exception as e:
-        raise HTTPException(500, f"faster-whisper not installed: {e}")
-    if _fw_model is None:
-        _fw_model = WhisperModel(FW_MODEL_SIZE, device="cpu", compute_type="int8", download_root=FW_MODEL_DIR)
-    pcm, _ = _ensure_wav_16k_mono(audio_bytes, mime)
-    # faster-whisper expects path or np array; we’ll pass array
-    segments, info = _fw_model.transcribe(pcm, language=None, beam_size=1, vad_filter=True)
-    text = " ".join([seg.text.strip() for seg in segments]).strip()
-    return text or ""
-def _stt_hf(audio_bytes: bytes, mime: str, hf_token: Optional[str]) -> str:
-    if not hf_token:
-        raise HTTPException(400, "HF token required for STT via Hugging Face")
-    url = f"https://api-inference.huggingface.co/models/{HF_STT_MODEL}"
-    headers = {"Authorization": f"Bearer {hf_token}"}
-    # HF accepts raw audio bytes
-    import requests as _rq
-    r = _rq.post(url, headers=headers, data=audio_bytes, timeout=120)
-    if not r.ok:
-        raise HTTPException(502, f"HF STT failed: {r.status_code} {r.text[:200]}")
-    try:
-        out = r.json()
-        # common outputs: {"text": "..."} or [{"text": "..."}]
-        if isinstance(out, dict) and "text" in out:
-            return out["text"]
-        if isinstance(out, list) and out and isinstance(out[0], dict) and "text" in out[0]:
-            return out[0]["text"]
-        # some models return {"generated_text": "..."}
-        if isinstance(out, dict) and "generated_text" in out:
-            return out["generated_text"]
-        return ""
-    except Exception:
-        return ""
-# ---------- TTS ----------
-def _tts_offline_piper(text: str, voice_path: str) -> bytes:
-    """
-    Call Piper CLI to synthesize WAV.
-    """
-    if not os.path.isfile(voice_path):
-        raise HTTPException(500, f"Piper voice not found at {voice_path}")
-    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f_txt:
-        f_txt.write(text.encode("utf-8"))
-        in_txt = f_txt.name
-    out_wav = in_txt + ".wav"
-    cmd = [PIPER_BIN, "--model", voice_path, "--output_file", out_wav, "--speaker", "0"]
-    with open(in_txt, "rb") as fin:
-        subprocess.run(cmd, stdin=fin, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
-    with open(out_wav, "rb") as fr:
-        audio = fr.read()
-    try:
-        os.remove(in_txt)
-        os.remove(out_wav)
-    except Exception:
-        pass
-    return audio
-def _tts_hf(text: str, hf_token: Optional[str]) -> bytes:
-    if not hf_token:
-        raise HTTPException(400, "HF token required for TTS via Hugging Face")
-    url = f"https://api-inference.huggingface.co/models/{HF_TTS_MODEL}"
-    headers = {"Authorization": f"Bearer {hf_token}", "Accept": "audio/wav", "Content-Type": "application/json"}
-    import requests as _rq
-    r = _rq.post(url, headers=headers, json={"inputs": text}, timeout=120)
-    if not r.ok:
-        # Some HF TTS return JSON with b64; try to parse
-        try:
-            js = r.json()
-            b64 = js.get("audio", None)
-            if b64:
-                return base64.b64decode(b64)
-        except Exception:
-            pass
-        raise HTTPException(502, f"HF TTS failed: {r.status_code} {r.text[:200]}")
-    return r.content
-# ---------- Schemas ----------
-# Request body for POST /tts.
-class TTSReq(BaseModel):
-    # Text to synthesize; the endpoint rejects it when empty after stripping.
-    text: str
-    # Voice key looked up in VOICE_MAP by the offline provider.
-    voice: Optional[str] = "en-IN"
-# ---------- /stt ----------
-@app.post("/stt")
-async def stt(audio: UploadFile = File(...), hf_token: Optional[str] = Depends(get_hf_token)):
-    in_bytes = await audio.read()
-    mime = audio.content_type or "audio/wav"
-    if STT_PROVIDER == "offline":
-        text = _stt_offline(in_bytes, mime, hf_token)
-    elif STT_PROVIDER == "hf":
-        text = _stt_hf(in_bytes, mime, hf_token)
-    else:
-        raise HTTPException(400, f"Unknown STT_PROVIDER: {STT_PROVIDER}")
-    return {"text": text}
-# # ---------- /tts ----------
-# @app.post("/tts")
-# async def tts(req: TTSReq, hf_token: Optional[str] = Depends(get_hf_token)):
-#     text = (req.text or "").strip()
-#     if not text:
-#         raise HTTPException(400, "Empty text")
-#     if TTS_PROVIDER == "offline":
-#         # You can map req.voice -> multiple piper voices if you have them
-#         audio_bytes = _tts_offline_piper(text, PIPER_VOICE)
-#     elif TTS_PROVIDER == "hf":
-#         audio_bytes = _tts_hf(text, hf_token)
-#     else:
-#         raise HTTPException(400, f"Unknown TTS_PROVIDER: {TTS_PROVIDER}")
-#     return Response(content=audio_bytes, media_type="audio/wav")
-VOICE_MAP = {
-    "en-IN": "/models/piper/en_IN/xyz.onnx",
-    "en-US": "/models/piper/en_US/libri_tts_en_US-medium.onnx",
-}
-@app.post("/tts")
-async def tts(req: TTSReq, hf_token: Optional[str] = Depends(get_hf_token)):
-    text = (req.text or "").strip()
-    if not text:
-        raise HTTPException(400, "Empty text")
-    if TTS_PROVIDER == "offline":
-        voice_path = VOICE_MAP.get(req.voice, PIPER_VOICE)
-        audio_bytes = _tts_offline_piper(text, voice_path)
-    elif TTS_PROVIDER == "hf":
-        audio_bytes = _tts_hf(text, hf_token)
-    else:
-        raise HTTPException(400, f"Unknown TTS_PROVIDER: {TTS_PROVIDER}")
-    return Response(content=audio_bytes, media_type="audio/wav")
-def _decide_identity(agg, threshold: float, margin: float):
-    if not agg:
-        return "Unknown", 0.0, 0.0
-    best_user, best_score = agg[0]
-    second = agg[1][1] if len(agg) > 1 else -1.0
-    margin_val = best_score - second
-    if best_score >= threshold and margin_val >= margin and best_user != "Unknown":
-        return best_user, best_score, margin_val
-    return "Unknown", best_score, margin_val
-def _safe_extract(zf: zipfile.ZipFile, dest: str):
-    os.makedirs(dest, exist_ok=True)
-    for member in zf.infolist():
-        p = os.path.realpath(os.path.join(dest, member.filename))
-        if not p.startswith(os.path.realpath(dest) + os.sep):
-            continue
-        if member.is_dir():
-            os.makedirs(p, exist_ok=True)
-        else:
-            os.makedirs(os.path.dirname(p), exist_ok=True)
-            with zf.open(member) as src, open(p, "wb") as out:
-                out.write(src.read())
-def _guess_images_root(tmpdir: str) -> str | None:
-    pref = os.path.join(tmpdir, "Images")
-    if os.path.isdir(pref):
-        return pref
-    for root, dirs, files in os.walk(tmpdir):
-        subdirs = [os.path.join(root, d) for d in dirs]
-        if subdirs and any(
-            any(fn.lower().endswith((".jpg",".jpeg",".png")) for fn in os.listdir(sd))
-            for sd in subdirs
-        ):
-            return root
-    return None
-@app.post("/enroll_zip", response_model=EnrollResp)
-async def enroll_zip(zipfile_upload: UploadFile = File(...)):
-    """
-    Accepts a ZIP with structure: Images/<UserName>/*.jpg|png
-    Upserts all faces into the local FAISS index under user metadata.
-    """
-    if not zipfile_upload.filename.lower().endswith(".zip"):
-        raise HTTPException(400, "Please upload a .zip")
-    raw = await zipfile_upload.read()
-    tmpdir = os.path.join("/workspace", "upload", uuid.uuid4().hex[:8])
-    os.makedirs(tmpdir, exist_ok=True)
-    try:
-        with zipfile.ZipFile(io.BytesIO(raw), "r") as zf:
-            _safe_extract(zf, tmpdir)
-        root = _guess_images_root(tmpdir)
-        if not root:
-            raise HTTPException(400, "Couldn't find 'Images/<UserName>/*' structure in ZIP")
-        user_dirs = sorted([p for p in glob.glob(os.path.join(root, "*")) if os.path.isdir(p)])
-        if not user_dirs:
-            raise HTTPException(400, "No user folders found under Images/")
-        total = 0
-        enrolled_users = []
-        for udir in user_dirs:
-            user = os.path.basename(udir)
-            paths = sorted([p for p in glob.glob(os.path.join(udir, "*")) if p.lower().endswith((".jpg",".jpeg",".png"))])
-            if not paths:
-                continue
-            count_user = 0
-            for p in paths:
-                img = cv2.imdecode(np.fromfile(p, dtype=np.uint8), cv2.IMREAD_COLOR)
-                if img is None: continue
-                bbox, emb, det_score = face.embed_best(img)
-                if emb is None: continue
-                vec = emb.astype(np.float32)
-                vec = vec / (np.linalg.norm(vec) + 1e-9)
-                vid = f"{user}::{uuid.uuid4().hex[:8]}"
-                index.add_vectors(vecs=np.array([vec]),
-                                  metas=[{"user":user,"det_score":float(det_score), "source":"enroll_zip"}],
-                                  ids=[vid])
-                count_user += 1
-                total += 1
-            if count_user > 0:
-                enrolled_users.append(user)
-        return EnrollResp(users=enrolled_users, total_vectors=total)
-    finally:
-        try:
-            shutil.rmtree(tmpdir, ignore_errors=True)
-        except Exception:
-            pass
-# ---------- endpoints ----------
-@app.post("/index/upsert_image")
-async def upsert_image(user: str = Query(..., description="User label"),
-                       image: UploadFile = File(...)):
-    raw = await image.read()
-    img = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR)
-    if img is None:
-        raise HTTPException(400, "Invalid image file")
-    bbox, emb, det_score = face.embed_best(img)
-    if emb is None:
-        return {"ok": False, "msg": "no face detected"}
-    vec = emb.astype(np.float32)
-    vec = vec / (np.linalg.norm(vec) + 1e-9)
-    vid = f"{user}::{uuid.uuid4().hex[:8]}"
-    index.add_vectors(vecs=np.array([vec]),
-                      metas=[{"user":user,"det_score":float(det_score)}],
-                      ids=[vid])
-    return {"ok": True, "id": vid, "user": user, "det_score": float(det_score)}
-@app.post("/identify", response_model=IdentifyResp)
-async def identify(req: IdentifyReq):
-    try:
-        img = imdecode(req.image_b64)
-    except Exception:
-        raise HTTPException(status_code=400, detail="Bad image_b64")
-    bbox, emb, det_score = face.embed_best(img)
-    if emb is None:
-        return IdentifyResp(decision="NoFace", best_score=0.0, margin=0.0, topk=[], bbox=None)
-    matches = index.query(emb, top_k=settings.TOPK_DB)
-    agg = aggregate_by_user(matches)
-    user, best, margin_val = _decide_identity(agg, settings.THRESHOLD, settings.MARGIN)
-    topk = [IdentifyHit(user=u, score=s) for u, s in agg[:req.top_k]]
-    return IdentifyResp(decision=user, best_score=best, margin=margin_val, topk=topk, bbox=bbox)
-# ---------- NEW: multi-face endpoint ----------
-@app.post("/identify_many", response_model=IdentifyManyResp)
-async def identify_many(req: IdentifyManyReq):
-    try:
-        img = imdecode(req.image_b64)
-    except Exception:
-        raise HTTPException(status_code=400, detail="Bad image_b64")
-    faces = face.embed_all(img)
-    if not faces:
-        return IdentifyManyResp(detections=[])
-    detections: list[FaceDet] = []
-    top_k_db = req.top_k_db or settings.TOPK_DB
-    for bbox, emb, det_score in faces:
-        matches = index.query(emb, top_k=top_k_db)
-        agg = aggregate_by_user(matches)
-        user, best, margin_val = _decide_identity(agg, settings.THRESHOLD, settings.MARGIN)
-        topk = [IdentifyHit(user=u, score=s) for u, s in agg[:req.top_k]]
-        detections.append(FaceDet(
-            bbox=bbox,
-            decision=user,
-            best_score=best,
-            margin=margin_val,
-            topk=topk
-        ))
-    return IdentifyManyResp(detections=detections)
-@app.post("/query", response_model=QueryResp)
-async def query(req: QueryReq, hf_token: str | None = Depends(get_hf_token)):
-    text = (req.text or "").strip()
-    if not text:
-        raise HTTPException(400, "Empty question")
-    sql_agent = build_agent_with_token(hf_token)
-    try:
-        answer_text, meta = sql_agent.ask(req.user_id, text)
-        citations = [f"sql:{meta['sql']}"]
-        return QueryResp(
-            answer_text=answer_text,
-            citations=citations,
-            metrics={},
-            chart_refs=[],
-            # uncertainty=0.15
-        )
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Query failed: {e}")

requirements.txt CHANGED Viewed

@@ -5,7 +5,6 @@ pydantic-settings
 numpy==1.26.4
 faiss-cpu
 insightface==0.7.3
-# onnxruntime
 onnxruntime==1.17.3
 opencv-python==4.10.0.84
 python-multipart

 numpy==1.26.4
 faiss-cpu
 insightface==0.7.3
 onnxruntime==1.17.3
 opencv-python==4.10.0.84
 python-multipart