# syntax=docker/dockerfile:1
# Initial deploy — commit b2f9b47
# (Non-Dockerfile page residue removed: the raw lines here were scraped UI
# text and would have failed `docker build`.)
# services/api/Dockerfile
#
# WHY THIS IS SEPARATE FROM THE ENCODER:
# If they were one container:
# - Restart API β†’ also restarts encoder β†’ 3s model reload on every code change
# - Scale horizontally β†’ each replica carries the 90MB model in RAM
# - One crash takes down both search logic AND inference
#
# Separate containers = independent restart, scale, update, and failure domains.
#
# THIS CONTAINER IS LIGHTER than the encoder:
# - No onnxruntime (that's the encoder's job)
# - Needs faiss-cpu, whisper, httpx (for calling encoder)
# - Target size: ~600MB
FROM python:3.11-slim
WORKDIR /app

# ffmpeg: needed by Whisper to decode audio files (mp3, wav, webm, etc.).
#   Without it, Whisper can only handle raw PCM.
#   Size cost: ~80MB — worth it for voice search capability.
# git: presumably needed so pip can install VCS-pinned requirements —
#   TODO confirm against requirements.txt; drop if nothing installs from git.
# NOTE: comments live above the RUN, not inside the line continuation —
# interleaved comments inside a continued RUN parse today but are fragile.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
        git \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# --no-cache-dir everywhere: pip's download cache would otherwise be baked
# into the layer (hadolint DL3042).
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
# --no-build-isolation: build any sdists against the setuptools/wheel just
# upgraded above instead of a fresh isolated env. Depends on that upgrade
# step having run first — keep the two RUNs in this order.
RUN pip install --no-cache-dir --no-build-isolation -r requirements.txt

COPY main.py .

# Create directories for runtime data.
# embeddings/ and data/ are mounted as volumes — not baked in.
# Run as a dedicated non-root user (stable UID 10001 so orchestrators can
# verify runAsNonRoot). Dirs are chowned so the app can write to them when
# no volume is mounted; mounted volumes keep host ownership — align host
# UIDs with 10001 if the app must write into them (NOTE(review): confirm).
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --home /app app \
    && mkdir -p embeddings data images \
    && chown -R app:app /app
USER app

# Documentation only — port 8000 is the service contract (>1024, so binding
# works unprivileged as the non-root user).
EXPOSE 8000

# Cheap in-container probe of the app's own /health endpoint; uses the
# already-present Python instead of shipping curl/wget.
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

# 2 workers for the API (it's I/O bound — waiting on encoder HTTP calls).
# I/O-bound services benefit from multiple workers because while one worker
# waits for the encoder response, another can handle a new request.
# The encoder is CPU-bound — multiple workers there would fight for CPU.
# Exec (JSON-array) form: uvicorn runs as PID 1 and receives SIGTERM directly.
CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]