# Multi-stage build:
#   Stage 1 — build the Next.js frontend to a static export
#   Stage 2 — Python runtime serving FastAPI + the built frontend on the same port

# ----------------------------------------------------------------------------
# Stage 1 — Node builder
# ----------------------------------------------------------------------------
FROM node:22-alpine AS frontend-builder
WORKDIR /app/frontend

# Manifest + lockfile go in on their own so the dependency layer below is
# only rebuilt when they change, not on every frontend source edit.
COPY frontend/package.json frontend/package-lock.json* ./
RUN npm ci --no-fund --no-audit

# Bring in the remaining frontend sources.
COPY frontend/ ./
# Empty backend URL: in production the frontend talks to its own origin
# rather than a separately hosted backend.
ENV NEXT_PUBLIC_BACKEND_URL=""
# Build the static export; the result lands in ./out.
RUN npm run build

# ----------------------------------------------------------------------------
# Stage 2 — Python runtime (FastAPI + corpus + Chroma + DuckDB + frontend)
# ----------------------------------------------------------------------------
FROM python:3.11-slim
WORKDIR /app

# OS-level packages needed by the Python stack:
#   - build-essential, libpoppler (+ pkg-config, poppler-utils) for
#     pdfplumber + torch CPU + sentence-transformers
#   - ffmpeg for pydub (webm→wav transcode for Sarvam STT)
# The apt package lists are deleted in the same layer so they never reach
# the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ffmpeg \
        libpoppler-cpp-dev \
        pkg-config \
        poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies. requirements.txt is copied by itself so this layer
# stays cached until the requirements change.
COPY requirements.txt ./requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model so the first request is fast (no cold load).
#
# This step runs as root, whose default Hugging Face cache lives under
# /root/.cache/huggingface. The container later runs as the non-root `user`,
# whose HOME is different, so without an explicit cache location the
# pre-downloaded files are never found and the model is fetched again at
# first load. Pinning HF_HOME under /app makes build time and run time
# resolve the same cache path; the `chown -R user:user /app` near the end of
# this file hands the cache to the runtime user.
# NOTE(review): assumes entrypoint.sh does not override HF_HOME — confirm.
ENV HF_HOME=/app/.cache/huggingface
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')"

# Copy the backend source + RAG modules. rag/ holds .py only at this stage;
# the large rag/corpus, rag/vectors and rag/extracted data is pulled from the
# companion HF dataset by a later RUN step, not copied from the build context.
COPY backend ./backend
COPY rag ./rag
COPY eval ./eval
COPY 70-docs ./70-docs
# Curated structured data the backend reads at request time:
#   - 40-data/reviews/<slug>.json    → /api/insurers/{slug}/reviews
#   - 40-data/policy_facts/*.json    → marketplace + scorecard fact cards
#   - 40-data/premiums/*.json        → premium calculator illustrative baseline
# Total ~2.3 MB — small enough to bake into the Space image.
COPY 40-data ./40-data

# Pull the large data (corpus PDFs + pre-built Chroma vectors + extracted JSONs)
# from the companion HF dataset rather than baking it into the Space repo.
# Why: the free-tier Space repo has a 1 GB cap; rag/corpus + rag/vectors is
# ~310 MB and would have made the Space repo unviable on top of the regular
# code. HF datasets get 50 GB free quota — the right place for this data.
# Public dataset, no token needed at build time. See D-019.
#
# KI-119 (2026-05-15) — the DATASET_CACHE_BUST arg forces this layer to
# re-execute whenever we update the dataset. Without it, Docker reuses the
# cached snapshot_download layer (command string unchanged) even though the
# remote dataset's content changed. Symptom: HF Space served stale Chroma
# (7356 chunks from a prior ingest) instead of the freshly-uploaded
# cleaned one (3799 chunks). Bump DATASET_CACHE_BUST manually each time the
# dataset is re-uploaded; the value just needs to change.
# DATASET_CACHE_BUST is echoed by the RUN below, so changing its value
# changes the step and forces the dataset download to run again instead of
# being served from the build cache.
ARG DATASET_CACHE_BUST=2026-05-15-ki145-v1
RUN echo "Dataset cache bust: ${DATASET_CACHE_BUST}" && python -c "\
from huggingface_hub import snapshot_download; \
snapshot_download(\
    repo_id='rohitsar567/insurance-bot-data', \
    repo_type='dataset', \
    local_dir='/app/rag', \
    allow_patterns=['rag/corpus/**','rag/vectors/**','rag/extracted/**'], \
) " && \
    # The dataset preserves the rag/ prefix in path_in_repo, so the snapshot
    # writes to /app/rag/rag/corpus/... — flatten one level so existing
    # backend imports (rag/corpus/, rag/vectors/) keep working unchanged.
    if [ -d /app/rag/rag ]; then \
        cp -r /app/rag/rag/* /app/rag/ && rm -rf /app/rag/rag; \
    fi && \
    # Fail the build if any requested directory is absent. The size report
    # below ends in a pipe, so this step's exit status is sed's, not du's;
    # on its own it would let an empty or mis-laid-out pull pass silently.
    for d in corpus vectors extracted; do \
        if [ ! -d "/app/rag/$d" ]; then \
            echo "ERROR: /app/rag/$d is missing after the dataset pull" >&2; \
            exit 1; \
        fi; \
    done && \
    echo "Dataset pull complete:" && \
    du -sh /app/rag/corpus /app/rag/vectors /app/rag/extracted 2>&1 | sed 's/^/  /'

# Static frontend produced by the frontend-builder stage.
COPY --from=frontend-builder /app/frontend/out ./frontend/out

# HF Spaces routes traffic to $PORT (7860 by default); uvicorn binds to it.
ENV PORT=7860
EXPOSE 7860

# Add the entrypoint script, mark it executable, then create the non-root
# account (uid 1000; HF Spaces recommends a non-root user for Docker Spaces)
# and give it ownership of everything under /app.
COPY entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh \
    && useradd -m -u 1000 user \
    && chown -R user:user /app
USER user

# Launch via the entrypoint: it validates Chroma, (re-)ingests if needed,
# and then runs uvicorn.
CMD ["sh", "/app/entrypoint.sh"]