# ============================================================
# Qwen3-14B  –  OpenAI-compatible API  –  CPU-only Docker image
# ============================================================
# Slim CPython 3.11 base image. The apt-get step further down relies on
# this image being Debian-based.
FROM python:3.11-slim

# OS packages (kept alphabetical for easy diffing):
#   build-essential, cmake, git – toolchain needed if llama-cpp-python
#                                 has to be compiled (needs a C++ compiler)
#   ca-certificates, wget       – TLS roots plus the HTTP client that the
#                                 HEALTHCHECK probe invokes at runtime
# The apt package lists are removed in the same layer so they never reach
# the image.
RUN set -e; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        wget; \
    rm -rf /var/lib/apt/lists/*

# Every relative path from here on resolves against /app.
WORKDIR /app

# The dependency manifest is copied on its own, ahead of app.py, so that
# editing application code does not invalidate the dependency-install layer.
COPY requirements.txt ./

# ── Python deps (single source of truth: requirements.txt) ───
# llama-cpp-python should come from abetlen's prebuilt CPU wheel index
# rather than being compiled: building llama.cpp's C++ tree from scratch
# spawns several parallel compiler processes and was OOMing the build
# (exit 137) on the platform's build container.
#
# --extra-index-url only ADDS the wheel index; pip still resolves to the
# highest version found across all indexes. --prefer-binary makes pip pick
# a wheel even when another index offers a newer source-only release, so a
# loosely pinned requirement cannot silently trigger the source build this
# step exists to avoid.
#
# CMAKE_ARGS / CMAKE_BUILD_PARALLEL_LEVEL only matter if pip still has to
# fall back to a source build (e.g. no matching wheel for this platform
# yet) -- they keep that fallback CPU-only and memory-bounded.
RUN CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_OPENCL=OFF" \
    CMAKE_BUILD_PARALLEL_LEVEL=1 \
    pip install --no-cache-dir --prefer-binary -r requirements.txt \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# ── App code ─────────────────────────────────────────────────
# Copied after the dependency install so code-only changes rebuild just
# this layer. Lands in the current WORKDIR.
COPY app.py ./

# ── Storage ──────────────────────────────────────────────────
# /data is the HF Spaces persistent storage bucket.
# Model is downloaded here on first boot and reused on restarts.
#
# Ownership is handed to UID/GID 1000: Hugging Face's Docker Spaces docs
# state that the container runs as user ID 1000, and a root-owned 0755
# directory would be read-only for that user whenever no persistent volume
# is mounted over /data. Root can still write here, so deployments that
# run the container as root are unaffected.
# NOTE(review): confirm the runtime UID if other platforms are targeted.
RUN mkdir -p /data \
    && chown 1000:1000 /data

# ── Runtime env defaults (override with -e or docker-compose) ─
# MODEL_* identify and locate the GGUF weights (MODEL_PATH lives under
# /data); N_* and VERBOSE look like inference tuning knobs.
# NOTE(review): presumably all of these are read by app.py, which is not
# visible from this file — confirm names and semantics there.
ENV MODEL_ID="qwen3-14b" \
    MODEL_PATH="/data/qwen3-14b-q4_k_m.gguf" \
    MODEL_URL="https://huggingface.co/bartowski/Qwen_Qwen3-14B-GGUF/resolve/main/Qwen_Qwen3-14B-Q4_K_M.gguf" \
    N_BATCH="512" \
    N_CTX="4096" \
    N_THREADS="8" \
    VERBOSE="false"

# Documents the port uvicorn binds in CMD. EXPOSE does not publish it.
EXPOSE 7860/tcp

# Health check — the probe passes once the /health response body contains
# "ready": true (per the app's contract, that is when the model is loaded).
# Whitespace around the colon is optional in the pattern: JSON serializers
# commonly emit the compact form {"ready":true} (Starlette/FastAPI's
# JSONResponse does), which a literal '"ready": true' match would never
# accept, leaving the container permanently unhealthy.
# The long start period and retry budget leave time for the first-boot
# model download. If wget fails, grep sees empty input and the probe fails.
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=20 \
    CMD wget -qO- http://localhost:7860/health | grep -Eq '"ready"[[:space:]]*:[[:space:]]*true' || exit 1

# Exec form: uvicorn runs as PID 1 and receives stop signals directly.
# Serves the `app` object from app.py on all interfaces, on the EXPOSEd port.
CMD ["uvicorn", "app:app", "--host=0.0.0.0", "--port=7860"]