# Dedicated paid-GPU Space (Docker SDK) — real Gemma 4 on the OFFICIAL llama.cpp.
# Compiling llama.cpp in the HF build exceeds the build time limit, so we base on the
# llama.cpp project's own prebuilt CUDA image (trusted, current → supports Gemma 4).
# It runs `llama-server`; our app (UI + /agent) calls it via INFERENCE_BASE_URL.
# Pick a CUDA GPU in Space settings (e.g. 1x A100). Llama Champion = the llama.cpp server.
# `server-cuda` is a rolling tag, so the base can change between rebuilds. The image
# reference is exposed as a build argument so a build can pin a specific tag or digest
# (e.g. --build-arg LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:<digest>)
# without editing this file. The default is the original rolling tag.
# NOTE: an ARG declared before FROM is only visible to FROM lines.
ARG LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
FROM ${LLAMA_CPP_IMAGE}

# Build-time only: keeps apt/dpkg non-interactive in the RUN steps of this stage
# without leaving DEBIAN_FRONTEND set in the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime defaults. Variables not read by this Dockerfile itself are presumably
# consumed by scripts/start_space.sh and the app — verify there before renaming.
#   PORT                   same value as the EXPOSE'd port (7860).
#   HF_HOME / LLAMA_CACHE  cache directories, both placed under /tmp.
#   INFERENCE_BASE_URL     where the app reaches llama-server (loopback, port 8080, /v1).
#   MODEL_* / MMPROJ_*     Hugging Face repo + GGUF file names for the model and its mmproj.
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    SERVE=uvicorn \
    HF_HOME=/tmp/hf \
    LLAMA_CACHE=/tmp/llama-cache \
    INFERENCE_BASE_URL="http://127.0.0.1:8080/v1" \
    INFERENCE_MODEL="gemma-4" \
    MODEL_HF_REPO="ParetoOptimal/gemma-4-cal-gguf" \
    MODEL_FILE="gemma-cal-e4b-Q4_K_M.gguf" \
    MMPROJ_REPO="unsloth/gemma-4-E4B-it-GGUF" \
    MMPROJ_FILE="mmproj-F16.gguf"
# Agent-tab planner (OFF by default — set as Space variables to enable):
#   PLANNER_HF_REPO="openbmb/MiniCPM4.1-8B-GGUF"  PLANNER_FILE="MiniCPM4.1-8B-Q4_K_M.gguf"
#   (tiny <=4B variant: openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf)
#   PLANNER_PORT=8081  PLANNER_NGL=999  PLANNER_BASE_URL=http://127.0.0.1:8081/v1

# OS packages for the Python app: interpreter and pip, plus curl and CA certificates.
# Index refresh, install and list cleanup share one layer, so the apt index is never
# stale at install time and never shipped in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        python3 \
        python3-pip \
 && rm -rf /var/lib/apt/lists/*

# Keep our app out of the image's /app (where the llama-server binary lives).
# Every relative path in the instructions that follow (COPY destinations, the pip
# requirements file, the script path in CMD) resolves against this directory.
WORKDIR /srv

# The dependency manifest is copied on its own first, so the pip layer below is
# rebuilt only when requirements-docker.txt changes rather than on every source edit.
COPY requirements-docker.txt ./
# --break-system-packages: the base image's Python is PEP 668 externally-managed.
# --no-cache-dir: keep pip's download cache out of the layer.
RUN pip3 install \
        --break-system-packages \
        --no-cache-dir \
        --requirement requirements-docker.txt

# Copy the rest of the build context into the working directory
# (presumably including scripts/start_space.sh, which CMD runs).
COPY . ./

# Documents the listening port (same value as PORT in ENV); EXPOSE alone does not
# publish anything.
EXPOSE 7860/tcp

# The base image's entrypoint is llama-server. Clearing it makes CMD run as-is
# instead of being passed to llama-server as arguments, so our launcher starts.
ENTRYPOINT []
CMD ["bash", "scripts/start_space.sh"]