# Dedicated paid-GPU Space (Docker SDK) — real Gemma 4 on the OFFICIAL llama.cpp.
# Compiling llama.cpp in the HF build exceeds the build time limit, so we base on the
# llama.cpp project's own prebuilt CUDA image (trusted, current → supports Gemma 4).
# It runs `llama-server`; our app (UI + /agent) calls it via INFERENCE_BASE_URL.
# Pick a CUDA GPU in Space settings (e.g. 1x A100). Llama Champion = the llama.cpp server.

# The base image can be overridden at build time. The default is a moving tag; to pin
# an exact build, pass e.g.
#   --build-arg LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:<digest>
ARG LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
FROM ${LLAMA_CPP_IMAGE}

# Build-time only: keeps apt non-interactive during RUN steps without persisting
# the variable into the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment defaults: serving port, inference endpoint/model name, the
# GGUF model and multimodal projector (repo + file), and cache locations under /tmp.
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    SERVE=uvicorn \
    HF_HOME=/tmp/hf \
    LLAMA_CACHE=/tmp/llama-cache \
    INFERENCE_BASE_URL="http://127.0.0.1:8080/v1" \
    INFERENCE_MODEL="gemma-4" \
    MODEL_HF_REPO="ParetoOptimal/gemma-4-cal-gguf" \
    MODEL_FILE="gemma-cal-e4b-Q4_K_M.gguf" \
    MMPROJ_REPO="unsloth/gemma-4-E4B-it-GGUF" \
    MMPROJ_FILE="mmproj-F16.gguf"

# Agent-tab planner (OFF by default — set as Space variables to enable):
# PLANNER_HF_REPO="openbmb/MiniCPM4.1-8B-GGUF" PLANNER_FILE="MiniCPM4.1-8B-Q4_K_M.gguf"
# (tiny <=4B variant: openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf)
# PLANNER_PORT=8081 PLANNER_NGL=999 PLANNER_BASE_URL=http://127.0.0.1:8081/v1

# OS packages for the Python app. update + install + list cleanup share one layer
# so the apt index is never stale and never shipped in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Keep our app out of the image's /app (where the llama-server binary lives).
WORKDIR /srv

# Copy the requirements file on its own first so the dependency layer stays cached
# until requirements-docker.txt itself changes.
COPY requirements-docker.txt ./

# --break-system-packages: the base image's Python is PEP 668 externally-managed.
RUN pip3 install --no-cache-dir --break-system-packages -r requirements-docker.txt

# Application source (changes most often, so it is copied last).
COPY . .

# NOTE(review): no USER instruction is set, so the container runs as the base
# image's default user — confirm that is intended for this deployment.

EXPOSE 7860

# The base image's entrypoint is llama-server; we run our launcher instead.
ENTRYPOINT []
CMD ["bash", "scripts/start_space.sh"]