# Dedicated paid-GPU Space (Docker SDK) — real Gemma 4 on the OFFICIAL llama.cpp.
# Compiling llama.cpp in the HF build exceeds the build time limit, so we base on the
# llama.cpp project's own prebuilt CUDA image (trusted, current → supports Gemma 4).
# It runs `llama-server`; our app (UI + /agent) calls it via INFERENCE_BASE_URL.
# Pick a CUDA GPU in Space settings (e.g. 1x A100). Llama Champion = the llama.cpp server.
#
# NOTE(review): `server-cuda` is a floating tag, so rebuilds can pick up a different
# base image. Pin a digest (`...:server-cuda@sha256:<digest>`) if reproducible builds
# matter more than tracking upstream.
FROM ghcr.io/ggml-org/llama.cpp:server-cuda

# Runtime configuration read by the launcher and the app.
# DEBIAN_FRONTEND is intentionally NOT set here: it is only needed while apt runs
# during the build, so it is passed inline on the apt-get command below instead of
# being baked into the running container's environment.
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    SERVE=uvicorn \
    HF_HOME=/tmp/hf \
    LLAMA_CACHE=/tmp/llama-cache \
    INFERENCE_BASE_URL="http://127.0.0.1:8080/v1" \
    INFERENCE_MODEL="gemma-4" \
    MODEL_HF_REPO="ParetoOptimal/gemma-4-cal-gguf" \
    MODEL_FILE="gemma-cal-e4b-Q4_K_M.gguf" \
    MMPROJ_REPO="unsloth/gemma-4-E4B-it-GGUF" \
    MMPROJ_FILE="mmproj-F16.gguf"
# Agent-tab planner (OFF by default — set as Space variables to enable):
#   PLANNER_HF_REPO="openbmb/MiniCPM4.1-8B-GGUF" PLANNER_FILE="MiniCPM4.1-8B-Q4_K_M.gguf"
#   (tiny <=4B variant: openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf)
#   PLANNER_PORT=8081 PLANNER_NGL=999 PLANNER_BASE_URL=http://127.0.0.1:8081/v1

# OS packages for the Python app. update + install + list cleanup stay in ONE layer
# so the apt index is never stale and never shipped in the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        python3 \
        python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Keep our app out of the image's /app (where the llama-server binary lives).
WORKDIR /srv

# Copy only the requirements file first so the dependency layer stays cached until
# requirements-docker.txt itself changes.
COPY requirements-docker.txt .
# --break-system-packages: the base image's Python is PEP 668 externally-managed.
RUN pip3 install --no-cache-dir --break-system-packages -r requirements-docker.txt

# Application source last: it changes most often.
COPY . .

# NOTE(review): no USER instruction is set, so the container runs as whatever user
# the base image / hosting runtime selects — confirm that is intended.

# The base image's entrypoint is llama-server; we run our launcher instead.
ENTRYPOINT []
EXPOSE 7860
CMD ["bash", "scripts/start_space.sh"]