# OffGridSchedula / Dockerfile
# ParetoOptimal's picture
# Initial Commit
# 0366d65
# Raw
# History Blame Contribute Delete
# 1.81 kB
# Dedicated paid-GPU Space (Docker SDK) — real Gemma 4 on the OFFICIAL llama.cpp.
# Compiling llama.cpp in the HF build exceeds the build time limit, so we base on the
# llama.cpp project's own prebuilt CUDA image (trusted, current → supports Gemma 4).
# It runs `llama-server`; our app (UI + /agent) calls it via INFERENCE_BASE_URL.
# Pick a CUDA GPU in Space settings (e.g. 1x A100). Llama Champion = the llama.cpp server.
# The base image can be overridden at build time, e.g. to pin an immutable digest:
#   --build-arg LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp@sha256:<digest>
# The default is the same floating server-cuda tag as before, so a plain build
# resolves exactly the same image.
ARG LLAMA_CPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
FROM ${LLAMA_CPP_IMAGE}
# Build-time only: keeps apt-get non-interactive in later RUN steps without
# leaving DEBIAN_FRONTEND set in the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime defaults for the container.
#   PORT                      7860, the same port this Dockerfile EXPOSEs.
#   HF_HOME / LLAMA_CACHE     cache directories, kept under /tmp.
#   INFERENCE_BASE_URL        URL the app uses to reach llama-server.
#   MODEL_* / MMPROJ_*        Hugging Face repo and file names of the GGUF weights
#                             (presumably read by scripts/start_space.sh — confirm).
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    SERVE=uvicorn \
    HF_HOME=/tmp/hf \
    LLAMA_CACHE=/tmp/llama-cache \
    INFERENCE_BASE_URL="http://127.0.0.1:8080/v1" \
    INFERENCE_MODEL="gemma-4" \
    MODEL_HF_REPO="ParetoOptimal/gemma-4-cal-gguf" \
    MODEL_FILE="gemma-cal-e4b-Q4_K_M.gguf" \
    MMPROJ_REPO="unsloth/gemma-4-E4B-it-GGUF" \
    MMPROJ_FILE="mmproj-F16.gguf"
# Agent-tab planner (OFF by default — set as Space variables to enable):
# PLANNER_HF_REPO="openbmb/MiniCPM4.1-8B-GGUF" PLANNER_FILE="MiniCPM4.1-8B-Q4_K_M.gguf"
# (tiny <=4B variant: openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf)
# PLANNER_PORT=8081 PLANNER_NGL=999 PLANNER_BASE_URL=http://127.0.0.1:8081/v1
# OS packages added on top of the base image: Python 3 with pip, plus curl and
# CA certificates. The package index is refreshed, used and removed within this
# single layer so the apt lists never persist in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        python3 \
        python3-pip \
 && rm -rf /var/lib/apt/lists/*
# The app is installed under /srv because the base image keeps the llama-server
# binary in /app.
WORKDIR /srv

# Copy only the requirements manifest before the rest of the source, so the
# dependency layer below is reused from cache until that file changes.
COPY requirements-docker.txt ./

# The base image's Python is PEP 668 externally-managed, so pip refuses to
# install system-wide unless --break-system-packages is passed.
RUN pip3 install \
        --no-cache-dir \
        --break-system-packages \
        --requirement requirements-docker.txt
# Copy the rest of the build context (including scripts/) into the working directory.
COPY . ./

# Documentation of the served port; same value as PORT in the ENV instruction.
EXPOSE 7860

# The base image's entrypoint is llama-server. Clearing it lets CMD run our
# launcher script instead of being passed to llama-server as arguments.
ENTRYPOINT []
CMD ["bash", "scripts/start_space.sh"]