OffGridSchedula

Running

App Files Files Community

OffGridSchedula / Dockerfile

ParetoOptimal

Initial Commit

0366d65 16 days ago

Raw

History Blame Contribute Delete

1.81 kB

	# Dedicated paid-GPU Space (Docker SDK) — real Gemma 4 on the OFFICIAL llama.cpp.
	# Compiling llama.cpp in the HF build exceeds the build time limit, so we base on the
	# llama.cpp project's own prebuilt CUDA image (trusted, current → supports Gemma 4).
	# It runs `llama-server`; our app (UI + /agent) calls it via INFERENCE_BASE_URL.
	# Pick a CUDA GPU in Space settings (e.g. 1x A100). Llama Champion = the llama.cpp server.
	#
	# NOTE(review): the tag below carries no version suffix or @sha256 digest, so it looks
	# like a floating tag — a rebuild may pull a different llama.cpp build. Consider pinning
	# to a versioned tag or digest once a known-good image is identified.
	FROM ghcr.io/ggml-org/llama.cpp:server-cuda

	# Runtime environment baked into the image (presumably consumed by
	# scripts/start_space.sh and the app — confirm against the launcher):
	#   PORT                matches the EXPOSEd port of the Space.
	#   HF_HOME/LLAMA_CACHE cache directories, both placed under /tmp.
	#   INFERENCE_*         where the app reaches the local llama-server, and the model name.
	#   MODEL_* / MMPROJ_*  Hugging Face repo + file names for the GGUF artifacts.
	# DEBIAN_FRONTEND is deliberately NOT set here: it only matters while apt runs at
	# build time, so it is set inline on the apt-get command below instead of leaking
	# into the running container's environment.
	ENV PYTHONUNBUFFERED=1 \
	    PORT=7860 \
	    SERVE=uvicorn \
	    HF_HOME=/tmp/hf \
	    LLAMA_CACHE=/tmp/llama-cache \
	    INFERENCE_BASE_URL="http://127.0.0.1:8080/v1" \
	    INFERENCE_MODEL="gemma-4" \
	    MODEL_HF_REPO="ParetoOptimal/gemma-4-cal-gguf" \
	    MODEL_FILE="gemma-cal-e4b-Q4_K_M.gguf" \
	    MMPROJ_REPO="unsloth/gemma-4-E4B-it-GGUF" \
	    MMPROJ_FILE="mmproj-F16.gguf"
	# Agent-tab planner (OFF by default — set as Space variables to enable):
	# PLANNER_HF_REPO="openbmb/MiniCPM4.1-8B-GGUF" PLANNER_FILE="MiniCPM4.1-8B-Q4_K_M.gguf"
	# (tiny <=4B variant: openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf)
	# PLANNER_PORT=8081 PLANNER_NGL=999 PLANNER_BASE_URL=http://127.0.0.1:8081/v1

	# OS packages for the Python app. update + install + list cleanup stay in one layer
	# so the apt index is never stale and never persisted in the image.
	# DEBIAN_FRONTEND is scoped to this single command to keep the install non-interactive.
	RUN apt-get update && \
	    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
	        ca-certificates \
	        curl \
	        python3 \
	        python3-pip && \
	    rm -rf /var/lib/apt/lists/*

	# The base image keeps the llama-server binary under /app, so our code lives in /srv
	# to avoid colliding with it.
	WORKDIR /srv

	# Dependency manifest goes in first: the pip layer below is rebuilt only when the
	# requirements file changes, not on every source edit.
	COPY requirements-docker.txt ./
	# The base image's Python is PEP 668 "externally managed", hence --break-system-packages.
	RUN pip3 install \
	        --break-system-packages \
	        --no-cache-dir \
	        -r requirements-docker.txt

	# Application source (everything in the build context).
	COPY . ./

	# Port the Space serves on (documentation only; same value as PORT).
	EXPOSE 7860

	# Clear the entrypoint inherited from the base image (llama-server) so that the
	# container starts our launcher script instead.
	ENTRYPOINT []
	CMD ["bash", "scripts/start_space.sh"]