OffGridSchedula

Running

App Files Files Community

OffGridSchedula / scripts /start_space.sh

ParetoOptimal

Initial Commit

0366d65 12 days ago

Raw

History Blame Contribute Delete

4.36 kB

	#!/usr/bin/env bash
	# Launch the official llama.cpp server + the agent app (Docker GPU Space).
	# llama-server downloads the GGUF from HF on first run and serves it on :8080;
	# the app calls it via INFERENCE_BASE_URL=http://127.0.0.1:8080/v1.
	# Treat any reference to an unset variable as a fatal error.
	set -o nounset

	# Preview / UI-only mode. With the stub extractor enabled there is no model to
	# serve, so llama-server is never started (it would otherwise pull the ~20GB GGUF
	# and fail on a CPU-only box). This lets the Space show the full UI on cpu-basic.
	case "${USE_STUB_EXTRACTOR:-0}" in
	  1)
	    echo "[start] UI-only (USE_STUB_EXTRACTOR=1) — skipping llama-server"
	    # exec replaces this shell with the app, so nothing below runs in stub mode.
	    exec python3 app.py
	    ;;
	esac

	# Locate the llama-server binary: prefer whatever is on PATH, otherwise fall
	# back to the copy shipped in /app.
	LS="$(command -v llama-server || echo /app/llama-server)"
	# The official binary's sibling .so (libllama-server-impl.so) lives next to it in
	# /app; we run from /srv, so add its dir to the loader path. Any inherited value
	# is appended only when non-empty: a trailing ':' would leave an empty entry,
	# which the dynamic loader interprets as the current working directory.
	LD_LIBRARY_PATH="$(dirname "$LS"):/app${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
	export LD_LIBRARY_PATH
	echo "[start] using llama-server at: $LS (LD_LIBRARY_PATH=$LD_LIBRARY_PATH)"

	# Model selection: MODEL_FILE (explicit filename in MODEL_HF_REPO) is preferred —
	# the repo holds multiple Q4_K_M GGUFs (31B + E4B edge), so the `-hf repo:quant`
	# shorthand is ambiguous there. Falls back to -hf REPO:QUANT when MODEL_FILE unset.
	# Resolve a file in a Hugging Face repo to a local path (downloads on first use).
	# Repo and filename are handed over as argv instead of being spliced into the
	# Python source, so quotes or other special characters cannot break the snippet.
	#   $1 - repo id
	#   $2 - filename inside that repo
	# Prints the local path on stdout; returns non-zero when the download fails.
	hf_fetch() {
	  python3 -c 'import sys; from huggingface_hub import hf_hub_download; print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$1" "$2"
	}

	# Server arguments are kept in arrays so a path containing whitespace stays a
	# single word. MODEL_ARGS stays empty when no usable model could be resolved.
	MODEL_ARGS=()
	if [ -n "${MODEL_FILE:-}" ]; then
	  echo "[start] model: ${MODEL_HF_REPO}/${MODEL_FILE} (explicit file; downloads on first run)"
	  if MODEL_PATH="$(hf_fetch "$MODEL_HF_REPO" "$MODEL_FILE")" && [ -n "$MODEL_PATH" ]; then
	    MODEL_ARGS=(-m "$MODEL_PATH")
	  else
	    echo "[start] ERROR: model download failed: ${MODEL_HF_REPO}/${MODEL_FILE}"
	  fi
	else
	  echo "[start] model: ${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M} (downloads on first run)"
	  MODEL_ARGS=(-hf "${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M}")
	fi

	# Vision: download the mmproj projector and pass --mmproj so llama-server accepts
	# image_url inputs (screenshots/flyers). MMPROJ_REPO lets the projector come from a
	# different repo than the LLM (the E4B edge model uses the base E4B's projector,
	# not the 31B mmproj stored alongside it). Falls back to text-only if unavailable.
	MMPROJ_ARG=()
	if [ -n "${MMPROJ_FILE:-}" ]; then
	  MMPROJ_REPO="${MMPROJ_REPO:-$MODEL_HF_REPO}"
	  echo "[start] fetching mmproj ${MMPROJ_REPO}/${MMPROJ_FILE} for vision..."
	  # Deliberately best-effort: stderr is dropped and a failure only disables vision.
	  MMPROJ_PATH="$(hf_fetch "$MMPROJ_REPO" "$MMPROJ_FILE" 2>/dev/null || true)"
	  if [ -n "$MMPROJ_PATH" ]; then
	    MMPROJ_ARG=(--mmproj "$MMPROJ_PATH")
	    echo "[start] mmproj ready: $MMPROJ_PATH"
	  else
	    echo "[start] mmproj download failed -> text-only"
	  fi
	fi

	# -ngl 999 offloads all layers to the GPU; --jinja enables the chat/tool template.
	# LLAMA_PID stays empty when no model could be resolved: launching with a
	# value-less -m would only make llama-server misparse the next flag and exit.
	LLAMA_PID=""
	if [ -n "${MODEL_ARGS[0]:-}" ]; then
	  # The ${arr[@]+...} form expands to nothing for an empty array without
	  # tripping `set -u` on older bash releases.
	  "$LS" "${MODEL_ARGS[@]}" \
	    --host 127.0.0.1 --port 8080 \
	    -ngl 999 -c 8192 --jinja ${MMPROJ_ARG[@]+"${MMPROJ_ARG[@]}"} &
	  LLAMA_PID=$!
	else
	  echo "[start] ERROR: no model available -> llama-server not started"
	fi

	# Optional second llama-server: the Agent tab's MiniCPM planner. OFF unless
	# PLANNER_HF_REPO+PLANNER_FILE are set. VRAM note: E4B Q4 (~5GB) + MiniCPM-8B
	# Q4 (~5GB) + KV is tight on a 16GB T4 — tune PLANNER_NGL (default 999; lower
	# it for partial offload, planner outputs are short) or use the 1B variant
	# (openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf).
	# PLANNER_CTX (default 8192, matching the main model): a multi-step agent run
	# accumulates the tool schemas + task + thread + each step's observations, so
	# 4096 overflows on real threads ("request (4142 tokens) exceeds context").
	# Start the optional planner llama-server in the background.
	# Reads: PLANNER_HF_REPO, PLANNER_FILE (both required, otherwise this is a no-op),
	#        PLANNER_PORT (default 8081), PLANNER_NGL (default 999),
	#        PLANNER_CTX (default 8192), LS (path to the llama-server binary).
	# Writes: PLANNER_PATH (local path of the downloaded GGUF).
	# Returns: 0 when disabled or launched, 1 when the model download failed.
	start_planner() {
	  if [ -z "${PLANNER_HF_REPO:-}" ] || [ -z "${PLANNER_FILE:-}" ]; then
	    return 0
	  fi
	  local port="${PLANNER_PORT:-8081}"
	  echo "[start] planner: ${PLANNER_HF_REPO}/${PLANNER_FILE} on :${port}"
	  # Repo and filename go in as argv rather than being interpolated into the
	  # Python source, so special characters in them cannot break the snippet.
	  if ! PLANNER_PATH="$(python3 -c 'import sys; from huggingface_hub import hf_hub_download; print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$PLANNER_HF_REPO" "$PLANNER_FILE")" || [ -z "$PLANNER_PATH" ]; then
	    # Without a model path the server would be started with an empty -m value.
	    echo "[start] planner download failed -> planner disabled"
	    return 1
	  fi
	  "$LS" -m "$PLANNER_PATH" \
	    --host 127.0.0.1 --port "$port" \
	    -ngl "${PLANNER_NGL:-999}" -c "${PLANNER_CTX:-8192}" --jinja &
	  echo "[start] planner launching (PLANNER_BASE_URL should be http://127.0.0.1:${port}/v1)"
	}

	# The planner is optional, so a failure here must not stop the main startup.
	start_planner || true

	# Poll llama-server until it reports healthy, its process dies, or the polls
	# run out.
	#   $1 - PID of the llama-server process to watch (may be empty)
	#   $2 - health URL to poll
	#   $3 - maximum number of polls (default 900)
	#   $4 - whole seconds to sleep between polls (default 2)
	# Prints a "[start]" status line for each outcome.
	# Returns: 0 healthy, 1 process gone, 2 timed out.
	wait_for_llama() {
	  local pid="$1" url="$2" tries="${3:-900}" delay="${4:-2}" n
	  for ((n = 0; n < tries; n++)); do
	    # kill -0 sends no signal; it only checks that the process still exists.
	    if ! kill -0 "$pid" 2>/dev/null; then
	      echo "[start] ERROR: llama-server exited early"
	      return 1
	    fi
	    if curl -sf "$url" >/dev/null 2>&1; then
	      # n counts completed sleeps, so this is the time actually waited.
	      echo "[start] llama-server ready after ~$((n * delay))s"
	      return 0
	    fi
	    sleep "$delay"
	  done
	  echo "[start] WARNING: llama-server not healthy after ~$((tries * delay))s; continuing anyway"
	  return 2
	}

	echo "[start] waiting for llama-server health (model download can take minutes)..."
	# Best-effort on purpose: startup carries on even if the server never comes up.
	wait_for_llama "${LLAMA_PID:-}" http://127.0.0.1:8080/health 900 2 || true

	# Hand the process over to the app (UI + /agent). The URL is expanded with a
	# fallback because the script runs under `set -u`: a bare $INFERENCE_BASE_URL
	# would abort here, before the app starts, whenever the variable is not set.
	echo "[start] launching app (UI + /agent) -> INFERENCE_BASE_URL=${INFERENCE_BASE_URL:-<unset>}"
	# exec replaces this shell, so the app receives signals sent to the container's
	# main process directly.
	exec python3 app.py