Synesthesia / scripts /deploy_gemma.sh
Ashiedu's picture
Sync unified workbench
0490201 verified
#!/usr/bin/env bash
# deploy_gemma.sh β€” Build and launch llama-server for Kansas cognitive router
#
# Target: RX 6700 XT (gfx1030) on ROCm 7.2.1
# Model: Gemma 4 E2B (Q4_K_M GGUF)
# Pin: llama.cpp latest master (Gemma 4n compatible, ROCm 7.2.1 fixes)
#
# Usage:
# bash scripts/deploy_gemma.sh # Build + launch (blocking)
# bash scripts/deploy_gemma.sh --build # Build only
# bash scripts/deploy_gemma.sh --start # Launch only (requires prior build)
# bash scripts/deploy_gemma.sh --stop # Stop running server
set -euo pipefail
SERVER_PID=""
# ── Paths ──────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
VENDOR_DIR="$ROOT_DIR/vendor/llama.cpp"
BUILD_DIR="$VENDOR_DIR/build"
MODELS_LINK="$ROOT_DIR/models"
PID_FILE="$ROOT_DIR/runtime/.llama-server.pid"
LOG_FILE="$ROOT_DIR/runtime/logs/llama-server.log"
# GGUF model β€” unsloth/gemma-4-E2B-it-GGUF (Q4_K_M)
GGUF_REPO="unsloth/gemma-4-E2B-it-GGUF"
GGUF_FILENAME="gemma-4-E2B-it-Q4_K_M.gguf"
GGUF_MODEL_DIR="$ROOT_DIR/models"
GGUF_MODEL_PATH="$GGUF_MODEL_DIR/$GGUF_FILENAME"
# llama.cpp β€” track latest master for Gemma 4n compatibility and ROCm fixes
LLAMA_REPO="https://github.com/ggerganov/llama.cpp.git"
LLAMA_TAG="" # empty = latest master
# Server config
LLAMA_HOST="127.0.0.1"
LLAMA_PORT=8080
LLAMA_CTX=16384
LLAMA_GPU_LAYERS=35
LLAMA_CACHE_TYPE="q8_0"
# ROCm override (mandatory for RX 6700 XT)
export HSA_OVERRIDE_GFX_VERSION="10.3.0"
# ── Helpers ────────────────────────────────────────────────────────────────
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
die() { log "ERROR: $*" >&2; exit 1; }
# ── Stop ───────────────────────────────────────────────────────────────────
stop_server() {
# 1. Try global variable first (fastest)
if [[ -n "${SERVER_PID:-}" ]]; then
if kill -0 "$SERVER_PID" 2>/dev/null; then
log "Stopping llama-server (PID $SERVER_PID) via global variable..."
kill -TERM "$SERVER_PID" 2>/dev/null || true
fi
fi
# 2. Fallback to PID file logic (handles orphaned processes)
if [[ -f "$PID_FILE" ]]; then
local pid
pid=$(cat "$PID_FILE")
if kill -0 "$pid" 2>/dev/null; then
log "Stopping llama-server (PID $pid) via PID file..."
kill -TERM "$pid" 2>/dev/null || true
# Wait up to 10s for graceful shutdown
for i in $(seq 1 10); do
if ! kill -0 "$pid" 2>/dev/null; then
log "llama-server stopped."
rm -f "$PID_FILE"
SERVER_PID=""
return 0
fi
sleep 1
done
log "Force killing llama-server (PID $pid)..."
kill -9 "$pid" 2>/dev/null || true
rm -f "$PID_FILE"
else
log "Stale PID file (PID $pid not running). Cleaning up."
rm -f "$PID_FILE"
fi
else
log "No PID file found. Checking for running llama-server on port $LLAMA_PORT..."
local existing_pid
existing_pid=$(lsof -ti:"$LLAMA_PORT" 2>/dev/null || true)
if [[ -n "$existing_pid" ]]; then
log "Killing llama-server (PID $existing_pid) on port $LLAMA_PORT..."
kill -TERM "$existing_pid" 2>/dev/null || true
sleep 2
kill -9 "$existing_pid" 2>/dev/null || true
else
log "No running llama-server found."
fi
fi
SERVER_PID=""
}
# ── Download ───────────────────────────────────────────────────────────────
download_model() {
log "=== Phase 0: Downloading $GGUF_FILENAME ==="
mkdir -p "$GGUF_MODEL_DIR"
if [[ -f "$GGUF_MODEL_PATH" ]]; then
log "Model already exists at $GGUF_MODEL_PATH. Skipping download."
return 0
fi
if ! command -v huggingface-cli &> /dev/null; then
log "Installing huggingface_hub CLI..."
pip install -q huggingface_hub
fi
if [[ -z "${HF_TOKEN:-}" ]]; then
# Try loading from .env
if [[ -f "$ROOT_DIR/.env" ]]; then
export HF_TOKEN=$(grep '^HF_TOKEN=' "$ROOT_DIR/.env" | cut -d'=' -f2-)
fi
fi
if [[ -z "${HF_TOKEN:-}" ]]; then
die "HF_TOKEN not set. Set it in .env or export HF_TOKEN=..."
fi
log "Downloading $GGUF_FILENAME from $GGUF_REPO..."
huggingface-cli download "$GGUF_REPO" "$GGUF_FILENAME" \
--local-dir "$GGUF_MODEL_DIR" \
--token "$HF_TOKEN"
if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
die "Download failed: $GGUF_MODEL_PATH not found"
fi
local size
size=$(du -h "$GGUF_MODEL_PATH" | cut -f1)
log "Download complete: $GGUF_FILENAME ($size)"
}
# ── Build ──────────────────────────────────────────────────────────────────
build_server() {
if [[ -z "$LLAMA_TAG" ]]; then
log "=== Phase 1: Building llama.cpp (latest master) ==="
else
log "=== Phase 1: Building llama.cpp (tag $LLAMA_TAG) ==="
fi
# Clone if not present
if [[ -d "$VENDOR_DIR/.git" ]]; then
log "llama.cpp already cloned. Skipping."
else
if [[ -z "$LLAMA_TAG" ]]; then
log "Cloning llama.cpp (latest master)..."
git clone --depth 1 "$LLAMA_REPO" "$VENDOR_DIR"
else
log "Cloning llama.cpp at tag $LLAMA_TAG..."
git clone --depth 1 --branch "$LLAMA_TAG" "$LLAMA_REPO" "$VENDOR_DIR"
fi
fi
# CMake configuration
log "Configuring CMake with ROCm (gfx1030)..."
cmake -B "$BUILD_DIR" \
-DGGML_HIP=ON \
-DAMDGPU_TARGETS=gfx1030 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-S "$VENDOR_DIR"
# Build llama-server binary only
log "Compiling llama-server (this may take 5-10 minutes)..."
cmake --build "$BUILD_DIR" -j"$(nproc)" --target llama-server
# Verify binary
if [[ ! -x "$BUILD_DIR/bin/llama-server" ]]; then
die "Build failed: llama-server binary not found at $BUILD_DIR/bin/llama-server"
fi
local version
version=$("$BUILD_DIR/bin/llama-server" --version 2>&1 || echo "unknown")
log "Build complete: $version"
}
# ── Start ──────────────────────────────────────────────────────────────────
start_server() {
log "=== Phase 2: Launching llama-server ==="
# Check binary exists
local server_bin="$BUILD_DIR/bin/llama-server"
[[ -x "$server_bin" ]] || die "llama-server binary not found. Run: $0 --build"
# Verify model file
if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
die "GGUF model not found at: $GGUF_MODEL_PATH. Run: $0 --download"
fi
# Ensure log directory exists
mkdir -p "$(dirname "$LOG_FILE")"
# Stop any existing server
stop_server
# Launch llama-server in background
log "Starting llama-server on $LLAMA_HOST:$LLAMA_PORT..."
log " Model: $GGUF_FILENAME"
log " Layers: $LLAMA_GPU_LAYERS GPU / ctx=$LLAMA_CTX"
log " Cache: $LLAMA_CACHE_TYPE (K+V)"
HSA_OVERRIDE_GFX_VERSION=10.3.0 \
"$server_bin" \
--model "$GGUF_MODEL_PATH" \
--ctx-size "$LLAMA_CTX" \
--n-gpu-layers "$LLAMA_GPU_LAYERS" \
--cache-type-k "$LLAMA_CACHE_TYPE" \
--cache-type-v "$LLAMA_CACHE_TYPE" \
--port "$LLAMA_PORT" \
--host "$LLAMA_HOST" \
> "$LOG_FILE" 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > "$PID_FILE"
log "llama-server started with PID $SERVER_PID"
# Health check loop
log "Waiting for server to become healthy..."
local max_attempts=120
local attempt=0
while (( attempt < max_attempts )); do
attempt=$((attempt + 1))
if curl -sf "http://$LLAMA_HOST:$LLAMA_PORT/health" > /dev/null 2>&1; then
log "βœ“ Server is healthy! (attempt $attempt)"
log " Health: $(curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health")"
log " Logs: tail -f $LOG_FILE"
log " Stop: bash $0 --stop"
return 0
fi
# Check if process is still alive
if ! kill -0 "$SERVER_PID" 2>/dev/null; then
log "ERROR: llama-server (PID $SERVER_PID) exited unexpectedly."
log "Last 20 lines of log:"
tail -20 "$LOG_FILE" 2>/dev/null || true
rm -f "$PID_FILE"
exit 1
fi
sleep 1
done
die "Server did not become healthy within ${max_attempts}s. Check $LOG_FILE"
}
# ── Main ───────────────────────────────────────────────────────────────────
trap 'log "Received interrupt signal. Shutting down..."; stop_server; exit 0' INT TERM
case "${1:-}" in
--download)
download_model
;;
--build)
download_model
build_server
;;
--start)
start_server
;;
--stop)
stop_server
;;
--status)
if [[ -f "$PID_FILE" ]]; then
pid=$(cat "$PID_FILE")
if kill -0 "$pid" 2>/dev/null; then
log "llama-server running (PID $pid)"
curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health" 2>/dev/null || echo "Health check failed"
else
log "Stale PID file (PID $pid not running)"
fi
else
log "No PID file found"
fi
;;
"")
# Default: download + build + launch
download_model
build_server
start_server
# Block until interrupted
log "Server running. Press Ctrl+C to stop."
wait "$SERVER_PID" 2>/dev/null || true
;;
*)
echo "Usage: $0 [--download|--build|--start|--stop|--status]"
exit 1
;;
esac