#!/usr/bin/env bash
# deploy_gemma.sh — Build and launch llama-server for Kansas cognitive router
#
# Target: RX 6700 XT (gfx1030) on ROCm 7.2.1
# Model:  Gemma 4 E2B (Q4_K_M GGUF)
# Pin:    llama.cpp latest master (Gemma 4n compatible, ROCm 7.2.1 fixes)
#
# Usage:
#   bash scripts/deploy_gemma.sh              # Download + build + launch (blocking)
#   bash scripts/deploy_gemma.sh --download   # Download the GGUF model only
#   bash scripts/deploy_gemma.sh --build      # Download model + build llama-server
#   bash scripts/deploy_gemma.sh --start      # Launch only (requires prior build)
#   bash scripts/deploy_gemma.sh --stop       # Stop running server
#   bash scripts/deploy_gemma.sh --status     # Report server status

set -euo pipefail

SERVER_PID=""

# ── Paths ──────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
VENDOR_DIR="$ROOT_DIR/vendor/llama.cpp"
BUILD_DIR="$VENDOR_DIR/build"
MODELS_LINK="$ROOT_DIR/models"
PID_FILE="$ROOT_DIR/runtime/.llama-server.pid"
LOG_FILE="$ROOT_DIR/runtime/logs/llama-server.log"

# GGUF model — unsloth/gemma-4-E2B-it-GGUF (Q4_K_M)
GGUF_REPO="unsloth/gemma-4-E2B-it-GGUF"
GGUF_FILENAME="gemma-4-E2B-it-Q4_K_M.gguf"
GGUF_MODEL_DIR="$ROOT_DIR/models"
GGUF_MODEL_PATH="$GGUF_MODEL_DIR/$GGUF_FILENAME"

# llama.cpp — track latest master for Gemma 4n compatibility and ROCm fixes
LLAMA_REPO="https://github.com/ggerganov/llama.cpp.git"
LLAMA_TAG=""   # empty = latest master

# Server config
LLAMA_HOST="127.0.0.1"
LLAMA_PORT=8080
LLAMA_CTX=16384
LLAMA_GPU_LAYERS=35
LLAMA_CACHE_TYPE="q8_0"

# ROCm override (mandatory for RX 6700 XT)
export HSA_OVERRIDE_GFX_VERSION="10.3.0"

# ── Helpers ────────────────────────────────────────────────────────────────
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
die() { log "ERROR: $*" >&2; exit 1; }

# ── Stop ───────────────────────────────────────────────────────────────────
stop_server() {
    # 1. Try the global variable first (fastest)
    if [[ -n "${SERVER_PID:-}" ]]; then
        if kill -0 "$SERVER_PID" 2>/dev/null; then
            log "Stopping llama-server (PID $SERVER_PID) via global variable..."
            kill -TERM "$SERVER_PID" 2>/dev/null || true
        fi
    fi

    # 2. Fall back to PID-file logic (handles orphaned processes)
    if [[ -f "$PID_FILE" ]]; then
        local pid
        pid=$(cat "$PID_FILE")
        if kill -0 "$pid" 2>/dev/null; then
            log "Stopping llama-server (PID $pid) via PID file..."
            kill -TERM "$pid" 2>/dev/null || true
            # Wait up to 10s for graceful shutdown
            for i in $(seq 1 10); do
                if ! kill -0 "$pid" 2>/dev/null; then
                    log "llama-server stopped."
                    rm -f "$PID_FILE"
                    SERVER_PID=""
                    return 0
                fi
                sleep 1
            done
            log "Force killing llama-server (PID $pid)..."
            kill -9 "$pid" 2>/dev/null || true
            rm -f "$PID_FILE"
        else
            log "Stale PID file (PID $pid not running). Cleaning up."
            rm -f "$PID_FILE"
        fi
    else
        log "No PID file found. Checking for running llama-server on port $LLAMA_PORT..."
        local existing_pid
        existing_pid=$(lsof -ti:"$LLAMA_PORT" 2>/dev/null || true)
        if [[ -n "$existing_pid" ]]; then
            log "Killing llama-server (PID $existing_pid) on port $LLAMA_PORT..."
            kill -TERM "$existing_pid" 2>/dev/null || true
            sleep 2
            kill -9 "$existing_pid" 2>/dev/null || true
        else
            log "No running llama-server found."
        fi
    fi
    SERVER_PID=""
}

# ── Download ───────────────────────────────────────────────────────────────
download_model() {
    log "=== Phase 0: Downloading $GGUF_FILENAME ==="
    mkdir -p "$GGUF_MODEL_DIR"

    if [[ -f "$GGUF_MODEL_PATH" ]]; then
        log "Model already exists at $GGUF_MODEL_PATH. Skipping download."
        return 0
    fi

    if ! command -v huggingface-cli &> /dev/null; then
        log "Installing huggingface_hub CLI..."
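        # NOTE: the huggingface-cli entry point is provided by the
        # huggingface_hub Python package installed below.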
        pip install -q huggingface_hub
    fi

    if [[ -z "${HF_TOKEN:-}" ]]; then
        # Try loading from .env
        if [[ -f "$ROOT_DIR/.env" ]]; then
            export HF_TOKEN=$(grep '^HF_TOKEN=' "$ROOT_DIR/.env" | cut -d'=' -f2-)
        fi
    fi
    if [[ -z "${HF_TOKEN:-}" ]]; then
        die "HF_TOKEN not set. Set it in .env or export HF_TOKEN=..."
    fi

    log "Downloading $GGUF_FILENAME from $GGUF_REPO..."
    huggingface-cli download "$GGUF_REPO" "$GGUF_FILENAME" \
        --local-dir "$GGUF_MODEL_DIR" \
        --token "$HF_TOKEN"

    if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
        die "Download failed: $GGUF_MODEL_PATH not found"
    fi

    local size
    size=$(du -h "$GGUF_MODEL_PATH" | cut -f1)
    log "Download complete: $GGUF_FILENAME ($size)"
}

# ── Build ──────────────────────────────────────────────────────────────────
build_server() {
    if [[ -z "$LLAMA_TAG" ]]; then
        log "=== Phase 1: Building llama.cpp (latest master) ==="
    else
        log "=== Phase 1: Building llama.cpp (tag $LLAMA_TAG) ==="
    fi

    # Clone if not present
    if [[ -d "$VENDOR_DIR/.git" ]]; then
        log "llama.cpp already cloned. Skipping."
    else
        if [[ -z "$LLAMA_TAG" ]]; then
            log "Cloning llama.cpp (latest master)..."
            git clone --depth 1 "$LLAMA_REPO" "$VENDOR_DIR"
        else
            log "Cloning llama.cpp at tag $LLAMA_TAG..."
            git clone --depth 1 --branch "$LLAMA_TAG" "$LLAMA_REPO" "$VENDOR_DIR"
        fi
    fi

    # CMake configuration
    log "Configuring CMake with ROCm (gfx1030)..."
    cmake -B "$BUILD_DIR" \
        -DGGML_HIP=ON \
        -DAMDGPU_TARGETS=gfx1030 \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
        -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
        -S "$VENDOR_DIR"

    # Build llama-server binary only
    log "Compiling llama-server (this may take 5-10 minutes)..."
    cmake --build "$BUILD_DIR" -j"$(nproc)" --target llama-server

    # Verify binary
    if [[ ! -x "$BUILD_DIR/bin/llama-server" ]]; then
        die "Build failed: llama-server binary not found at $BUILD_DIR/bin/llama-server"
    fi

    local version
    version=$("$BUILD_DIR/bin/llama-server" --version 2>&1 || echo "unknown")
    log "Build complete: $version"
}

# ── Start ──────────────────────────────────────────────────────────────────
start_server() {
    log "=== Phase 2: Launching llama-server ==="

    # Check binary exists
    local server_bin="$BUILD_DIR/bin/llama-server"
    [[ -x "$server_bin" ]] || die "llama-server binary not found. Run: $0 --build"

    # Verify model file
    if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
        die "GGUF model not found at: $GGUF_MODEL_PATH. Run: $0 --download"
    fi

    # Ensure log directory exists
    mkdir -p "$(dirname "$LOG_FILE")"

    # Stop any existing server
    stop_server

    # Launch llama-server in background
    log "Starting llama-server on $LLAMA_HOST:$LLAMA_PORT..."
    log "  Model:  $GGUF_FILENAME"
    log "  Layers: $LLAMA_GPU_LAYERS GPU / ctx=$LLAMA_CTX"
    log "  Cache:  $LLAMA_CACHE_TYPE (K+V)"

    HSA_OVERRIDE_GFX_VERSION=10.3.0 \
        "$server_bin" \
        --model "$GGUF_MODEL_PATH" \
        --ctx-size "$LLAMA_CTX" \
        --n-gpu-layers "$LLAMA_GPU_LAYERS" \
        --cache-type-k "$LLAMA_CACHE_TYPE" \
        --cache-type-v "$LLAMA_CACHE_TYPE" \
        --port "$LLAMA_PORT" \
        --host "$LLAMA_HOST" \
        > "$LOG_FILE" 2>&1 &

    SERVER_PID=$!
    echo "$SERVER_PID" > "$PID_FILE"
    log "llama-server started with PID $SERVER_PID"

    # Health check loop
    log "Waiting for server to become healthy..."
    local max_attempts=120
    local attempt=0
    while (( attempt < max_attempts )); do
        attempt=$((attempt + 1))
        if curl -sf "http://$LLAMA_HOST:$LLAMA_PORT/health" > /dev/null 2>&1; then
            log "✓ Server is healthy! (attempt $attempt)"
            log "  Health: $(curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health")"
            log "  Logs:   tail -f $LOG_FILE"
            log "  Stop:   bash $0 --stop"
            return 0
        fi
        # Check if process is still alive
        if ! kill -0 "$SERVER_PID" 2>/dev/null; then
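            # Startup failure path: the background process died before the health
            # endpoint came up, so surface the end of the server log and clean up.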
kill -0 "$SERVER_PID" 2>/dev/null; then log "ERROR: llama-server (PID $SERVER_PID) exited unexpectedly." log "Last 20 lines of log:" tail -20 "$LOG_FILE" 2>/dev/null || true rm -f "$PID_FILE" exit 1 fi sleep 1 done die "Server did not become healthy within ${max_attempts}s. Check $LOG_FILE" } # ── Main ─────────────────────────────────────────────────────────────────── trap 'log "Received interrupt signal. Shutting down..."; stop_server; exit 0' INT TERM case "${1:-}" in --download) download_model ;; --build) download_model build_server ;; --start) start_server ;; --stop) stop_server ;; --status) if [[ -f "$PID_FILE" ]]; then pid=$(cat "$PID_FILE") if kill -0 "$pid" 2>/dev/null; then log "llama-server running (PID $pid)" curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health" 2>/dev/null || echo "Health check failed" else log "Stale PID file (PID $pid not running)" fi else log "No PID file found" fi ;; "") # Default: download + build + launch download_model build_server start_server # Block until interrupted log "Server running. Press Ctrl+C to stop." wait "$SERVER_PID" 2>/dev/null || true ;; *) echo "Usage: $0 [--download|--build|--start|--stop|--status]" exit 1 ;; esac