# deploy_gemma.sh - Build and launch llama-server for the Kansas cognitive router
#
# Target: RX 6700 XT (gfx1030 via HSA override) on ROCm 7.2.1
# Model:  Gemma 4 E2B (Q4_K_M GGUF)
# Source: llama.cpp latest master (Gemma 4n compatible, ROCm 7.2.1 fixes)
#
# Usage:
#   bash scripts/deploy_gemma.sh            # Download + build + launch (blocking)
#   bash scripts/deploy_gemma.sh --download # Download the model only
#   bash scripts/deploy_gemma.sh --build    # Download + build only
#   bash scripts/deploy_gemma.sh --start    # Launch only (requires prior build)
#   bash scripts/deploy_gemma.sh --stop     # Stop a running server
#   bash scripts/deploy_gemma.sh --status   # Report server status
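#
# Quick check once the server is up (a sketch, assuming the default host/port
# configured below):
#   curl -s http://127.0.0.1:8080/health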
set -euo pipefail

SERVER_PID=""

# ── Paths ───────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
VENDOR_DIR="$ROOT_DIR/vendor/llama.cpp"
BUILD_DIR="$VENDOR_DIR/build"
MODELS_LINK="$ROOT_DIR/models"
PID_FILE="$ROOT_DIR/runtime/.llama-server.pid"
LOG_FILE="$ROOT_DIR/runtime/logs/llama-server.log"

# GGUF model: unsloth/gemma-4-E2B-it-GGUF (Q4_K_M)
GGUF_REPO="unsloth/gemma-4-E2B-it-GGUF"
GGUF_FILENAME="gemma-4-E2B-it-Q4_K_M.gguf"
GGUF_MODEL_DIR="$ROOT_DIR/models"
GGUF_MODEL_PATH="$GGUF_MODEL_DIR/$GGUF_FILENAME"

# llama.cpp: track latest master for Gemma 4n compatibility and ROCm fixes
LLAMA_REPO="https://github.com/ggerganov/llama.cpp.git"
LLAMA_TAG=""   # empty = latest master

# Server config
LLAMA_HOST="127.0.0.1"
LLAMA_PORT=8080
LLAMA_CTX=16384
LLAMA_GPU_LAYERS=35       # layers offloaded to GPU (clamped to the model's layer count)
LLAMA_CACHE_TYPE="q8_0"   # quantized KV cache, roughly half the size of the default f16

# ROCm override (mandatory for the RX 6700 XT): the card reports gfx1031,
# which ROCm ships no kernels for; spoofing gfx1030 lets it run the gfx1030
# build produced below.
export HSA_OVERRIDE_GFX_VERSION="10.3.0"
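
# Optional preflight (a sketch; assumes the rocminfo tool from the ROCm
# runtime is on PATH). Uncomment to fail fast if no RDNA2 GPU is visible to
# ROCm before committing to a long build:
#   rocminfo 2>/dev/null | grep -q 'gfx10' \
#     || { echo "ERROR: no gfx10 GPU visible to ROCm" >&2; exit 1; }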

# ── Helpers ─────────────────────────────────────────────────────────────────
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
die() { log "ERROR: $*" >&2; exit 1; }

# ── Stop ────────────────────────────────────────────────────────────────────
stop_server() {
  # 1. Try the global variable first (fastest)
  if [[ -n "${SERVER_PID:-}" ]]; then
    if kill -0 "$SERVER_PID" 2>/dev/null; then
      log "Stopping llama-server (PID $SERVER_PID) via global variable..."
      kill -TERM "$SERVER_PID" 2>/dev/null || true
    fi
  fi
  # 2. Fall back to PID file logic (handles orphaned processes)
  if [[ -f "$PID_FILE" ]]; then
    local pid
    pid=$(cat "$PID_FILE")
    if kill -0 "$pid" 2>/dev/null; then
      log "Stopping llama-server (PID $pid) via PID file..."
      kill -TERM "$pid" 2>/dev/null || true
      # Wait up to 10s for graceful shutdown
      for _ in {1..10}; do
        if ! kill -0 "$pid" 2>/dev/null; then
          log "llama-server stopped."
          rm -f "$PID_FILE"
          SERVER_PID=""
          return 0
        fi
        sleep 1
      done
      log "Force killing llama-server (PID $pid)..."
      kill -9 "$pid" 2>/dev/null || true
      rm -f "$PID_FILE"
    else
      log "Stale PID file (PID $pid not running). Cleaning up."
      rm -f "$PID_FILE"
    fi
  else
    log "No PID file found. Checking for running llama-server on port $LLAMA_PORT..."
    local existing_pid
    existing_pid=$(lsof -ti:"$LLAMA_PORT" 2>/dev/null || true)
    if [[ -n "$existing_pid" ]]; then
      log "Killing llama-server (PID $existing_pid) on port $LLAMA_PORT..."
      kill -TERM "$existing_pid" 2>/dev/null || true
      sleep 2
      kill -9 "$existing_pid" 2>/dev/null || true
    else
      log "No running llama-server found."
    fi
  fi
  SERVER_PID=""
}
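
# Note: the port fallback above assumes lsof is installed. On minimal hosts a
# rough equivalent (a sketch, using fuser from psmisc) is:
#   fuser -k -TERM "$LLAMA_PORT/tcp" 2>/dev/null || true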

# ── Download ────────────────────────────────────────────────────────────────
download_model() {
  log "=== Phase 0: Downloading $GGUF_FILENAME ==="
  mkdir -p "$GGUF_MODEL_DIR"
  if [[ -f "$GGUF_MODEL_PATH" ]]; then
    log "Model already exists at $GGUF_MODEL_PATH. Skipping download."
    return 0
  fi
  if ! command -v huggingface-cli &> /dev/null; then
    log "Installing huggingface_hub CLI..."
    pip install -q huggingface_hub
  fi
  if [[ -z "${HF_TOKEN:-}" ]]; then
    # Try loading from .env
    if [[ -f "$ROOT_DIR/.env" ]]; then
      HF_TOKEN="$(grep '^HF_TOKEN=' "$ROOT_DIR/.env" | cut -d'=' -f2-)"
      export HF_TOKEN
    fi
  fi
  if [[ -z "${HF_TOKEN:-}" ]]; then
    die "HF_TOKEN not set. Set it in .env or export HF_TOKEN=..."
  fi
  log "Downloading $GGUF_FILENAME from $GGUF_REPO..."
  huggingface-cli download "$GGUF_REPO" "$GGUF_FILENAME" \
    --local-dir "$GGUF_MODEL_DIR" \
    --token "$HF_TOKEN"
  if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
    die "Download failed: $GGUF_MODEL_PATH not found"
  fi
  local size
  size=$(du -h "$GGUF_MODEL_PATH" | cut -f1)
  log "Download complete: $GGUF_FILENAME ($size)"
}
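
# Optional integrity check (a sketch): every GGUF file starts with the ASCII
# magic "GGUF" in its first four bytes, so a truncated download or a saved
# HTML error page can be caught cheaply before a long server launch:
#   [[ "$(head -c4 "$GGUF_MODEL_PATH")" == "GGUF" ]] \
#     || die "Not a valid GGUF file (bad magic bytes): $GGUF_MODEL_PATH"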

# ── Build ───────────────────────────────────────────────────────────────────
build_server() {
  if [[ -z "$LLAMA_TAG" ]]; then
    log "=== Phase 1: Building llama.cpp (latest master) ==="
  else
    log "=== Phase 1: Building llama.cpp (tag $LLAMA_TAG) ==="
  fi
  # Clone if not present
  if [[ -d "$VENDOR_DIR/.git" ]]; then
    log "llama.cpp already cloned. Skipping."
  else
    if [[ -z "$LLAMA_TAG" ]]; then
      log "Cloning llama.cpp (latest master)..."
      git clone --depth 1 "$LLAMA_REPO" "$VENDOR_DIR"
    else
      log "Cloning llama.cpp at tag $LLAMA_TAG..."
      git clone --depth 1 --branch "$LLAMA_TAG" "$LLAMA_REPO" "$VENDOR_DIR"
    fi
  fi
  # CMake configuration
  log "Configuring CMake with ROCm (gfx1030)..."
  cmake -B "$BUILD_DIR" \
    -DGGML_HIP=ON \
    -DAMDGPU_TARGETS=gfx1030 \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
    -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
    -S "$VENDOR_DIR"
  # Build the llama-server binary only
  log "Compiling llama-server (this may take 5-10 minutes)..."
  cmake --build "$BUILD_DIR" -j"$(nproc)" --target llama-server
  # Verify binary
  if [[ ! -x "$BUILD_DIR/bin/llama-server" ]]; then
    die "Build failed: llama-server binary not found at $BUILD_DIR/bin/llama-server"
  fi
  local version
  version=$("$BUILD_DIR/bin/llama-server" --version 2>&1 || echo "unknown")
  log "Build complete: $version"
}
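
# Note: with an existing checkout, build_server skips the clone, so "latest
# master" means whatever was fetched when the clone was first made. A sketch
# for refreshing a shallow clone in place (upstream's default branch is master):
#   git -C "$VENDOR_DIR" fetch --depth 1 origin master
#   git -C "$VENDOR_DIR" reset --hard origin/master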

# ── Start ───────────────────────────────────────────────────────────────────
start_server() {
  log "=== Phase 2: Launching llama-server ==="
  # Check binary exists
  local server_bin="$BUILD_DIR/bin/llama-server"
  [[ -x "$server_bin" ]] || die "llama-server binary not found. Run: $0 --build"
  # Verify model file
  if [[ ! -f "$GGUF_MODEL_PATH" ]]; then
    die "GGUF model not found at: $GGUF_MODEL_PATH. Run: $0 --download"
  fi
  # Ensure log directory exists
  mkdir -p "$(dirname "$LOG_FILE")"
  # Stop any existing server
  stop_server
  # Launch llama-server in background
  log "Starting llama-server on $LLAMA_HOST:$LLAMA_PORT..."
  log "  Model:  $GGUF_FILENAME"
  log "  Layers: $LLAMA_GPU_LAYERS GPU / ctx=$LLAMA_CTX"
  log "  Cache:  $LLAMA_CACHE_TYPE (K+V)"
  HSA_OVERRIDE_GFX_VERSION=10.3.0 \
  "$server_bin" \
    --model "$GGUF_MODEL_PATH" \
    --ctx-size "$LLAMA_CTX" \
    --n-gpu-layers "$LLAMA_GPU_LAYERS" \
    --cache-type-k "$LLAMA_CACHE_TYPE" \
    --cache-type-v "$LLAMA_CACHE_TYPE" \
    --port "$LLAMA_PORT" \
    --host "$LLAMA_HOST" \
    > "$LOG_FILE" 2>&1 &
  SERVER_PID=$!
  echo "$SERVER_PID" > "$PID_FILE"
  log "llama-server started with PID $SERVER_PID"
  # Health check loop: poll /health once per second until it answers
  log "Waiting for server to become healthy..."
  local max_attempts=120
  local attempt=0
  while (( attempt < max_attempts )); do
    attempt=$((attempt + 1))
    if curl -sf "http://$LLAMA_HOST:$LLAMA_PORT/health" > /dev/null 2>&1; then
      log "✓ Server is healthy! (attempt $attempt)"
      log "  Health: $(curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health")"
      log "  Logs:   tail -f $LOG_FILE"
      log "  Stop:   bash $0 --stop"
      return 0
    fi
    # Check if the process is still alive
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      log "ERROR: llama-server (PID $SERVER_PID) exited unexpectedly."
      log "Last 20 lines of log:"
      tail -20 "$LOG_FILE" 2>/dev/null || true
      rm -f "$PID_FILE"
      exit 1
    fi
    sleep 1
  done
  die "Server did not become healthy within ~${max_attempts}s. Check $LOG_FILE"
}
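
# Optional smoke test (a sketch): llama-server also exposes an OpenAI-compatible
# API; once /health answers, a minimal completion exercises the full generate
# path (the "model" field can be omitted since a single model is hosted):
#   curl -s "http://$LLAMA_HOST:$LLAMA_PORT/v1/chat/completions" \
#     -H 'Content-Type: application/json' \
#     -d '{"messages":[{"role":"user","content":"ping"}],"max_tokens":8}'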

# ── Main ────────────────────────────────────────────────────────────────────
trap 'log "Received interrupt signal. Shutting down..."; stop_server; exit 0' INT TERM

case "${1:-}" in
  --download)
    download_model
    ;;
  --build)
    download_model
    build_server
    ;;
  --start)
    start_server
    ;;
  --stop)
    stop_server
    ;;
  --status)
    if [[ -f "$PID_FILE" ]]; then
      pid=$(cat "$PID_FILE")
      if kill -0 "$pid" 2>/dev/null; then
        log "llama-server running (PID $pid)"
        curl -s "http://$LLAMA_HOST:$LLAMA_PORT/health" 2>/dev/null || echo "Health check failed"
      else
        log "Stale PID file (PID $pid not running)"
      fi
    else
      log "No PID file found"
    fi
    ;;
  "")
    # Default: download + build + launch
    download_model
    build_server
    start_server
    # Block until interrupted
    log "Server running. Press Ctrl+C to stop."
    wait "$SERVER_PID" 2>/dev/null || true
    ;;
  *)
    echo "Usage: $0 [--download|--build|--start|--stop|--status]"
    exit 1
    ;;
esac
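
# ── Unattended operation (optional sketch) ──────────────────────────────────
# For automatic restarts, a systemd user unit could wrap this script. Since
# --start forks the server into the background and records it in the PID file,
# Type=forking fits; the install paths below are assumptions:
#
#   [Unit]
#   Description=llama-server (Kansas cognitive router)
#
#   [Service]
#   Type=forking
#   PIDFile=%h/kansas/runtime/.llama-server.pid
#   ExecStart=/usr/bin/bash %h/kansas/scripts/deploy_gemma.sh --start
#   ExecStop=/usr/bin/bash %h/kansas/scripts/deploy_gemma.sh --stop
#   Restart=on-failure
#
#   [Install]
#   WantedBy=default.target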