Spaces:
Configuration error
Configuration error
| # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # | |
| # ini_claw setup for DGX Spark devices. | |
| # | |
| # Spark ships Ubuntu 24.04 (cgroup v2) + Docker 28.x but no k3s. | |
| # OpenShell's gateway starts k3s inside a Docker container, which | |
| # needs cgroup host namespace access. This script configures Docker | |
| # for ini_claw on DGX Spark, then hands off to the normal setup.sh. | |
| # | |
| # Usage: | |
| # sudo ini_claw setup-spark | |
| # # or directly: | |
| # sudo bash scripts/setup-spark.sh | |
| # | |
| # What it does (beyond setup.sh): | |
| # 1. Adds current user to docker group (avoids sudo for everything else) | |
| # 2. Configures Docker daemon for cgroupns=host (k3s-in-Docker on cgroup v2) | |
| # 3. Restarts Docker (only when the daemon config was changed) | |
| # 4. Installs vLLM if missing and starts it on :8000 when nvidia-smi is available | |
| # 5. Runs the normal setup.sh | |
set -euo pipefail

# ANSI colour escapes; stored as literal backslash sequences and expanded
# by printf's %b in the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Log helpers. Each prints ">>> <message>" with a coloured prefix.
#   $1 - message text (backslash escapes are expanded, as with echo -e)
# info writes to stdout; warn and fail write to stderr so diagnostics are
# not mixed into captured output. fail additionally exits with status 1.
info() { printf '%b\n' "${GREEN}>>>${NC} $1"; }
warn() { printf '%b\n' "${YELLOW}>>>${NC} $1" >&2; }
fail() {
  printf '%b\n' "${RED}>>>${NC} $1" >&2
  exit 1
}

# Absolute directory containing this script; used to locate setup.sh.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# ── Pre-flight checks ─────────────────────────────────────────────

# Only Linux hosts are supported by this script.
[[ "$(uname -s)" == "Linux" ]] \
  || fail "This script is for DGX Spark (Linux). Use 'ini_claw setup' for macOS."

# Everything below edits system configuration, so root is required.
[[ "$(id -u)" -eq 0 ]] \
  || fail "Must run as root: sudo ini_claw setup-spark"

# Work out who invoked sudo: prefer SUDO_USER, otherwise ask logname.
# An empty result is tolerated; the docker-group step is skipped later.
if [[ -n "${SUDO_USER:-}" ]]; then
  REAL_USER="$SUDO_USER"
else
  REAL_USER="$(logname 2>/dev/null || echo "")"
fi
[[ -n "$REAL_USER" ]] \
  || warn "Could not detect non-root user. Docker group will not be configured."

# Docker itself is a hard prerequisite.
if ! command -v docker > /dev/null; then
  fail "Docker not found. DGX Spark should have Docker pre-installed."
fi
# ── 1. Docker group ───────────────────────────────────────────────

# Succeeds when group name $2 is an exact entry of the space-separated
# group list $1 (the format printed by `id -nG`).
# A word-boundary grep is not enough here: it would also accept names
# such as "docker-users" and wrongly skip the usermod step.
group_list_has() {
  case " $1 " in
    *" $2 "*) return 0 ;;
  esac
  return 1
}

if [ -n "${REAL_USER:-}" ]; then
  user_groups="$(id -nG "$REAL_USER")" \
    || fail "Could not look up groups for user '$REAL_USER'."
  if group_list_has "$user_groups" docker; then
    info "User '$REAL_USER' already in docker group"
  else
    info "Adding '$REAL_USER' to docker group..."
    usermod -aG docker "$REAL_USER"
    info "Added. Group will take effect on next login (or use 'newgrp docker')."
  fi
fi
# ── 2. Docker cgroup namespace ────────────────────────────────────
#
# Spark runs cgroup v2 (Ubuntu 24.04). OpenShell's gateway embeds
# k3s in a Docker container, which needs --cgroupns=host to manage
# cgroup hierarchies. Without this, kubelet fails with:
#   "openat2 /sys/fs/cgroup/kubepods/pids.max: no"
#
# Setting default-cgroupns-mode=host in daemon.json makes all
# containers use the host cgroup namespace. This is safe — it's
# the Docker default on cgroup v1 hosts anyway.
#
# Sets NEEDS_RESTART=true whenever daemon.json is created or modified.

DAEMON_JSON="/etc/docker/daemon.json"
NEEDS_RESTART=false

if [ -s "$DAEMON_JSON" ]; then
  command -v python3 > /dev/null \
    || fail "python3 is required to edit $DAEMON_JSON."

  # Read the current mode with a real JSON parser. The path is passed as
  # an argument rather than spliced into the Python source. A file that
  # does not parse is reported instead of being silently overwritten.
  if ! CURRENT_MODE="$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    cfg = json.load(f)
if not isinstance(cfg, dict):
    sys.exit("top-level JSON value is not an object")
print(cfg.get("default-cgroupns-mode", ""))
' "$DAEMON_JSON")"; then
    fail "Could not parse $DAEMON_JSON. Fix or remove it, then re-run."
  fi

  if [ "$CURRENT_MODE" = "host" ]; then
    info "Docker daemon already configured for cgroupns=host"
  else
    if [ -n "$CURRENT_MODE" ]; then
      info "Updating Docker daemon cgroupns mode to 'host'..."
    else
      info "Adding cgroupns=host to Docker daemon config..."
    fi
    # Keep a copy of the previous config so the change can be reverted.
    cp -p -- "$DAEMON_JSON" "$DAEMON_JSON.bak"
    python3 -c '
import json, sys
path = sys.argv[1]
with open(path) as f:
    cfg = json.load(f)
cfg["default-cgroupns-mode"] = "host"
with open(path, "w") as f:
    json.dump(cfg, f, indent=2)
    f.write("\n")
' "$DAEMON_JSON"
    NEEDS_RESTART=true
  fi
else
  # Missing or zero-length file: write a fresh minimal config.
  info "Creating Docker daemon config with cgroupns=host..."
  mkdir -p "$(dirname "$DAEMON_JSON")"
  echo '{ "default-cgroupns-mode": "host" }' > "$DAEMON_JSON"
  NEEDS_RESTART=true
fi
# ── 3. Restart Docker if needed ───────────────────────────────────
# Only runs when the daemon config was changed above. After the restart,
# `docker info` is probed up to 10 times with a 2-second pause between
# probes; the script aborts if the 10th probe still fails.
if [ "$NEEDS_RESTART" = true ]; then
  info "Restarting Docker daemon..."
  systemctl restart docker

  attempt=1
  until docker info > /dev/null 2>&1; do
    if [ "$attempt" -ge 10 ]; then
      fail "Docker didn't come back after restart. Check 'systemctl status docker'."
    fi
    sleep 2
    attempt=$((attempt + 1))
  done

  info "Docker restarted with cgroupns=host"
fi
# ── 4. Install and start vLLM (local inference on Spark GPU) ──────

VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b"
VLLM_MODELS_URL="http://localhost:8000/v1/models"

# Succeeds when the local server answers the models endpoint with a
# successful HTTP status. -f turns HTTP error responses into a failure
# and --max-time keeps a hung connection from stalling the script.
vllm_ready() {
  curl -sf --max-time 5 "$VLLM_MODELS_URL" > /dev/null 2>&1
}

if python3 -c "import vllm" 2>/dev/null; then
  info "vLLM already installed"
else
  info "Installing vLLM..."
  # Only pip's last output line is shown; pipefail still surfaces a
  # failed install, which is reported explicitly.
  pip3 install --break-system-packages vllm 2>&1 | tail -1 \
    || fail "vLLM installation failed."
  info "vLLM installed"
fi

# Start vLLM if not already running
if vllm_ready; then
  info "vLLM already running on :8000"
elif python3 -c "import vllm" 2>/dev/null && command -v nvidia-smi > /dev/null 2>&1; then
  info "Starting vLLM with $VLLM_MODEL..."
  # NOTE(review): the server is started by this (root) script, binds to
  # 0.0.0.0, and logs to a fixed path in /tmp — confirm this is intended.
  nohup python3 -m vllm.entrypoints.openai.api_server \
    --model "$VLLM_MODEL" \
    --port 8000 \
    --host 0.0.0.0 \
    > /tmp/vllm-server.log 2>&1 &
  VLLM_PID=$!

  # Wait for vLLM to be ready (model loading can take a while)
  info "Waiting for vLLM to load model (this can take a few minutes)..."
  vllm_state="timeout"
  for ((i = 1; i <= 120; i++)); do
    if vllm_ready; then
      vllm_state="ready"
      break
    fi
    # Stop waiting as soon as the server process is gone.
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
      vllm_state="exited"
      break
    fi
    sleep 2
  done

  case "$vllm_state" in
    ready)
      info "vLLM ready (PID $VLLM_PID)"
      ;;
    exited)
      warn "vLLM exited. Check /tmp/vllm-server.log"
      ;;
    *)
      # Previously the loop ended silently here; say so and carry on.
      warn "vLLM (PID $VLLM_PID) is not ready yet; continuing. Check /tmp/vllm-server.log"
      ;;
  esac
fi
# ── 5. Run normal setup ──────────────────────────────────────────
info "Running IniClaw setup..."
echo ""

# Build the command once: plain `bash setup.sh` by default, wrapped in
# `sudo -u <user> -E VAR=value ...` when a non-root user was detected so
# that setup.sh runs as that user (docker group, not root).
setup_cmd=(bash "$SCRIPT_DIR/setup.sh")
if [ -n "$REAL_USER" ]; then
  # Pass through env vars that setup.sh needs.
  # NOTE(review): NVIDIA_API_KEY is handed to sudo as a command-line
  # argument here — confirm that exposure is acceptable.
  setup_cmd=(
    sudo -u "$REAL_USER" -E
    NVIDIA_API_KEY="${NVIDIA_API_KEY:-}"
    DOCKER_HOST="${DOCKER_HOST:-}"
    "${setup_cmd[@]}"
  )
fi
"${setup_cmd[@]}"