#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ini_claw setup for DGX Spark devices.
#
# Spark ships Ubuntu 24.04 (cgroup v2) + Docker 28.x but no k3s.
# OpenShell's gateway starts k3s inside a Docker container, which
# needs cgroup host namespace access. This script configures Docker
# for ini_claw on DGX Spark, then hands off to the normal setup.sh.
#
# Usage:
# sudo ini_claw setup-spark
# # or directly:
# sudo bash scripts/setup-spark.sh
#
# What it does (beyond setup.sh):
# 1. Adds current user to docker group (avoids sudo for everything else)
# 2. Configures Docker daemon for cgroupns=host (k3s-in-Docker on cgroup v2)
# 3. Restarts Docker
# 4. Runs the normal setup.sh
set -euo pipefail

# ANSI color codes for status output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# info/warn print a colored ">>>" prefix to stdout.
# fail prints the message to stderr (it is a diagnostic) and exits 1.
info() { echo -e "${GREEN}>>>${NC} $1"; }
warn() { echo -e "${YELLOW}>>>${NC} $1"; }
fail() { echo -e "${RED}>>>${NC} $1" >&2; exit 1; }

# Absolute directory containing this script, used to locate setup.sh.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# ββ Pre-flight checks βββββββββββββββββββββββββββββββββββββββββββββ
if [ "$(uname -s)" != "Linux" ]; then
fail "This script is for DGX Spark (Linux). Use 'ini_claw setup' for macOS."
fi
if [ "$(id -u)" -ne 0 ]; then
fail "Must run as root: sudo ini_claw setup-spark"
fi
# Detect the real user (not root) for docker group add
REAL_USER="${SUDO_USER:-$(logname 2>/dev/null || echo "")}"
if [ -z "$REAL_USER" ]; then
warn "Could not detect non-root user. Docker group will not be configured."
fi
command -v docker > /dev/null || fail "Docker not found. DGX Spark should have Docker pre-installed."
# ββ 1. Docker group βββββββββββββββββββββββββββββββββββββββββββββββ
if [ -n "$REAL_USER" ]; then
if id -nG "$REAL_USER" | grep -qw docker; then
info "User '$REAL_USER' already in docker group"
else
info "Adding '$REAL_USER' to docker group..."
usermod -aG docker "$REAL_USER"
info "Added. Group will take effect on next login (or use 'newgrp docker')."
fi
fi
# ββ 2. Docker cgroup namespace ββββββββββββββββββββββββββββββββββββ
#
# Spark runs cgroup v2 (Ubuntu 24.04). OpenShell's gateway embeds
# k3s in a Docker container, which needs --cgroupns=host to manage
# cgroup hierarchies. Without this, kubelet fails with:
# "openat2 /sys/fs/cgroup/kubepods/pids.max: no"
#
# Setting default-cgroupns-mode=host in daemon.json makes all
# containers use the host cgroup namespace. This is safe β it's
# the Docker default on cgroup v1 hosts anyway.
DAEMON_JSON="/etc/docker/daemon.json"
NEEDS_RESTART=false
if [ -f "$DAEMON_JSON" ]; then
# Check if already configured
if grep -q '"default-cgroupns-mode"' "$DAEMON_JSON" 2>/dev/null; then
CURRENT_MODE=$(python3 -c "import json; print(json.load(open('$DAEMON_JSON')).get('default-cgroupns-mode',''))" 2>/dev/null || echo "")
if [ "$CURRENT_MODE" = "host" ]; then
info "Docker daemon already configured for cgroupns=host"
else
info "Updating Docker daemon cgroupns mode to 'host'..."
python3 -c "
import json
with open('$DAEMON_JSON') as f:
d = json.load(f)
d['default-cgroupns-mode'] = 'host'
with open('$DAEMON_JSON', 'w') as f:
json.dump(d, f, indent=2)
"
NEEDS_RESTART=true
fi
else
info "Adding cgroupns=host to Docker daemon config..."
python3 -c "
import json
try:
with open('$DAEMON_JSON') as f:
d = json.load(f)
except:
d = {}
d['default-cgroupns-mode'] = 'host'
with open('$DAEMON_JSON', 'w') as f:
json.dump(d, f, indent=2)
"
NEEDS_RESTART=true
fi
else
info "Configuring Docker for ini_clawβ¦"n config with cgroupns=host..."
mkdir -p "$(dirname "$DAEMON_JSON")"
echo '{ "default-cgroupns-mode": "host" }' > "$DAEMON_JSON"
NEEDS_RESTART=true
fi
# ββ 3. Restart Docker if needed βββββββββββββββββββββββββββββββββββ
if [ "$NEEDS_RESTART" = true ]; then
info "Restarting Docker daemon..."
systemctl restart docker
# Wait for Docker to be ready
for i in 1 2 3 4 5 6 7 8 9 10; do
if docker info > /dev/null 2>&1; then
break
fi
[ "$i" -eq 10 ] && fail "Docker didn't come back after restart. Check 'systemctl status docker'."
sleep 2
done
info "Docker restarted with cgroupns=host"
fi
# ββ 4. Install and start vLLM (local inference on Spark GPU) ββββββ
if ! python3 -c "import vllm" 2>/dev/null; then
info "Installing vLLM..."
pip3 install --break-system-packages vllm 2>&1 | tail -1
info "vLLM installed"
else
info "vLLM already installed"
fi
# Start vLLM if not already running
VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b"
if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then
info "vLLM already running on :8000"
else
if python3 -c "import vllm" 2>/dev/null && command -v nvidia-smi > /dev/null 2>&1; then
info "Starting vLLM with $VLLM_MODEL..."
nohup python3 -m vllm.entrypoints.openai.api_server \
--model "$VLLM_MODEL" \
--port 8000 \
--host 0.0.0.0 \
> /tmp/vllm-server.log 2>&1 &
VLLM_PID=$!
# Wait for vLLM to be ready (model loading can take a while)
info "Waiting for vLLM to load model (this can take a few minutes)..."
for i in $(seq 1 120); do
if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then
info "vLLM ready (PID $VLLM_PID)"
break
fi
if ! kill -0 "$VLLM_PID" 2>/dev/null; then
warn "vLLM exited. Check /tmp/vllm-server.log"
break
fi
sleep 2
done
fi
fi
# ββ 5. Run normal setup ββββββββββββββββββββββββββββββββββββββββββ
info "Running IniClaw setup..."
echo ""
# Drop back to the real user for setup.sh (uses docker group, not root)
if [ -n "$REAL_USER" ]; then
# Pass through env vars that setup.sh needs
sudo -u "$REAL_USER" -E \
NVIDIA_API_KEY="${NVIDIA_API_KEY:-}" \
DOCKER_HOST="${DOCKER_HOST:-}" \
bash "$SCRIPT_DIR/setup.sh"
else
bash "$SCRIPT_DIR/setup.sh"
fi