File size: 6,398 Bytes
0722e92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ini_claw setup for DGX Spark devices.
#
# Spark ships Ubuntu 24.04 (cgroup v2) + Docker 28.x but no k3s.
# OpenShell's gateway starts k3s inside a Docker container, which
# needs cgroup host namespace access. This script configures Docker
# for ini_claw on DGX Spark, then hands off to the normal setup.sh.
#
# Usage:
#   sudo ini_claw setup-spark
#   # or directly:
#   sudo bash scripts/setup-spark.sh
#
# What it does (beyond setup.sh):
#   1. Adds current user to docker group (avoids sudo for everything else)
#   2. Configures Docker daemon for cgroupns=host (k3s-in-Docker on cgroup v2)
#   3. Restarts Docker
#   4. Runs the normal setup.sh

set -euo pipefail

# ANSI color codes for the log helpers below (constants, so readonly).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# Informational progress message (stdout).
info() { echo -e "${GREEN}>>>${NC} $1"; }
# Warning message. Sent to stderr so diagnostics don't pollute stdout.
warn() { echo -e "${YELLOW}>>>${NC} $1" >&2; }
# Fatal error: print to stderr and abort the script with status 1.
fail() { echo -e "${RED}>>>${NC} $1" >&2; exit 1; }

# Absolute directory containing this script (used later to locate setup.sh).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# ── Pre-flight checks ─────────────────────────────────────────────

# This flow is DGX-Spark-specific; refuse to run anywhere but Linux.
[[ "$(uname -s)" == "Linux" ]] \
  || fail "This script is for DGX Spark (Linux). Use 'ini_claw setup' for macOS."

# Root is needed for usermod, /etc/docker writes, and systemctl below.
[[ "$(id -u)" -eq 0 ]] \
  || fail "Must run as root: sudo ini_claw setup-spark"

# Detect the real user (not root) for docker group add
REAL_USER="${SUDO_USER:-$(logname 2>/dev/null || echo "")}"
[[ -n "$REAL_USER" ]] \
  || warn "Could not detect non-root user. Docker group will not be configured."

# Spark images ship Docker; a missing binary means something is very wrong.
if ! command -v docker > /dev/null; then
  fail "Docker not found. DGX Spark should have Docker pre-installed."
fi

# ── 1. Docker group ───────────────────────────────────────────────

# Put the detected user in the docker group so the remaining steps
# (and day-to-day use) don't require sudo for every docker command.
if [[ -n "$REAL_USER" ]]; then
  if ! id -nG "$REAL_USER" | grep -qw docker; then
    info "Adding '$REAL_USER' to docker group..."
    usermod -aG docker "$REAL_USER"
    info "Added. Group will take effect on next login (or use 'newgrp docker')."
  else
    info "User '$REAL_USER' already in docker group"
  fi
fi

# ── 2. Docker cgroup namespace ────────────────────────────────────
#
# Spark runs cgroup v2 (Ubuntu 24.04). OpenShell's gateway embeds
# k3s in a Docker container, which needs --cgroupns=host to manage
# cgroup hierarchies. Without this, kubelet fails with:
#   "openat2 /sys/fs/cgroup/kubepods/pids.max: no"
#
# Setting default-cgroupns-mode=host in daemon.json makes all
# containers use the host cgroup namespace. This is safe — it's
# the Docker default on cgroup v1 hosts anyway.

DAEMON_JSON="/etc/docker/daemon.json"
NEEDS_RESTART=false

if [ -f "$DAEMON_JSON" ]; then
  # Check if already configured
  if grep -q '"default-cgroupns-mode"' "$DAEMON_JSON" 2>/dev/null; then
    # Key exists: read its current value (empty string if the file
    # doesn't parse as JSON). Path is passed via argv, not interpolated
    # into the Python source, so special characters can't break it.
    CURRENT_MODE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('default-cgroupns-mode',''))" "$DAEMON_JSON" 2>/dev/null || echo "")
    if [ "$CURRENT_MODE" = "host" ]; then
      info "Docker daemon already configured for cgroupns=host"
    else
      info "Updating Docker daemon cgroupns mode to 'host'..."
      python3 - "$DAEMON_JSON" <<'PYEOF'
import json, sys
path = sys.argv[1]
with open(path) as f:
    d = json.load(f)
d['default-cgroupns-mode'] = 'host'
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PYEOF
      NEEDS_RESTART=true
    fi
  else
    info "Adding cgroupns=host to Docker daemon config..."
    python3 - "$DAEMON_JSON" <<'PYEOF'
import json, sys
path = sys.argv[1]
try:
    with open(path) as f:
        d = json.load(f)
except (OSError, ValueError):
    # Unreadable or invalid JSON: deliberately start from an empty
    # config rather than failing — best-effort merge.
    d = {}
d['default-cgroupns-mode'] = 'host'
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PYEOF
    NEEDS_RESTART=true
  fi
else
  # BUG FIX: this branch previously contained a corrupted line
  # (`info "..."n config ..."`) that ran a nonexistent command `n`
  # and aborted the script under `set -e` whenever daemon.json was
  # absent — i.e. on the typical first run.
  info "Creating Docker daemon config with cgroupns=host..."
  mkdir -p "$(dirname "$DAEMON_JSON")"
  echo '{ "default-cgroupns-mode": "host" }' > "$DAEMON_JSON"
  NEEDS_RESTART=true
fi

# ── 3. Restart Docker if needed ───────────────────────────────────

# Bounce the daemon only when the config actually changed, then poll
# (10 attempts, 2s apart) until it answers again before moving on.
if [[ "$NEEDS_RESTART" == true ]]; then
  info "Restarting Docker daemon..."
  systemctl restart docker
  for ((attempt = 1; attempt <= 10; attempt++)); do
    if docker info > /dev/null 2>&1; then
      break
    fi
    if (( attempt == 10 )); then
      fail "Docker didn't come back after restart. Check 'systemctl status docker'."
    fi
    sleep 2
  done
  info "Docker restarted with cgroupns=host"
fi

# ── 4. Install and start vLLM (local inference on Spark GPU) ──────

# Install vLLM into the system Python if it isn't importable yet.
if ! python3 -c "import vllm" 2>/dev/null; then
  info "Installing vLLM..."
  # --break-system-packages: Ubuntu 24.04 marks the system env as
  # externally managed (PEP 668). `tail -1` keeps the log terse;
  # `pipefail` still surfaces a pip failure through the pipeline.
  pip3 install --break-system-packages vllm 2>&1 | tail -1
  info "vLLM installed"
else
  info "vLLM already installed"
fi

# Start vLLM if not already running
VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b"
if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then
  info "vLLM already running on :8000"
else
  # Only attempt a launch when vLLM imports and a GPU is visible.
  if python3 -c "import vllm" 2>/dev/null && command -v nvidia-smi > /dev/null 2>&1; then
    info "Starting vLLM with $VLLM_MODEL..."
    nohup python3 -m vllm.entrypoints.openai.api_server \
      --model "$VLLM_MODEL" \
      --port 8000 \
      --host 0.0.0.0 \
      > /tmp/vllm-server.log 2>&1 &
    VLLM_PID=$!
    # Wait for vLLM to be ready (model loading can take a while)
    info "Waiting for vLLM to load model (this can take a few minutes)..."
    VLLM_READY=false
    for _ in $(seq 1 120); do
      if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then
        info "vLLM ready (PID $VLLM_PID)"
        VLLM_READY=true
        break
      fi
      if ! kill -0 "$VLLM_PID" 2>/dev/null; then
        warn "vLLM exited. Check /tmp/vllm-server.log"
        break
      fi
      sleep 2
    done
    # BUG FIX: the original loop fell through silently after ~4 minutes,
    # leaving no indication that inference never came up. Surface it.
    if [ "$VLLM_READY" != true ] && kill -0 "$VLLM_PID" 2>/dev/null; then
      warn "vLLM still loading after timeout; continuing. Check /tmp/vllm-server.log"
    fi
  fi
fi

# ── 5. Run normal setup ──────────────────────────────────────────

info "Running IniClaw setup..."
echo ""

# Hand off to the standard setup script. When the real user is known,
# run it as them (relying on the docker group added above rather than
# root); otherwise fall back to running it as root.
SETUP_SCRIPT="$SCRIPT_DIR/setup.sh"
if [[ -z "$REAL_USER" ]]; then
  bash "$SETUP_SCRIPT"
else
  # -E plus explicit assignments: forward the env vars setup.sh needs.
  sudo -u "$REAL_USER" -E \
    NVIDIA_API_KEY="${NVIDIA_API_KEY:-}" \
    DOCKER_HOST="${DOCKER_HOST:-}" \
    bash "$SETUP_SCRIPT"
fi