RamMAC / setup-worker.ps1
Aaryan17's picture
feat: upload full MAC source (mac/, frontend/, alembic/, tests/)
9c0b225 verified
# ═══════════════════════════════════════════════════════════
# MAC Worker Node Setup Script (Windows)
# Run this on each worker PC (PC2, PC3, etc.)
#
# Prerequisites: Windows 10/11, NVIDIA GPU, Admin PowerShell
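# This script installs WSL2 and Docker Desktop when they are missing and verifies
# that Docker containers can reach the GPU (it does not install the NVIDIA
# Container Toolkit itself; Docker Desktop's WSL2 backend is expected to provide it).
# It then deploys the MAC worker: a vLLM container plus a heartbeat agent.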
#
# Usage (run as Administrator):
# Set-ExecutionPolicy Bypass -Scope Process -Force
# .\setup-worker.ps1
# ═══════════════════════════════════════════════════════════
# Script parameters. Every value may be supplied on the command line; the ones
# left empty (EnrollmentToken, NodeName, Model) are asked for interactively later.
param(
# Address of the control node; used to build http://<ip>:8000 (API) and the admin-panel hint.
[string]$ControlNodeIP = "10.10.13.30",
# Enrollment token written to .env as ENROLLMENT_TOKEN; prompted for when empty.
[string]$EnrollmentToken = "",
# Name this worker registers under; when empty the prompt defaults to worker-<COMPUTERNAME>.
[string]$NodeName = "",
# HuggingFace model id passed to vLLM; when empty a selection menu is shown.
[string]$Model = "",
# TCP port vLLM listens on; also used for the firewall rule and the published container port.
[int]$VllmPort = 8001,
# Value written to GPU_MEM_UTIL and passed to vLLM as --gpu-memory-utilization.
[float]$GpuMemUtil = 0.85,
# Value written to MAX_MODEL_LEN and passed to vLLM as --max-model-len.
[int]$MaxModelLen = 8192
)
# Treat PowerShell errors as terminating so a failed cmdlet aborts the script
# (or lands in the nearest catch block).
$ErrorActionPreference = "Stop"
# Prints a section heading: a blank line followed by "[MAC] <msg>" in cyan.
function Write-Step {
    param([string]$msg)
    $heading = "`n[MAC] $msg"
    Write-Host -Object $heading -ForegroundColor Cyan
}
# Prints a success line "[OK] <msg>" in green.
function Write-Ok {
    param([string]$msg)
    $text = "[OK] $msg"
    Write-Host -Object $text -ForegroundColor Green
}
# Prints a warning line "[!] <msg>" in yellow.
function Write-Warn {
    param([string]$msg)
    $text = "[!] $msg"
    Write-Host -Object $text -ForegroundColor Yellow
}
# Start-up banner. The box-drawing characters and the em-dash were mis-decoded
# (UTF-8 bytes read through a legacy code page); they are restored here.
# Save this file as UTF-8 with BOM so Windows PowerShell 5.1 reads them correctly.
Write-Host @"
╔══════════════════════════════════════════════════╗
║     MAC Worker Node Setup — MBM AI Cloud         ║
║     Setting up GPU inference worker              ║
╚══════════════════════════════════════════════════╝
"@ -ForegroundColor Magenta
# ── Verify admin privileges ──────────────────────────────
# Abort early unless the current Windows identity holds the built-in Administrator role.
$currentIdentity = [Security.Principal.WindowsIdentity]::GetCurrent()
$currentPrincipal = [Security.Principal.WindowsPrincipal]$currentIdentity
$adminRole = [Security.Principal.WindowsBuiltInRole]::Administrator
$isAdmin = $currentPrincipal.IsInRole($adminRole)
if (-not $isAdmin) {
    Write-Host "ERROR: This script must be run as Administrator" -ForegroundColor Red
    Write-Host "Right-click PowerShell -> Run as Administrator" -ForegroundColor Yellow
    exit 1
}
# ── Check NVIDIA driver ─────────────────────────────────
# nvidia-smi must exist and print at least one GPU line; otherwise stop with a hint.
Write-Step "Checking NVIDIA GPU..."
$gpu = $null
try {
    $gpu = & nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader 2>$null
} catch {
    # A missing or failing nvidia-smi is handled the same way as "no output".
    $gpu = $null
}
if (-not $gpu) {
    Write-Host "ERROR: NVIDIA GPU or driver not found. Install latest NVIDIA drivers first:" -ForegroundColor Red
    Write-Host " https://www.nvidia.com/Download/index.aspx" -ForegroundColor Yellow
    exit 1
}
Write-Ok "GPU found: $gpu"
# ── Step 1: Install/Enable WSL2 ─────────────────────────
# "wsl --version" exiting with 0 is taken as proof that WSL is present.
Write-Step "Checking WSL2..."
$wslInstalled = $false
try {
    $wslVersion = wsl --version 2>$null
    $wslInstalled = ($LASTEXITCODE -eq 0)
} catch {}
if ($wslInstalled) {
    Write-Ok "WSL2 is already installed"
} else {
    Write-Step "Installing WSL2 (this may require a reboot)..."
    wsl --install --no-distribution
    Write-Warn "WSL2 installed. If prompted, REBOOT and re-run this script."
    Write-Host "After reboot, run: .\setup-worker.ps1" -ForegroundColor Yellow
    # Give the operator a chance to stop here and reboot before continuing.
    Read-Host "Press Enter to continue (or Ctrl+C to reboot first)"
}
# ── Step 2: Install Docker Desktop ──────────────────────
# Ensures a Docker engine is reachable. Three cases are distinguished:
#   * engine answers                      -> nothing to do
#   * CLI present, engine not answering   -> ask the operator to start Docker Desktop
#   * CLI missing                         -> download and install Docker Desktop
Write-Step "Checking Docker..."
$dockerInstalled = $false
try {
    $dockerVer = docker version --format '{{.Server.Version}}' 2>$null
    if ($LASTEXITCODE -eq 0 -and $dockerVer) { $dockerInstalled = $true }
} catch {}
if (-not $dockerInstalled) {
    $dockerCli = Get-Command docker -ErrorAction SilentlyContinue
    if ($dockerCli) {
        # Docker is installed but the daemon is down: reinstalling would not help.
        Write-Warn "Docker is installed but the engine is not responding."
        Write-Host " Start Docker Desktop and wait for it to finish starting (whale icon in taskbar)" -ForegroundColor Yellow
    } else {
        Write-Step "Downloading Docker Desktop..."
        $dockerUrl = "https://desktop.docker.com/win/main/amd64/Docker%20Desktop%20Installer.exe"
        $installerPath = "$env:TEMP\DockerDesktopInstaller.exe"
        if (-not (Test-Path $installerPath)) {
            # Download under a temporary name and rename on success, so an
            # interrupted download is never mistaken for a complete installer.
            $partialPath = "$installerPath.partial"
            Invoke-WebRequest -Uri $dockerUrl -OutFile $partialPath -UseBasicParsing
            Move-Item -Path $partialPath -Destination $installerPath -Force
        }
        Write-Step "Installing Docker Desktop (this takes a few minutes)..."
        Start-Process -FilePath $installerPath -ArgumentList "install","--quiet","--accept-license" -Wait -NoNewWindow
        Write-Warn "Docker Desktop installed. You need to:"
        Write-Host " 1. Start Docker Desktop from Start Menu" -ForegroundColor Yellow
        Write-Host " 2. Wait for it to finish starting (whale icon in taskbar)" -ForegroundColor Yellow
        Write-Host " 3. Re-run this script" -ForegroundColor Yellow
    }
    Read-Host "Press Enter when Docker Desktop is running"
    # The installer edits the persisted PATH, which this already-running session
    # does not see. Append the persisted values so "docker" can be resolved below.
    $machinePath = [Environment]::GetEnvironmentVariable("Path", "Machine")
    $userPath = [Environment]::GetEnvironmentVariable("Path", "User")
    $env:Path = (@($env:Path, $machinePath, $userPath) | Where-Object { $_ }) -join ";"
    # Re-check
    try {
        $dockerVer = docker version --format '{{.Server.Version}}' 2>$null
        if ($LASTEXITCODE -ne 0) { throw "Docker not ready" }
        $dockerInstalled = $true
    } catch {
        Write-Host "Docker is not running yet. Start Docker Desktop and re-run." -ForegroundColor Red
        exit 1
    }
} else {
    Write-Ok "Docker is installed: $dockerVer"
}
# ── Step 3: Verify Docker GPU support ───────────────────
# Runs nvidia-smi inside a CUDA base container; exit code 0 means GPU passthrough works.
Write-Step "Checking Docker GPU support..."
$gpuOk = $false
$previousErrorAction = $ErrorActionPreference
try {
    # On first run docker prints image-pull progress to stderr. In Windows
    # PowerShell 5.1, stderr from a native command redirected with "2>" becomes a
    # terminating error while $ErrorActionPreference is "Stop", which would report
    # a working setup as broken. Relax the preference for this one call only.
    $ErrorActionPreference = "Continue"
    $gpuTest = docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi 2>$null
    $gpuOk = ($LASTEXITCODE -eq 0)
} catch {
    $gpuOk = $false
} finally {
    $ErrorActionPreference = $previousErrorAction
}
if ($gpuOk) {
    Write-Ok "Docker GPU (NVIDIA Container Toolkit) is working"
} else {
    Write-Warn "Docker GPU test failed. Ensure Docker Desktop has WSL2 backend enabled."
    Write-Host " Docker Desktop -> Settings -> General -> Use WSL2 based engine = ON" -ForegroundColor Yellow
    Write-Host " Docker Desktop -> Settings -> Resources -> WSL Integration -> Enable" -ForegroundColor Yellow
    Read-Host "Fix the settings and press Enter to continue"
}
# ── Step 3b: Open firewall for vLLM port ────────────────
# Adds an inbound TCP allow rule for the vLLM port (Private and Domain profiles)
# unless a rule with the same display name is already present.
Write-Step "Configuring firewall..."
$fwRule = "MAC vLLM Worker ($VllmPort)"
$existing = Get-NetFirewallRule -DisplayName $fwRule -ErrorAction SilentlyContinue
if ($existing) {
    Write-Ok "Firewall rule already exists for port $VllmPort"
} else {
    $ruleSettings = @{
        DisplayName = $fwRule
        Direction   = "Inbound"
        Protocol    = "TCP"
        LocalPort   = $VllmPort
        Action      = "Allow"
        Profile     = "Private", "Domain"
    }
    New-NetFirewallRule @ruleSettings | Out-Null
    Write-Ok "Firewall rule created for port $VllmPort"
}
# ── Step 4: Get deployment parameters ───────────────────
Write-Step "Configuring worker node..."
# Node name: ask only when it was not passed in; an empty answer selects the default.
if ([string]::IsNullOrEmpty($NodeName)) {
    $hostname = $env:COMPUTERNAME
    $fallbackName = "worker-$hostname"
    $NodeName = Read-Host "Enter node name (default: worker-$hostname)"
    if ([string]::IsNullOrEmpty($NodeName)) {
        $NodeName = $fallbackName
    }
}
# Enrollment token: mandatory. Prompt when it was not passed in and stop if still empty.
if ([string]::IsNullOrEmpty($EnrollmentToken)) {
    $tokenHints = @(
        "`nYou need an enrollment token from the MAC admin panel.",
        "Ask the admin to generate one at: http://$ControlNodeIP/admin -> Cluster -> Generate Token`n"
    )
    foreach ($hint in $tokenHints) {
        Write-Host $hint -ForegroundColor Yellow
    }
    $EnrollmentToken = Read-Host "Paste enrollment token"
    if ([string]::IsNullOrEmpty($EnrollmentToken)) {
        Write-Host "ERROR: Enrollment token is required" -ForegroundColor Red
        exit 1
    }
}
# Model: show a menu when no model was passed in. Any unrecognised choice, and an
# empty custom name under option 4, falls back to the default so that VLLM_MODEL
# can never be written out empty.
if (-not $Model) {
    $defaultModel = "Qwen/Qwen2.5-Coder-7B-Instruct-AWQ"
    Write-Host "`nChoose a model for this worker:" -ForegroundColor Yellow
    Write-Host " [1] Qwen2.5-Coder-7B — Code generation & debugging (recommended for PC2)" -ForegroundColor White
    Write-Host " [2] DeepSeek-R1-7B — Math & reasoning (recommended for PC3)" -ForegroundColor White
    Write-Host " [3] Qwen2.5-7B-Instruct — General chat (same as PC1)" -ForegroundColor White
    Write-Host " [4] Custom model — Enter HuggingFace model name" -ForegroundColor White
    # Trim so that "1 " or " 4" still match.
    $choice = ([string](Read-Host "Select (1-4)")).Trim()
    switch ($choice) {
        "1" { $Model = "Qwen/Qwen2.5-Coder-7B-Instruct-AWQ" }
        "2" { $Model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" }
        "3" { $Model = "Qwen/Qwen2.5-7B-Instruct-AWQ" }
        "4" { $Model = ([string](Read-Host "Enter full HuggingFace model name")).Trim() }
        default { $Model = $defaultModel }
    }
    if (-not $Model) {
        Write-Warn "No model name entered; using default: $defaultModel"
        $Model = $defaultModel
    }
}
# Get local IP
# Preference order: DHCP address on a Wi-Fi adapter, any DHCP address, then any
# other usable IPv4 address (covers statically configured hosts). Loopback,
# link-local (169.254.x.x) and virtual vEthernet adapters are skipped in the last step.
$ipv4Addresses = @(Get-NetIPAddress -AddressFamily IPv4)
$localIP = ($ipv4Addresses | Where-Object { $_.InterfaceAlias -like "*Wi-Fi*" -and $_.PrefixOrigin -eq "Dhcp" } | Select-Object -First 1).IPAddress
if (-not $localIP) {
    $localIP = ($ipv4Addresses | Where-Object { $_.PrefixOrigin -eq "Dhcp" } | Select-Object -First 1).IPAddress
}
if (-not $localIP) {
    $localIP = ($ipv4Addresses | Where-Object {
        $_.IPAddress -notlike "127.*" -and
        $_.IPAddress -notlike "169.254.*" -and
        $_.InterfaceAlias -notlike "vEthernet*"
    } | Select-Object -First 1).IPAddress
}
if ($localIP) {
    Write-Ok "This PC's IP: $localIP"
} else {
    # Do not print an "[OK]" line with a blank address.
    Write-Warn "Could not determine this PC's IPv4 address"
}
# Get CPU cores and RAM
# Win32_Processor returns one instance per socket, so the logical-processor
# counts are summed; otherwise a dual-socket host would yield an array that
# renders as "8 8" in the generated .env.
$cpuCores = [int]((Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum)
$ramMB = [math]::Round((Get-CimInstance Win32_ComputerSystem).TotalPhysicalMemory / 1MB)
# nvidia-smi prints one CSV line per GPU ("<name>, <memory> MiB"); only the first
# GPU is described, so the first line is used.
$gpuInfo = ([string](@(& nvidia-smi --query-gpu=name,memory.total --format=csv,noheader) | Select-Object -First 1)).Trim()
$gpuFields = $gpuInfo -split ","
$gpuName = $gpuFields[0].Trim()
# Strip the unit suffix and keep the digits only.
$gpuVram = [int](([string]$gpuFields[1]).Trim() -replace '[^0-9]','')
# Echo the resolved configuration back to the operator before anything is written.
Write-Host "`n Configuration Summary:" -ForegroundColor Cyan
$summaryLines = @(
    " Node Name: $NodeName",
    " Control Node: $ControlNodeIP",
    " This PC IP: $localIP",
    " GPU: $gpuName ($gpuVram MB)",
    " RAM: $ramMB MB",
    " CPU Cores: $cpuCores",
    " Model: $Model",
    " vLLM Port: $VllmPort"
)
foreach ($summaryLine in $summaryLines) {
    Write-Host $summaryLine
}
# ── Step 5: Create worker directory ─────────────────────
Write-Step "Setting up worker directory..."
$workerDir = "$env:USERPROFILE\mac-worker"
if (-not (Test-Path $workerDir)) { New-Item -ItemType Directory -Path $workerDir | Out-Null }
# Write .env file
# NOTE: the file contains the enrollment token in clear text.
# NODE_IP records the host address detected earlier in this script (may be empty).
$envContent = @"
# MAC Worker Node Configuration
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")
CONTROL_NODE_URL=http://${ControlNodeIP}:8000
ENROLLMENT_TOKEN=${EnrollmentToken}
NODE_NAME=${NodeName}
NODE_IP=${localIP}
VLLM_MODEL=${Model}
VLLM_PORT=${VllmPort}
GPU_MEM_UTIL=${GpuMemUtil}
MAX_MODEL_LEN=${MaxModelLen}
GPU_NAME=${gpuName}
GPU_VRAM_MB=${gpuVram}
RAM_TOTAL_MB=${ramMB}
CPU_CORES=${cpuCores}
HEARTBEAT_INTERVAL=30
"@
# Set-Content in Windows PowerShell 5.1 writes the system ANSI code page, which
# mangles non-ASCII node or GPU names for the Linux containers reading this file.
# Write UTF-8 without BOM and with LF line endings instead.
$envEncoding = New-Object System.Text.UTF8Encoding($false)
$envText = ($envContent -replace "`r`n", "`n") + "`n"
[System.IO.File]::WriteAllText("$workerDir\.env", $envText, $envEncoding)
Write-Ok "Created .env at $workerDir\.env"
# Write docker-compose.yml
# Two services on one bridge network:
#   vllm          - OpenAI-compatible vLLM server using one NVIDIA GPU
#   worker-agent  - Python agent started once vllm reports healthy
# Expansion rules inside this here-string:
#   ${VllmPort}, $NodeName, $Model, $(Get-Date ...) are expanded NOW by PowerShell;
#   `${VLLM_MODEL} etc. are escaped, so the literal ${...} reaches the file and is
#   resolved by Docker Compose from the .env file next to it.
# YAML nesting is expressed purely through indentation, so the leading spaces
# below are significant.
$composeContent = @"
# MAC GPU Worker Node — $NodeName
# Model: $Model
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: mac-vllm-worker
    ports:
      - "${VllmPort}:${VllmPort}"
    environment:
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf-cache:/root/.cache/huggingface
    command: >
      --model `${VLLM_MODEL}
      --port `${VLLM_PORT}
      --gpu-memory-utilization `${GPU_MEM_UTIL}
      --max-model-len `${MAX_MODEL_LEN}
      --trust-remote-code
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    networks:
      - worker-net
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:`${VLLM_PORT}/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
  worker-agent:
    image: python:3.11-slim
    container_name: mac-worker-agent
    env_file: .env
    volumes:
      - ./worker-agent.py:/app/agent.py:ro
      - agent-state:/tmp
    command: >
      bash -c "pip install httpx psutil --quiet && python /app/agent.py"
    depends_on:
      vllm:
        condition: service_healthy
    restart: unless-stopped
    networks:
      - worker-net
volumes:
  hf-cache:
  agent-state:
networks:
  worker-net:
    driver: bridge
"@
# Write UTF-8 without BOM and with LF endings: Set-Content in Windows PowerShell
# 5.1 would emit the ANSI code page, turning the em-dash in the header comment
# into a byte sequence that is not valid UTF-8 for the YAML parser.
$composeEncoding = New-Object System.Text.UTF8Encoding($false)
$composeText = ($composeContent -replace "`r`n", "`n") + "`n"
[System.IO.File]::WriteAllText("$workerDir\docker-compose.yml", $composeText, $composeEncoding)
Write-Ok "Created docker-compose.yml"
# Write worker-agent.py (embedded copy of the agent, mounted into the worker-agent container).
# The here-string is single-quoted, so nothing inside it is expanded by PowerShell.
# Python block structure depends on indentation; keep the leading spaces intact.
$agentScript = @'
#!/usr/bin/env python3
"""MAC Worker Agent — Enrolls with control node and sends periodic heartbeats."""
import asyncio, json, os, socket, sys, time
import httpx

CONTROL_URL = os.environ.get("CONTROL_NODE_URL", "http://10.10.13.30:8000")
ENROLLMENT_TOKEN = os.environ.get("ENROLLMENT_TOKEN", "")
NODE_NAME = os.environ.get("NODE_NAME", f"worker-{socket.gethostname()}")
VLLM_PORT = int(os.environ.get("VLLM_PORT", 8001))
GPU_NAME = os.environ.get("GPU_NAME", "NVIDIA GPU")
GPU_VRAM_MB = int(os.environ.get("GPU_VRAM_MB", 12288))
RAM_TOTAL_MB = int(os.environ.get("RAM_TOTAL_MB", 16384))
CPU_CORES = int(os.environ.get("CPU_CORES", 8))
HEARTBEAT_INTERVAL = int(os.environ.get("HEARTBEAT_INTERVAL", 30))
# Optional: address to advertise to the control node. Inside a container the
# socket-based detection below yields the container's own address, which other
# machines cannot reach, so a deployment may supply the host address here.
NODE_IP = os.environ.get("NODE_IP", "").strip()
# Optional: host name of the vLLM server for the readiness probe. When unset,
# both "localhost" (agent runs next to vLLM) and "vllm" (agent runs as a
# separate Compose service on the same network) are tried.
VLLM_HOST = os.environ.get("VLLM_HOST", "").strip()
API = f"{CONTROL_URL}/api/v1"
STATE_FILE = "/tmp/mac_worker_state.json"


def get_local_ip():
    """Return the source address the OS picks for outbound traffic, or 127.0.0.1 on failure."""
    try:
        # Connecting a UDP socket sends no packet; it only selects a route.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception:
        return "127.0.0.1"


def load_state():
    """Load the persisted enrollment state; a missing or corrupt file yields {}."""
    try:
        with open(STATE_FILE, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def save_state(data):
    """Persist the enrollment state as JSON."""
    with open(STATE_FILE, "w") as f:
        json.dump(data, f)


def get_resource_metrics():
    """Collect CPU/RAM (psutil) and GPU (nvidia-smi) usage; unavailable values stay 0."""
    metrics = {"cpu_util_pct": 0.0, "ram_used_mb": 0, "gpu_util_pct": 0.0, "gpu_vram_used_mb": 0}
    try:
        import psutil
        metrics["cpu_util_pct"] = psutil.cpu_percent(interval=1)
        metrics["ram_used_mb"] = int(psutil.virtual_memory().used / 1024 / 1024)
    except ImportError:
        pass
    try:
        import subprocess
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            parts = result.stdout.strip().split(",")
            if len(parts) >= 2:
                metrics["gpu_util_pct"] = float(parts[0].strip())
                metrics["gpu_vram_used_mb"] = int(float(parts[1].strip()))
    except Exception:
        # Best effort: nvidia-smi may be absent or print something unexpected.
        pass
    return metrics


async def enroll(client):
    """Enroll with the control node (or reuse the saved node id). Returns the id or None."""
    state = load_state()
    if state.get("node_id"):
        print(f"[AGENT] Already enrolled as node {state['node_id']}")
        return state["node_id"]
    if not ENROLLMENT_TOKEN:
        print("[AGENT] ERROR: No ENROLLMENT_TOKEN set")
        return None
    ip = NODE_IP or get_local_ip()
    payload = {
        "enrollment_token": ENROLLMENT_TOKEN, "name": NODE_NAME,
        "hostname": socket.gethostname(), "ip_address": ip, "port": VLLM_PORT,
        "gpu_name": GPU_NAME, "gpu_vram_mb": GPU_VRAM_MB,
        "ram_total_mb": RAM_TOTAL_MB, "cpu_cores": CPU_CORES,
    }
    try:
        resp = await client.post(f"{API}/nodes/enroll", json=payload)
        if resp.status_code == 200:
            data = resp.json()
            node_id = data.get("id")
            save_state({"node_id": node_id, "name": NODE_NAME})
            print(f"[AGENT] Enrolled! Node ID: {node_id}")
            return node_id
        else:
            print(f"[AGENT] Enrollment failed: {resp.status_code} {resp.text}")
            return None
    except httpx.RequestError as e:
        print(f"[AGENT] Connection error: {e}")
        return None


async def heartbeat_loop(client, node_id):
    """Post metrics every HEARTBEAT_INTERVAL seconds; returns when the node id is rejected (404)."""
    consecutive_failures = 0
    while True:
        try:
            metrics = get_resource_metrics()
            resp = await client.post(f"{API}/nodes/heartbeat/{node_id}", json=metrics)
            if resp.status_code == 200:
                consecutive_failures = 0
                warnings = resp.json().get("warnings", [])
                if warnings:
                    print(f"[AGENT] Warnings: {warnings}")
            elif resp.status_code == 404:
                print("[AGENT] Node not found — re-enrolling...")
                # Forget the stale id so the caller performs a fresh enrollment.
                save_state({})
                return
            else:
                consecutive_failures += 1
                print(f"[AGENT] Heartbeat rejected: HTTP {resp.status_code}")
        except httpx.RequestError as e:
            consecutive_failures += 1
            print(f"[AGENT] Heartbeat error: {e}")
        if consecutive_failures >= 10:
            print("[AGENT] Too many failures, waiting 60s...")
            await asyncio.sleep(60)
            consecutive_failures = 0
        else:
            await asyncio.sleep(HEARTBEAT_INTERVAL)


def vllm_health_urls():
    """Return the health-check URLs to probe, honouring VLLM_HOST when it is set."""
    hosts = [VLLM_HOST] if VLLM_HOST else ["localhost", "vllm"]
    return [f"http://{host}:{VLLM_PORT}/health" for host in hosts]


async def wait_for_vllm():
    """Poll the vLLM health endpoint every 5s, up to 120 times. Returns True once it answers 200."""
    urls = vllm_health_urls()
    print(f"[AGENT] Waiting for vLLM on port {VLLM_PORT}...")
    async with httpx.AsyncClient(timeout=5) as client:
        for _ in range(120):
            for url in urls:
                try:
                    resp = await client.get(url)
                    if resp.status_code == 200:
                        print("[AGENT] vLLM ready!")
                        return True
                except httpx.RequestError:
                    # Not reachable (yet) under this host name; try the next one.
                    pass
            await asyncio.sleep(5)
    print("[AGENT] WARNING: vLLM not ready after 10 min")
    return False


async def main():
    """Wait for vLLM, enroll (retrying every 30s), then heartbeat; re-enroll whenever the loop returns."""
    print(f"[AGENT] MAC Worker Agent — {NODE_NAME}")
    print(f"[AGENT] Control: {CONTROL_URL}")
    await wait_for_vllm()
    async with httpx.AsyncClient(timeout=30) as client:
        node_id = None
        while not node_id:
            node_id = await enroll(client)
            if not node_id:
                print("[AGENT] Retrying in 30s...")
                await asyncio.sleep(30)
        print(f"[AGENT] Starting heartbeat loop (every {HEARTBEAT_INTERVAL}s)")
        while True:
            await heartbeat_loop(client, node_id)
            node_id = None
            while not node_id:
                node_id = await enroll(client)
                if not node_id:
                    await asyncio.sleep(30)


if __name__ == "__main__":
    asyncio.run(main())
'@
# UTF-8 without BOM and LF endings: the script contains non-ASCII characters
# (em-dashes in messages) and Set-Content in Windows PowerShell 5.1 would write
# them in the ANSI code page, which Python rejects as invalid UTF-8 source.
$agentEncoding = New-Object System.Text.UTF8Encoding($false)
$agentText = ($agentScript -replace "`r`n", "`n") + "`n"
[System.IO.File]::WriteAllText("$workerDir\worker-agent.py", $agentText, $agentEncoding)
Write-Ok "Created worker-agent.py"
# ── Step 6: Launch! ─────────────────────────────────────
# Optionally starts the stack. Anything other than "n"/"no" (case-insensitive,
# surrounding blanks ignored) counts as yes, matching the "(Y/n)" prompt.
Write-Step "Starting MAC worker..."
Write-Host "`n This will pull the vLLM Docker image (~8GB) and the model." -ForegroundColor Yellow
Write-Host " First run may take 15-30 minutes depending on internet speed.`n" -ForegroundColor Yellow
$startNow = Read-Host "Start the worker now? (Y/n)"
if (([string]$startNow).Trim() -notmatch '^(n|no)$') {
    Push-Location $workerDir
    try {
        docker compose up -d
        # Capture immediately; later commands would overwrite $LASTEXITCODE.
        $composeExit = $LASTEXITCODE
    } finally {
        # Always restore the caller's working directory, even if docker throws.
        Pop-Location
    }
    if ($composeExit -ne 0) {
        # Do not claim success when compose could not start the services.
        Write-Host "ERROR: 'docker compose up -d' failed (exit code $composeExit)." -ForegroundColor Red
        Write-Host " Fix the problem, then run it again from: $workerDir" -ForegroundColor Yellow
        exit 1
    }
    Write-Host "`n" -NoNewline
    Write-Ok "Worker is starting! Monitor with:"
    Write-Host " cd $workerDir" -ForegroundColor White
    Write-Host " docker compose logs -f # Watch all logs" -ForegroundColor White
    Write-Host " docker compose logs -f vllm # Watch model loading" -ForegroundColor White
    Write-Host " docker compose logs -f worker-agent # Watch enrollment" -ForegroundColor White
} else {
    Write-Ok "Setup complete. When ready, run:"
    Write-Host " cd $workerDir" -ForegroundColor White
    Write-Host " docker compose up -d" -ForegroundColor White
}
# Closing banner. The box-drawing characters were mis-decoded and are restored.
# The three lines carrying variable-length values deliberately have no right
# border, since it could not stay aligned.
Write-Host @"
╔══════════════════════════════════════════════════╗
║  Setup complete! Worker: $NodeName
║  Model: $Model
║  Control: http://${ControlNodeIP}:8000
║                                                  ║
║  The worker will auto-enroll with the control    ║
║  node and start sending heartbeats.              ║
╚══════════════════════════════════════════════════╝
"@ -ForegroundColor Green