# ═══════════════════════════════════════════════════════════
#  MAC Worker Node Setup Script (Windows)
#  Run this on each worker PC (PC2, PC3, etc.)
#
#  Prerequisites: Windows 10/11, NVIDIA GPU, Admin PowerShell
#  This script installs WSL2 and Docker Desktop when missing, verifies
#  that Docker containers can see the GPU, opens the vLLM port in the
#  Windows firewall, then deploys the MAC worker (vLLM + heartbeat agent).
#
#  Usage (run as Administrator):
#    Set-ExecutionPolicy Bypass -Scope Process -Force
#    .\setup-worker.ps1
# ═══════════════════════════════════════════════════════════

param(
    # Address of the MAC control node (used for the admin URL hint and the agent's API URL).
    [string]$ControlNodeIP = "10.10.13.30",
    # Enrollment token; prompted for interactively when empty.
    [string]$EnrollmentToken = "",
    # Worker display name; prompted for interactively when empty.
    [string]$NodeName = "",
    # HuggingFace model id served by vLLM; chosen from a menu when empty.
    [string]$Model = "",
    # TCP port vLLM listens on (also opened in the firewall).
    [int]$VllmPort = 8001,
    # Passed to vLLM as --gpu-memory-utilization.
    [float]$GpuMemUtil = 0.85,
    # Passed to vLLM as --max-model-len.
    [int]$MaxModelLen = 8192
)

$ErrorActionPreference = "Stop"

# ── Console helpers ─────────────────────────────────────
function Write-Step { param([string]$msg) Write-Host "`n[MAC] $msg" -ForegroundColor Cyan }
function Write-Ok   { param([string]$msg) Write-Host "[OK] $msg" -ForegroundColor Green }
function Write-Warn { param([string]$msg) Write-Host "[!] $msg" -ForegroundColor Yellow }

# Runs a native command with its stderr discarded and returns its stdout.
# Windows PowerShell 5.1 turns redirected native stderr into a terminating
# error while $ErrorActionPreference is "Stop", so a tool that merely logs to
# stderr (e.g. docker's "Unable to find image ... locally") would look like a
# failure. The preference is relaxed only inside this function's scope; callers
# judge success through $LASTEXITCODE.
function Invoke-NativeQuiet {
    param([Parameter(Mandatory = $true)][scriptblock]$Command)
    $ErrorActionPreference = "Continue"
    & $Command 2>$null
}

Write-Host @"

╔══════════════════════════════════════════════════╗
║   MAC Worker Node Setup — MBM AI Cloud           ║
║   Setting up GPU inference worker                ║
╚══════════════════════════════════════════════════╝

"@ -ForegroundColor Magenta

# ── Verify admin privileges ──────────────────────────────
$isAdmin = ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)
if (-not $isAdmin) {
    Write-Host "ERROR: This script must be run as Administrator" -ForegroundColor Red
    Write-Host "Right-click PowerShell -> Run as Administrator" -ForegroundColor Yellow
    exit 1
}

# ── Check NVIDIA driver ─────────────────────────────────
Write-Step "Checking NVIDIA GPU..."
try {
    $gpu = Invoke-NativeQuiet { nvidia-smi "--query-gpu=name,memory.total,driver_version" "--format=csv,noheader" }
    # A non-zero exit means nvidia-smi could not talk to the driver, even if it printed text.
    if ($LASTEXITCODE -ne 0 -or -not $gpu) { throw "No GPU" }
    Write-Ok "GPU found: $(@($gpu) -join '; ')"
} catch {
    # Also reached when nvidia-smi is not on PATH (command-not-found is thrown).
    Write-Host "ERROR: NVIDIA GPU or driver not found. Install latest NVIDIA drivers first:" -ForegroundColor Red
    Write-Host "  https://www.nvidia.com/Download/index.aspx" -ForegroundColor Yellow
    exit 1
}

# ── Step 1: Install/Enable WSL2 ─────────────────────────
Write-Step "Checking WSL2..."
$wslInstalled = $false
try {
    Invoke-NativeQuiet { wsl --version } | Out-Null
    if ($LASTEXITCODE -eq 0) { $wslInstalled = $true }
} catch {
    # wsl.exe missing or unusable: treated as "not installed".
}

if (-not $wslInstalled) {
    Write-Step "Installing WSL2 (this may require a reboot)..."
    wsl --install --no-distribution
    if ($LASTEXITCODE -ne 0) {
        Write-Warn "wsl --install exited with code $LASTEXITCODE. If a reboot was requested, REBOOT and re-run this script."
    } else {
        Write-Warn "WSL2 installed. If prompted, REBOOT and re-run this script."
    }
    Write-Host "After reboot, run: .\setup-worker.ps1" -ForegroundColor Yellow
    Read-Host "Press Enter to continue (or Ctrl+C to reboot first)"
} else {
    Write-Ok "WSL2 is already installed"
}

# ── Step 2: Install Docker Desktop ──────────────────────
Write-Step "Checking Docker..."
$dockerInstalled = $false
$dockerVer = $null
try {
    # Asking for the *server* version proves the daemon is running, not just that the CLI exists.
    $dockerVer = Invoke-NativeQuiet { docker version --format '{{.Server.Version}}' }
    if ($LASTEXITCODE -eq 0 -and $dockerVer) { $dockerInstalled = $true }
} catch {
    # docker CLI not on PATH: treated as "not installed".
}

if (-not $dockerInstalled) {
    Write-Step "Downloading Docker Desktop..."
    $dockerUrl = "https://desktop.docker.com/win/main/amd64/Docker%20Desktop%20Installer.exe"
    $installerPath = "$env:TEMP\DockerDesktopInstaller.exe"

    if (-not (Test-Path $installerPath)) {
        # Download to a side file and rename on success, so an interrupted
        # download is never mistaken for a complete installer on the next run.
        $partialPath = "$installerPath.part"
        $savedProgress = $ProgressPreference
        # The progress bar slows Invoke-WebRequest considerably on Windows PowerShell 5.1.
        $ProgressPreference = "SilentlyContinue"
        try {
            Invoke-WebRequest -Uri $dockerUrl -OutFile $partialPath -UseBasicParsing
        } finally {
            $ProgressPreference = $savedProgress
        }
        Move-Item -Path $partialPath -Destination $installerPath -Force
    }

    Write-Step "Installing Docker Desktop (this takes a few minutes)..."
    $installer = Start-Process -FilePath $installerPath -ArgumentList "install","--quiet","--accept-license" -Wait -NoNewWindow -PassThru
    if ($installer.ExitCode -ne 0) {
        # Not fatal here: the re-check below is the real gate.
        Write-Warn "Docker Desktop installer exited with code $($installer.ExitCode)."
    }

    Write-Warn "Docker Desktop installed. You need to:"
    Write-Host "  1. Start Docker Desktop from Start Menu" -ForegroundColor Yellow
    Write-Host "  2. Wait for it to finish starting (whale icon in taskbar)" -ForegroundColor Yellow
    Write-Host "  3. Re-run this script" -ForegroundColor Yellow
    Read-Host "Press Enter when Docker Desktop is running"

    # Re-check
    try {
        $dockerVer = Invoke-NativeQuiet { docker version --format '{{.Server.Version}}' }
        if ($LASTEXITCODE -ne 0) { throw "Docker not ready" }
        $dockerInstalled = $true
    } catch {
        Write-Host "Docker is not running yet. Start Docker Desktop and re-run." -ForegroundColor Red
        exit 1
    }
} else {
    Write-Ok "Docker is installed: $dockerVer"
}

# ── Step 3: Verify Docker GPU support ───────────────────
Write-Step "Checking Docker GPU support..."
try {
    # Only the exit code matters; the container's nvidia-smi output is discarded.
    Invoke-NativeQuiet { docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi } | Out-Null
    if ($LASTEXITCODE -eq 0) {
        Write-Ok "Docker GPU (NVIDIA Container Toolkit) is working"
    } else {
        throw "GPU test failed"
    }
} catch {
    Write-Warn "Docker GPU test failed. Ensure Docker Desktop has WSL2 backend enabled."
    Write-Host "  Docker Desktop -> Settings -> General -> Use WSL2 based engine = ON" -ForegroundColor Yellow
    Write-Host "  Docker Desktop -> Settings -> Resources -> WSL Integration -> Enable" -ForegroundColor Yellow
    Read-Host "Fix the settings and press Enter to continue"
}

# ── Step 3b: Open firewall for vLLM port ────────────────
Write-Step "Configuring firewall..."
$fwRule = "MAC vLLM Worker ($VllmPort)"
$existing = Get-NetFirewallRule -DisplayName $fwRule -ErrorAction SilentlyContinue
if (-not $existing) {
    # Inbound TCP on the vLLM port, limited to Private and Domain network profiles.
    New-NetFirewallRule -DisplayName $fwRule -Direction Inbound -Protocol TCP -LocalPort $VllmPort -Action Allow -Profile Private,Domain | Out-Null
    Write-Ok "Firewall rule created for port $VllmPort"
} else {
    Write-Ok "Firewall rule already exists for port $VllmPort"
}

# ── Step 4: Get deployment parameters ───────────────────
Write-Step "Configuring worker node..."
# Node name: prompt when not supplied; default is worker-<COMPUTERNAME>.
if (-not $NodeName) {
    $hostname = $env:COMPUTERNAME
    $NodeName = Read-Host "Enter node name (default: worker-$hostname)"
    if (-not $NodeName) { $NodeName = "worker-$hostname" }
}

# Enrollment token: mandatory, prompt when not supplied.
if (-not $EnrollmentToken) {
    Write-Host "`nYou need an enrollment token from the MAC admin panel." -ForegroundColor Yellow
    Write-Host "Ask the admin to generate one at: http://$ControlNodeIP/admin -> Cluster -> Generate Token`n" -ForegroundColor Yellow
    $EnrollmentToken = Read-Host "Paste enrollment token"
    if (-not $EnrollmentToken) {
        Write-Host "ERROR: Enrollment token is required" -ForegroundColor Red
        exit 1
    }
}

# Model: menu when not supplied; anything unrecognised selects option 1.
if (-not $Model) {
    $defaultModel = "Qwen/Qwen2.5-Coder-7B-Instruct-AWQ"
    Write-Host "`nChoose a model for this worker:" -ForegroundColor Yellow
    Write-Host "  [1] Qwen2.5-Coder-7B — Code generation & debugging (recommended for PC2)" -ForegroundColor White
    Write-Host "  [2] DeepSeek-R1-7B — Math & reasoning (recommended for PC3)" -ForegroundColor White
    Write-Host "  [3] Qwen2.5-7B-Instruct — General chat (same as PC1)" -ForegroundColor White
    Write-Host "  [4] Custom model — Enter HuggingFace model name" -ForegroundColor White
    $choice = Read-Host "Select (1-4)"
    switch ($choice) {
        "1" { $Model = $defaultModel }
        "2" { $Model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" }
        "3" { $Model = "Qwen/Qwen2.5-7B-Instruct-AWQ" }
        "4" { $Model = (Read-Host "Enter full HuggingFace model name").Trim() }
        default { $Model = $defaultModel }
    }
    if (-not $Model) {
        # An empty custom name would generate a "--model" flag with no value.
        Write-Warn "No model name entered; using $defaultModel"
        $Model = $defaultModel
    }
}

# Get local IP: prefer a DHCP Wi-Fi address, then any DHCP address, then any
# other usable IPv4 address so statically configured hosts are handled too.
$ipv4Addresses = @(Get-NetIPAddress -AddressFamily IPv4)
$localIP = ($ipv4Addresses | Where-Object { $_.InterfaceAlias -like "*Wi-Fi*" -and $_.PrefixOrigin -eq "Dhcp" } | Select-Object -First 1).IPAddress
if (-not $localIP) {
    $localIP = ($ipv4Addresses | Where-Object { $_.PrefixOrigin -eq "Dhcp" } | Select-Object -First 1).IPAddress
}
if (-not $localIP) {
    # Skip loopback, link-local (APIPA) and Hyper-V/WSL virtual switch adapters.
    $localIP = ($ipv4Addresses | Where-Object {
        $_.IPAddress -ne "127.0.0.1" -and
        $_.IPAddress -notlike "169.254.*" -and
        $_.InterfaceAlias -notlike "vEthernet*"
    } | Select-Object -First 1).IPAddress
}
if ($localIP) {
    Write-Ok "This PC's IP: $localIP"
} else {
    Write-Warn "Could not determine this PC's IP address; the agent will try to detect one itself."
}

# Get CPU cores and RAM. Win32_Processor returns one object per socket, so the
# logical-processor counts are summed into a single integer.
$cpuCores = [int](Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum
$ramMB = [math]::Round((Get-CimInstance Win32_ComputerSystem).TotalPhysicalMemory / 1MB)

# nvidia-smi prints one "name, memory" line per GPU; the first GPU is reported.
$gpuLines = @(& nvidia-smi "--query-gpu=name,memory.total" "--format=csv,noheader")
$gpuFields = @("$($gpuLines[0])".Trim() -split ",")
if ($gpuFields.Count -lt 2) {
    Write-Host "ERROR: Unexpected nvidia-smi output: $($gpuLines[0])" -ForegroundColor Red
    exit 1
}
$gpuName = $gpuFields[0].Trim()
# memory.total looks like "12288 MiB"; keep the digits only.
$gpuVram = [int]($gpuFields[1].Trim() -replace '[^0-9]','')

Write-Host "`n  Configuration Summary:" -ForegroundColor Cyan
Write-Host "    Node Name:    $NodeName"
Write-Host "    Control Node: $ControlNodeIP"
Write-Host "    This PC IP:   $localIP"
Write-Host "    GPU:          $gpuName ($gpuVram MB)"
Write-Host "    RAM:          $ramMB MB"
Write-Host "    CPU Cores:    $cpuCores"
Write-Host "    Model:        $Model"
Write-Host "    vLLM Port:    $VllmPort"

# ── Step 5: Create worker directory ─────────────────────
Write-Step "Setting up worker directory..."
$workerDir = "$env:USERPROFILE\mac-worker"
if (-not (Test-Path $workerDir)) {
    New-Item -ItemType Directory -Path $workerDir | Out-Null
}

# Writes text as UTF-8 without BOM and with LF line endings.
# Set-Content on Windows PowerShell 5.1 defaults to the ANSI code page, which
# turns the non-ASCII characters in the generated files into bytes that are
# invalid UTF-8 for the Python interpreter and the compose YAML parser.
function Write-Utf8File {
    param(
        [Parameter(Mandatory = $true)][string]$Path,
        [Parameter(Mandatory = $true)][string]$Content
    )
    $text = $Content -replace "`r`n", "`n"
    if (-not $text.EndsWith("`n")) { $text += "`n" }
    $utf8NoBom = New-Object System.Text.UTF8Encoding($false)
    [System.IO.File]::WriteAllText($Path, $text, $utf8NoBom)
}

# Write .env file (read by docker compose for interpolation and by the agent container).
#   VLLM_HOST        - compose service name; the agent reaches vLLM over the shared network.
#   NODE_IP          - host LAN address reported to the control node at enrollment.
#   PYTHONUNBUFFERED - makes the agent's prints show up in `docker compose logs` immediately.
$envContent = @"
# MAC Worker Node Configuration
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")
CONTROL_NODE_URL=http://${ControlNodeIP}:8000
ENROLLMENT_TOKEN=${EnrollmentToken}
NODE_NAME=${NodeName}
NODE_IP=${localIP}
VLLM_MODEL=${Model}
VLLM_HOST=vllm
VLLM_PORT=${VllmPort}
GPU_MEM_UTIL=${GpuMemUtil}
MAX_MODEL_LEN=${MaxModelLen}
GPU_NAME=${gpuName}
GPU_VRAM_MB=${gpuVram}
RAM_TOTAL_MB=${ramMB}
CPU_CORES=${cpuCores}
HEARTBEAT_INTERVAL=30
PYTHONUNBUFFERED=1
"@
Write-Utf8File -Path "$workerDir\.env" -Content $envContent
Write-Ok "Created .env at $workerDir\.env"

# Write docker-compose.yml.
# `${...} (backtick-escaped) is left for docker compose to resolve from .env;
# unescaped ${...} is expanded by PowerShell right now.
$composeContent = @"
# MAC GPU Worker Node — $NodeName
# Model: $Model
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")

services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: mac-vllm-worker
    ports:
      - "${VllmPort}:${VllmPort}"
    environment:
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf-cache:/root/.cache/huggingface
    command: >
      --model `${VLLM_MODEL}
      --port `${VLLM_PORT}
      --gpu-memory-utilization `${GPU_MEM_UTIL}
      --max-model-len `${MAX_MODEL_LEN}
      --trust-remote-code
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    networks:
      - worker-net
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:`${VLLM_PORT}/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s

  worker-agent:
    image: python:3.11-slim
    container_name: mac-worker-agent
    env_file: .env
    volumes:
      - ./worker-agent.py:/app/agent.py:ro
      - agent-state:/tmp
    command: >
      bash -c "pip install httpx psutil --quiet && python /app/agent.py"
    depends_on:
      vllm:
        condition: service_healthy
    restart: unless-stopped
    networks:
      - worker-net

volumes:
  hf-cache:
  agent-state:

networks:
  worker-net:
    driver: bridge
"@
Write-Utf8File -Path "$workerDir\docker-compose.yml" -Content $composeContent
Write-Ok "Created docker-compose.yml"

# Write worker-agent.py (single-quoted here-string: nothing is expanded by PowerShell).
$agentScript = @'
#!/usr/bin/env python3
"""MAC Worker Agent — Enrolls with control node and sends periodic heartbeats."""
import asyncio, json, os, socket, sys, time
import httpx

CONTROL_URL = os.environ.get("CONTROL_NODE_URL", "http://10.10.13.30:8000")
ENROLLMENT_TOKEN = os.environ.get("ENROLLMENT_TOKEN", "")
NODE_NAME = os.environ.get("NODE_NAME", f"worker-{socket.gethostname()}")
# Host LAN address supplied by the setup script; empty means "auto-detect".
NODE_IP = os.environ.get("NODE_IP", "").strip()
# vLLM runs in a sibling container, so it is reached by service name, not localhost.
VLLM_HOST = os.environ.get("VLLM_HOST", "localhost")
VLLM_PORT = int(os.environ.get("VLLM_PORT", 8001))
GPU_NAME = os.environ.get("GPU_NAME", "NVIDIA GPU")
GPU_VRAM_MB = int(os.environ.get("GPU_VRAM_MB", 12288))
RAM_TOTAL_MB = int(os.environ.get("RAM_TOTAL_MB", 16384))
CPU_CORES = int(os.environ.get("CPU_CORES", 8))
HEARTBEAT_INTERVAL = int(os.environ.get("HEARTBEAT_INTERVAL", 30))

API = f"{CONTROL_URL}/api/v1"
STATE_FILE = "/tmp/mac_worker_state.json"


def get_local_ip():
    """Return the address the control node should use to reach this worker."""
    if NODE_IP:
        return NODE_IP
    # Fallback: address of the interface used for outbound traffic. Inside a
    # container this is the container's own address, not the host's.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))
        return s.getsockname()[0]
    except Exception:
        return "127.0.0.1"
    finally:
        s.close()


def load_state():
    try:
        with open(STATE_FILE, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def save_state(data):
    with open(STATE_FILE, "w") as f:
        json.dump(data, f)


def get_resource_metrics():
    metrics = {"cpu_util_pct": 0.0, "ram_used_mb": 0, "gpu_util_pct": 0.0, "gpu_vram_used_mb": 0}
    try:
        import psutil
        metrics["cpu_util_pct"] = psutil.cpu_percent(interval=1)
        metrics["ram_used_mb"] = int(psutil.virtual_memory().used / 1024 / 1024)
    except ImportError:
        pass
    try:
        import subprocess
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            parts = result.stdout.strip().split(",")
            if len(parts) >= 2:
                metrics["gpu_util_pct"] = float(parts[0].strip())
                metrics["gpu_vram_used_mb"] = int(float(parts[1].strip()))
    except (FileNotFoundError, Exception):
        # Best effort: GPU metrics stay at zero when nvidia-smi is unavailable.
        pass
    return metrics


async def enroll(client):
    state = load_state()
    if state.get("node_id"):
        print(f"[AGENT] Already enrolled as node {state['node_id']}")
        return state["node_id"]
    if not ENROLLMENT_TOKEN:
        print("[AGENT] ERROR: No ENROLLMENT_TOKEN set")
        return None
    ip = get_local_ip()
    payload = {
        "enrollment_token": ENROLLMENT_TOKEN,
        "name": NODE_NAME,
        "hostname": socket.gethostname(),
        "ip_address": ip,
        "port": VLLM_PORT,
        "gpu_name": GPU_NAME,
        "gpu_vram_mb": GPU_VRAM_MB,
        "ram_total_mb": RAM_TOTAL_MB,
        "cpu_cores": CPU_CORES,
    }
    try:
        resp = await client.post(f"{API}/nodes/enroll", json=payload)
        if resp.status_code == 200:
            data = resp.json()
            node_id = data.get("id")
            save_state({"node_id": node_id, "name": NODE_NAME})
            print(f"[AGENT] Enrolled! Node ID: {node_id}")
            return node_id
        else:
            print(f"[AGENT] Enrollment failed: {resp.status_code} {resp.text}")
            return None
    except httpx.RequestError as e:
        print(f"[AGENT] Connection error: {e}")
        return None


async def heartbeat_loop(client, node_id):
    consecutive_failures = 0
    while True:
        try:
            metrics = get_resource_metrics()
            resp = await client.post(f"{API}/nodes/heartbeat/{node_id}", json=metrics)
            if resp.status_code == 200:
                consecutive_failures = 0
                warnings = resp.json().get("warnings", [])
                if warnings:
                    print(f"[AGENT] Warnings: {warnings}")
            elif resp.status_code == 404:
                # The control node no longer knows this node: drop state and re-enroll.
                print("[AGENT] Node not found — re-enrolling...")
                save_state({})
                return
            else:
                consecutive_failures += 1
        except httpx.RequestError as e:
            consecutive_failures += 1
            print(f"[AGENT] Heartbeat error: {e}")
        if consecutive_failures >= 10:
            print("[AGENT] Too many failures, waiting 60s...")
            await asyncio.sleep(60)
            consecutive_failures = 0
        else:
            await asyncio.sleep(HEARTBEAT_INTERVAL)


async def wait_for_vllm():
    print(f"[AGENT] Waiting for vLLM at {VLLM_HOST}:{VLLM_PORT}...")
    async with httpx.AsyncClient(timeout=5) as client:
        for _ in range(120):
            try:
                resp = await client.get(f"http://{VLLM_HOST}:{VLLM_PORT}/health")
                if resp.status_code == 200:
                    print("[AGENT] vLLM ready!")
                    return True
            except httpx.RequestError:
                pass
            await asyncio.sleep(5)
    print("[AGENT] WARNING: vLLM not ready after 10 min")
    return False


async def main():
    print(f"[AGENT] MAC Worker Agent — {NODE_NAME}")
    print(f"[AGENT] Control: {CONTROL_URL}")
    await wait_for_vllm()
    async with httpx.AsyncClient(timeout=30) as client:
        node_id = None
        while not node_id:
            node_id = await enroll(client)
            if not node_id:
                print("[AGENT] Retrying in 30s...")
                await asyncio.sleep(30)
        print(f"[AGENT] Starting heartbeat loop (every {HEARTBEAT_INTERVAL}s)")
        while True:
            # heartbeat_loop only returns when the node must re-enroll.
            await heartbeat_loop(client, node_id)
            node_id = None
            while not node_id:
                node_id = await enroll(client)
                if not node_id:
                    await asyncio.sleep(30)


if __name__ == "__main__":
    asyncio.run(main())
'@
Write-Utf8File -Path "$workerDir\worker-agent.py" -Content $agentScript
Write-Ok "Created worker-agent.py"

# ── Step 6: Launch! ─────────────────────────────────────
Write-Step "Starting MAC worker..."
Write-Host "`n  This will pull the vLLM Docker image (~8GB) and the model." -ForegroundColor Yellow
Write-Host "  First run may take 15-30 minutes depending on internet speed.`n" -ForegroundColor Yellow

$startNow = Read-Host "Start the worker now? (Y/n)"
# Anything except "n"/"no" (case-insensitive) starts the worker.
if ($startNow -notmatch '^\s*n(o)?\s*$') {
    Push-Location $workerDir
    try {
        docker compose up -d
        $composeExit = $LASTEXITCODE
    } finally {
        # Restore the caller's directory even if docker could not be launched.
        Pop-Location
    }
    if ($composeExit -ne 0) {
        Write-Host "ERROR: 'docker compose up -d' failed (exit code $composeExit)." -ForegroundColor Red
        Write-Host "  Fix the problem, then run it again from: $workerDir" -ForegroundColor Yellow
        exit 1
    }
    Write-Host "`n" -NoNewline
    Write-Ok "Worker is starting! Monitor with:"
    Write-Host "  cd $workerDir" -ForegroundColor White
    Write-Host "  docker compose logs -f          # Watch all logs" -ForegroundColor White
    Write-Host "  docker compose logs -f vllm     # Watch model loading" -ForegroundColor White
    Write-Host "  docker compose logs -f worker-agent  # Watch enrollment" -ForegroundColor White
} else {
    Write-Ok "Setup complete. When ready, run:"
    Write-Host "  cd $workerDir" -ForegroundColor White
    Write-Host "  docker compose up -d" -ForegroundColor White
}

Write-Host @"

╔══════════════════════════════════════════════════╗
║   Setup complete! Worker: $NodeName
║   Model:   $Model
║   Control: http://${ControlNodeIP}:8000
║                                                  ║
║   The worker will auto-enroll with the control   ║
║   node and start sending heartbeats.             ║
╚══════════════════════════════════════════════════╝

"@ -ForegroundColor Green