Spaces:

Aaryan17
/

RamMAC

Running

App Files Files Community

RamMAC / setup-worker.ps1

Aaryan17

feat: upload full MAC source (mac/, frontend/, alembic/, tests/)

9c0b225 verified about 1 month ago

raw

history blame contribute delete

20.1 kB

	# ═══════════════════════════════════════════════════════════
	# MAC Worker Node Setup Script (Windows)
	# Run this on each worker PC (PC2, PC3, etc.)
	#
	# Prerequisites: Windows 10/11, NVIDIA GPU, Admin PowerShell
	# This script installs WSL2 and Docker Desktop and verifies Docker GPU access,
	# then deploys the MAC worker with vLLM + heartbeat agent.
	#
	# Usage (run as Administrator):
	# Set-ExecutionPolicy Bypass -Scope Process -Force
	# .\setup-worker.ps1
	# ═══════════════════════════════════════════════════════════

	# Script parameters. Empty NodeName / EnrollmentToken / Model values are
	# prompted for interactively later in the script.
	param(
	# Address of the control node; used to build CONTROL_NODE_URL (port 8000)
	# and the admin-panel hint shown when a token is requested.
	[string]$ControlNodeIP = "10.10.13.30",
	# Enrollment token written to .env as ENROLLMENT_TOKEN.
	[string]$EnrollmentToken = "",
	# Node name; when left empty the prompt defaults to "worker-<COMPUTERNAME>".
	[string]$NodeName = "",
	# HuggingFace model name passed to vLLM via VLLM_MODEL.
	[string]$Model = "",
	# TCP port for vLLM: opened in the firewall and published by the compose file.
	[int]$VllmPort = 8001,
	# Value handed to vLLM's --gpu-memory-utilization option.
	[float]$GpuMemUtil = 0.85,
	# Value handed to vLLM's --max-model-len option.
	[int]$MaxModelLen = 8192
	)

	# Treat every cmdlet error as terminating for the rest of the script.
	$ErrorActionPreference = "Stop"

	# Console helpers: each prints one prefixed, coloured line.
	function Write-Step {
	    # Section heading, preceded by a blank line.
	    param([string]$msg)
	    Write-Host "`n[MAC] $msg" -ForegroundColor Cyan
	}

	function Write-Ok {
	    # Success message.
	    param([string]$msg)
	    Write-Host "[OK] $msg" -ForegroundColor Green
	}

	function Write-Warn {
	    # Warning message.
	    param([string]$msg)
	    Write-Host "[!] $msg" -ForegroundColor Yellow
	}

	# Startup banner. The closing "@ must be the first characters on its line,
	# otherwise PowerShell does not recognise it as the here-string terminator.
	Write-Host @"

	╔══════════════════════════════════════════════════╗
	║ MAC Worker Node Setup — MBM AI Cloud ║
	║ Setting up GPU inference worker ║
	╚══════════════════════════════════════════════════╝

"@ -ForegroundColor Magenta

	# ── Verify admin privileges ──────────────────────────────
	# Stop early when the current session is not elevated.
	$currentIdentity = [Security.Principal.WindowsIdentity]::GetCurrent()
	$currentPrincipal = [Security.Principal.WindowsPrincipal]$currentIdentity
	$isAdmin = $currentPrincipal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)
	if (-not $isAdmin) {
	    Write-Host "ERROR: This script must be run as Administrator" -ForegroundColor Red
	    Write-Host "Right-click PowerShell -> Run as Administrator" -ForegroundColor Yellow
	    exit 1
	}

	# ── Check NVIDIA driver ─────────────────────────────────
	Write-Step "Checking NVIDIA GPU..."
	try {
	    # Empty output and a missing nvidia-smi binary both end up in the catch block.
	    $gpu = & nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader 2>$null
	    if (-not $gpu) {
	        throw "No GPU"
	    }
	    Write-Ok "GPU found: $gpu"
	}
	catch {
	    Write-Host "ERROR: NVIDIA GPU or driver not found. Install latest NVIDIA drivers first:" -ForegroundColor Red
	    Write-Host " https://www.nvidia.com/Download/index.aspx" -ForegroundColor Yellow
	    exit 1
	}

	# ── Step 1: Install/Enable WSL2 ─────────────────────────
	Write-Step "Checking WSL2..."
	$wslInstalled = $false
	try {
	    # A zero exit code from `wsl --version` is taken as "WSL is present".
	    $wslVersion = wsl --version 2>$null
	    $wslInstalled = ($LASTEXITCODE -eq 0)
	}
	catch {}

	if ($wslInstalled) {
	    Write-Ok "WSL2 is already installed"
	}
	else {
	    Write-Step "Installing WSL2 (this may require a reboot)..."
	    wsl --install --no-distribution
	    Write-Warn "WSL2 installed. If prompted, REBOOT and re-run this script."
	    Write-Host "After reboot, run: .\setup-worker.ps1" -ForegroundColor Yellow
	    Read-Host "Press Enter to continue (or Ctrl+C to reboot first)"
	}

	# ── Step 2: Install Docker Desktop ──────────────────────
	# Docker counts as installed only when the daemon answers with a server
	# version; otherwise the installer is downloaded and run, and the user is
	# asked to start Docker Desktop before the daemon is probed a second time.
	Write-Step "Checking Docker..."
	$dockerInstalled = $false
	try {
	    $dockerVer = docker version --format '{{.Server.Version}}' 2>$null
	    if ($LASTEXITCODE -eq 0 -and $dockerVer) { $dockerInstalled = $true }
	} catch {}

	if (-not $dockerInstalled) {
	    Write-Step "Downloading Docker Desktop..."
	    $dockerUrl = "https://desktop.docker.com/win/main/amd64/Docker%20Desktop%20Installer.exe"
	    $installerPath = "$env:TEMP\DockerDesktopInstaller.exe"

	    if (-not (Test-Path $installerPath)) {
	        # Download under a temporary name and rename on success, so an
	        # interrupted transfer never leaves a truncated installer that a
	        # later run would pick up and execute.
	        $partialPath = "$installerPath.partial"
	        Invoke-WebRequest -Uri $dockerUrl -OutFile $partialPath -UseBasicParsing
	        Move-Item -Path $partialPath -Destination $installerPath -Force
	    }

	    Write-Step "Installing Docker Desktop (this takes a few minutes)..."
	    $installerProc = Start-Process -FilePath $installerPath -ArgumentList "install","--quiet","--accept-license" -Wait -NoNewWindow -PassThru
	    if ($installerProc.ExitCode -ne 0) {
	        # Only warn: the re-check below decides whether Docker is usable.
	        Write-Warn "Docker Desktop installer exited with code $($installerProc.ExitCode); the installation may be incomplete."
	    }

	    Write-Warn "Docker Desktop installed. You need to:"
	    Write-Host " 1. Start Docker Desktop from Start Menu" -ForegroundColor Yellow
	    Write-Host " 2. Wait for it to finish starting (whale icon in taskbar)" -ForegroundColor Yellow
	    Write-Host " 3. Re-run this script" -ForegroundColor Yellow
	    Read-Host "Press Enter when Docker Desktop is running"

	    # Re-check
	    try {
	        $dockerVer = docker version --format '{{.Server.Version}}' 2>$null
	        if ($LASTEXITCODE -ne 0) { throw "Docker not ready" }
	        $dockerInstalled = $true
	    } catch {
	        Write-Host "Docker is not running yet. Start Docker Desktop and re-run." -ForegroundColor Red
	        exit 1
	    }
	} else {
	    Write-Ok "Docker is installed: $dockerVer"
	}

	# ── Step 3: Verify Docker GPU support ───────────────────
	# Runs nvidia-smi inside a CUDA container. A failure is not fatal: the user
	# is pointed at the relevant Docker Desktop settings and the script goes on.
	Write-Step "Checking Docker GPU support..."
	$gpuTestOk = $false
	$savedErrorPreference = $ErrorActionPreference
	try {
	    # On the first run docker prints image-pull progress to stderr. With
	    # "Stop" in effect, Windows PowerShell turns redirected native stderr
	    # into a terminating error, which would report a working setup as
	    # broken. Relax the preference for this one call and trust the exit code.
	    $ErrorActionPreference = "Continue"
	    $gpuTest = docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi 2>$null
	    $gpuTestOk = ($LASTEXITCODE -eq 0)
	} catch {
	    $gpuTestOk = $false
	} finally {
	    $ErrorActionPreference = $savedErrorPreference
	}

	if ($gpuTestOk) {
	    Write-Ok "Docker GPU (NVIDIA Container Toolkit) is working"
	} else {
	    Write-Warn "Docker GPU test failed. Ensure Docker Desktop has WSL2 backend enabled."
	    Write-Host " Docker Desktop -> Settings -> General -> Use WSL2 based engine = ON" -ForegroundColor Yellow
	    Write-Host " Docker Desktop -> Settings -> Resources -> WSL Integration -> Enable" -ForegroundColor Yellow
	    Read-Host "Fix the settings and press Enter to continue"
	}

	# ── Step 3b: Open firewall for vLLM port ────────────────
	# Creates an inbound TCP allow rule for the vLLM port on the Private and
	# Domain profiles unless a rule with the same display name already exists.
	Write-Step "Configuring firewall..."
	$fwRule = "MAC vLLM Worker ($VllmPort)"
	$existing = Get-NetFirewallRule -DisplayName $fwRule -ErrorAction SilentlyContinue
	if (-not $existing) {
	    # Plain pipe to Out-Null: a backslash in front of it would be passed to
	    # New-NetFirewallRule as an extra positional argument.
	    New-NetFirewallRule -DisplayName $fwRule -Direction Inbound -Protocol TCP -LocalPort $VllmPort -Action Allow -Profile Private,Domain | Out-Null
	    Write-Ok "Firewall rule created for port $VllmPort"
	} else {
	    Write-Ok "Firewall rule already exists for port $VllmPort"
	}

	# ── Step 4: Get deployment parameters ───────────────────
	# Prompts for every value that was not supplied as a script parameter.
	# Typed or pasted answers are trimmed so stray whitespace does not end up
	# in the generated configuration.
	Write-Step "Configuring worker node..."

	if (-not $NodeName) {
	    $hostname = $env:COMPUTERNAME
	    $NodeName = "$(Read-Host "Enter node name (default: worker-$hostname)")".Trim()
	    if (-not $NodeName) { $NodeName = "worker-$hostname" }
	}

	if (-not $EnrollmentToken) {
	    Write-Host "`nYou need an enrollment token from the MAC admin panel." -ForegroundColor Yellow
	    Write-Host "Ask the admin to generate one at: http://$ControlNodeIP/admin -> Cluster -> Generate Token`n" -ForegroundColor Yellow
	    $EnrollmentToken = "$(Read-Host "Paste enrollment token")".Trim()
	    if (-not $EnrollmentToken) {
	        Write-Host "ERROR: Enrollment token is required" -ForegroundColor Red
	        exit 1
	    }
	}

	if (-not $Model) {
	    $defaultModel = "Qwen/Qwen2.5-Coder-7B-Instruct-AWQ"
	    Write-Host "`nChoose a model for this worker:" -ForegroundColor Yellow
	    Write-Host " [1] Qwen2.5-Coder-7B — Code generation & debugging (recommended for PC2)" -ForegroundColor White
	    Write-Host " [2] DeepSeek-R1-7B — Math & reasoning (recommended for PC3)" -ForegroundColor White
	    Write-Host " [3] Qwen2.5-7B-Instruct — General chat (same as PC1)" -ForegroundColor White
	    Write-Host " [4] Custom model — Enter HuggingFace model name" -ForegroundColor White
	    $choice = "$(Read-Host "Select (1-4)")".Trim()
	    switch ($choice) {
	        "1" { $Model = $defaultModel }
	        "2" { $Model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" }
	        "3" { $Model = "Qwen/Qwen2.5-7B-Instruct-AWQ" }
	        "4" { $Model = "$(Read-Host "Enter full HuggingFace model name")".Trim() }
	        default { $Model = $defaultModel }
	    }
	    if (-not $Model) {
	        # An empty custom name would produce a configuration without a
	        # model; fall back the same way an unrecognised choice does.
	        Write-Warn "No model name entered; using $defaultModel"
	        $Model = $defaultModel
	    }
	}

	# Get local IP
	# Preference order: DHCP address on the "Wi-Fi" adapter, any DHCP address,
	# then the source address Windows would use to reach the control node
	# (covers machines with a statically configured address).
	$dhcpAddresses = @(Get-NetIPAddress -AddressFamily IPv4 | Where-Object { $_.PrefixOrigin -eq "Dhcp" })
	$localIP = ($dhcpAddresses | Where-Object { $_.InterfaceAlias -like "Wi-Fi" } | Select-Object -First 1).IPAddress
	if (-not $localIP) {
	    $localIP = ($dhcpAddresses | Select-Object -First 1).IPAddress
	}
	if (-not $localIP) {
	    try {
	        # Find-NetRoute emits an address object and a route object; keep the
	        # first result that actually carries an IPAddress.
	        $localIP = (Find-NetRoute -RemoteIPAddress $ControlNodeIP -ErrorAction Stop |
	            Where-Object { $_.IPAddress } | Select-Object -First 1).IPAddress
	    } catch {
	        $localIP = $null
	    }
	}
	if ($localIP) {
	    Write-Ok "This PC's IP: $localIP"
	} else {
	    Write-Warn "Could not determine this PC's IP address"
	}

	# Get CPU cores and RAM
	# Win32_Processor returns one instance per socket, so sum across instances
	# to get a single number on multi-socket machines.
	$cpuCores = [int](Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum
	$ramMB = [math]::Round((Get-CimInstance Win32_ComputerSystem).TotalPhysicalMemory / 1MB)
	# nvidia-smi prints one CSV line per GPU; the first GPU is the one described.
	$gpuInfo = "$(& nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | Select-Object -First 1)".Trim()
	$gpuFields = $gpuInfo -split ","
	$gpuName = $gpuFields[0].Trim()
	# memory.total is reported like "12288 MiB"; keep the digits only.
	$gpuVram = [int]($gpuFields[1].Trim() -replace '[^0-9]','')

	Write-Host "`n Configuration Summary:" -ForegroundColor Cyan
	Write-Host " Node Name: $NodeName"
	Write-Host " Control Node: $ControlNodeIP"
	Write-Host " This PC IP: $localIP"
	Write-Host " GPU: $gpuName ($gpuVram MB)"
	Write-Host " RAM: $ramMB MB"
	Write-Host " CPU Cores: $cpuCores"
	Write-Host " Model: $Model"
	Write-Host " vLLM Port: $VllmPort"

	# ── Step 5: Create worker directory ─────────────────────
	Write-Step "Setting up worker directory..."
	$workerDir = "$env:USERPROFILE\mac-worker"
	if (-not (Test-Path $workerDir)) { New-Item -ItemType Directory -Path $workerDir | Out-Null }

	# Write .env file
	# Keys start at column 0 and the closing "@ is the first thing on its line
	# (PowerShell requires that for a here-string terminator).
	# NODE_IP carries the host's LAN address into the containers: a process
	# inside a container only sees its Docker bridge address on its own.
	$envContent = @"
# MAC Worker Node Configuration
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")

CONTROL_NODE_URL=http://${ControlNodeIP}:8000
ENROLLMENT_TOKEN=${EnrollmentToken}
NODE_NAME=${NodeName}
NODE_IP=${localIP}
VLLM_MODEL=${Model}
VLLM_PORT=${VllmPort}
GPU_MEM_UTIL=${GpuMemUtil}
MAX_MODEL_LEN=${MaxModelLen}
GPU_NAME=${gpuName}
GPU_VRAM_MB=${gpuVram}
RAM_TOTAL_MB=${ramMB}
CPU_CORES=${cpuCores}
HEARTBEAT_INTERVAL=30
"@

	# Write UTF-8 without BOM and with LF line endings. Set-Content on Windows
	# PowerShell defaults to the ANSI code page, which mangles non-ASCII values,
	# and CRLF can leak a trailing carriage return into values read on Linux.
	$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
	[System.IO.File]::WriteAllText("$workerDir\.env", (($envContent -replace "`r`n", "`n") + "`n"), $utf8NoBom)
	Write-Ok "Created .env at $workerDir\.env"

	# Write docker-compose.yml
	# Two services on one bridge network: `vllm` (GPU inference server) and
	# `worker-agent` (started once vllm reports healthy). Back-tick escaped
	# `${...}` references are left for docker compose to substitute from .env;
	# unescaped ones are expanded by PowerShell right here.
	# The YAML is kept at column 0 with space indentation (YAML forbids tabs
	# for indentation) and the closing "@ is the first thing on its line.
	$composeContent = @"
# MAC GPU Worker Node — $NodeName
# Model: $Model
# Generated: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")

services:

  vllm:
    image: vllm/vllm-openai:latest
    container_name: mac-vllm-worker
    ports:
      - "${VllmPort}:${VllmPort}"
    environment:
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf-cache:/root/.cache/huggingface
    command: >
      --model `${VLLM_MODEL}
      --port `${VLLM_PORT}
      --gpu-memory-utilization `${GPU_MEM_UTIL}
      --max-model-len `${MAX_MODEL_LEN}
      --trust-remote-code
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    networks:
      - worker-net
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:`${VLLM_PORT}/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s

  worker-agent:
    image: python:3.11-slim
    container_name: mac-worker-agent
    env_file: .env
    volumes:
      - ./worker-agent.py:/app/agent.py:ro
      - agent-state:/tmp
    command: >
      bash -c "pip install httpx psutil --quiet && python /app/agent.py"
    depends_on:
      vllm:
        condition: service_healthy
    restart: unless-stopped
    networks:
      - worker-net

volumes:
  hf-cache:
  agent-state:

networks:
  worker-net:
    driver: bridge
"@

	# UTF-8 without BOM, LF endings: the header comment contains an em dash,
	# which Set-Content on Windows PowerShell would write in the ANSI code page,
	# giving bytes that are not valid UTF-8 for the YAML parser.
	$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
	[System.IO.File]::WriteAllText("$workerDir\docker-compose.yml", (($composeContent -replace "`r`n", "`n") + "`n"), $utf8NoBom)
	Write-Ok "Created docker-compose.yml"

	# Write worker-agent.py from the embedded source below.
	# The literal here-string keeps the Python text verbatim; it sits at column
	# 0 because Python indentation is significant and the closing '@ must be the
	# first thing on its line.
	$agentScript = @'
#!/usr/bin/env python3
"""MAC Worker Agent — Enrolls with control node and sends periodic heartbeats."""

import asyncio, json, os, socket, subprocess, sys, time
import httpx

CONTROL_URL = os.environ.get("CONTROL_NODE_URL", "http://10.10.13.30:8000")
ENROLLMENT_TOKEN = os.environ.get("ENROLLMENT_TOKEN", "")
NODE_NAME = os.environ.get("NODE_NAME", f"worker-{socket.gethostname()}")
VLLM_PORT = int(os.environ.get("VLLM_PORT", 8001))
GPU_NAME = os.environ.get("GPU_NAME", "NVIDIA GPU")
GPU_VRAM_MB = int(os.environ.get("GPU_VRAM_MB", 12288))
RAM_TOTAL_MB = int(os.environ.get("RAM_TOTAL_MB", 16384))
CPU_CORES = int(os.environ.get("CPU_CORES", 8))
HEARTBEAT_INTERVAL = int(os.environ.get("HEARTBEAT_INTERVAL", 30))

# Address to advertise at enrollment. When unset, socket-based detection is
# used, which inside a container yields the container's bridge address.
NODE_IP = os.environ.get("NODE_IP", "").strip()
# Host name of the vLLM server as seen from this process. The agent runs in
# its own container, so "localhost" is the agent itself; the compose service
# name "vllm" resolves on the shared worker-net network.
VLLM_HOST = os.environ.get("VLLM_HOST", "vllm")

API = f"{CONTROL_URL}/api/v1"
STATE_FILE = "/tmp/mac_worker_state.json"


def get_local_ip():
    """Return the source address of the default outbound route, or 127.0.0.1."""
    try:
        # connect() on a UDP socket sends no packet; it only selects a route.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception:
        return "127.0.0.1"


def load_state():
    """Load persisted agent state; a missing or corrupt file yields {}."""
    try:
        with open(STATE_FILE, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def save_state(data):
    """Persist agent state as JSON to STATE_FILE."""
    with open(STATE_FILE, "w") as f:
        json.dump(data, f)


def get_resource_metrics():
    """Collect CPU/RAM (psutil) and GPU (nvidia-smi) usage, best effort.

    Any metric whose source is unavailable keeps its zero default.
    """
    metrics = {"cpu_util_pct": 0.0, "ram_used_mb": 0, "gpu_util_pct": 0.0, "gpu_vram_used_mb": 0}
    try:
        import psutil
        metrics["cpu_util_pct"] = psutil.cpu_percent(interval=1)
        metrics["ram_used_mb"] = int(psutil.virtual_memory().used / 1024 / 1024)
    except ImportError:
        pass
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            parts = result.stdout.strip().split(",")
            if len(parts) >= 2:
                metrics["gpu_util_pct"] = float(parts[0].strip())
                metrics["gpu_vram_used_mb"] = int(float(parts[1].strip()))
    except Exception:
        # Deliberately best effort: nvidia-smi may be missing, time out, or
        # print something unparsable.
        pass
    return metrics


async def enroll(client):
    """Enroll with the control node and return the node id, or None on failure.

    A node id already stored in the state file is reused without contacting
    the control node.
    """
    state = load_state()
    if state.get("node_id"):
        print(f"[AGENT] Already enrolled as node {state['node_id']}")
        return state["node_id"]
    if not ENROLLMENT_TOKEN:
        print("[AGENT] ERROR: No ENROLLMENT_TOKEN set")
        return None
    # Prefer the explicitly configured address over auto-detection.
    ip = NODE_IP or get_local_ip()
    payload = {
        "enrollment_token": ENROLLMENT_TOKEN, "name": NODE_NAME,
        "hostname": socket.gethostname(), "ip_address": ip, "port": VLLM_PORT,
        "gpu_name": GPU_NAME, "gpu_vram_mb": GPU_VRAM_MB,
        "ram_total_mb": RAM_TOTAL_MB, "cpu_cores": CPU_CORES,
    }
    try:
        resp = await client.post(f"{API}/nodes/enroll", json=payload)
        if resp.status_code == 200:
            data = resp.json()
            node_id = data.get("id")
            save_state({"node_id": node_id, "name": NODE_NAME})
            print(f"[AGENT] Enrolled! Node ID: {node_id}")
            return node_id
        else:
            print(f"[AGENT] Enrollment failed: {resp.status_code} {resp.text}")
            return None
    except httpx.RequestError as e:
        print(f"[AGENT] Connection error: {e}")
        return None


async def heartbeat_loop(client, node_id):
    """Post metrics every HEARTBEAT_INTERVAL seconds.

    Returns only when the control node answers 404 for this node; the stored
    state is cleared first so the caller can enroll again.
    """
    consecutive_failures = 0
    while True:
        try:
            metrics = get_resource_metrics()
            resp = await client.post(f"{API}/nodes/heartbeat/{node_id}", json=metrics)
            if resp.status_code == 200:
                consecutive_failures = 0
                warnings = resp.json().get("warnings", [])
                if warnings:
                    print(f"[AGENT] Warnings: {warnings}")
            elif resp.status_code == 404:
                print("[AGENT] Node not found — re-enrolling...")
                save_state({})
                return
            else:
                consecutive_failures += 1
                print(f"[AGENT] Heartbeat rejected: HTTP {resp.status_code}")
        except httpx.RequestError as e:
            consecutive_failures += 1
            print(f"[AGENT] Heartbeat error: {e}")
        if consecutive_failures >= 10:
            print("[AGENT] Too many failures, waiting 60s...")
            await asyncio.sleep(60)
            consecutive_failures = 0
        else:
            await asyncio.sleep(HEARTBEAT_INTERVAL)


async def wait_for_vllm():
    """Poll the vLLM health endpoint every 5s for up to 10 minutes.

    Returns True once the endpoint answers 200, False after the timeout.
    """
    print(f"[AGENT] Waiting for vLLM at {VLLM_HOST}:{VLLM_PORT}...")
    async with httpx.AsyncClient(timeout=5) as client:
        for _ in range(120):
            try:
                resp = await client.get(f"http://{VLLM_HOST}:{VLLM_PORT}/health")
                if resp.status_code == 200:
                    print("[AGENT] vLLM ready!")
                    return True
            except httpx.RequestError:
                pass
            await asyncio.sleep(5)
    print("[AGENT] WARNING: vLLM not ready after 10 min")
    return False


async def main():
    """Wait for vLLM, enroll (retrying every 30s), then heartbeat forever."""
    print(f"[AGENT] MAC Worker Agent — {NODE_NAME}")
    print(f"[AGENT] Control: {CONTROL_URL}")
    await wait_for_vllm()
    async with httpx.AsyncClient(timeout=30) as client:
        node_id = None
        while not node_id:
            node_id = await enroll(client)
            if not node_id:
                print("[AGENT] Retrying in 30s...")
                await asyncio.sleep(30)
        print(f"[AGENT] Starting heartbeat loop (every {HEARTBEAT_INTERVAL}s)")
        while True:
            await heartbeat_loop(client, node_id)
            # heartbeat_loop only returns when the node must enroll again.
            node_id = None
            while not node_id:
                node_id = await enroll(client)
                if not node_id:
                    await asyncio.sleep(30)

if __name__ == "__main__":
    asyncio.run(main())
'@

	# UTF-8 without BOM, LF endings: the script contains em dashes, and Python
	# rejects a source file whose bytes are not valid UTF-8 (which is what
	# Set-Content's ANSI default on Windows PowerShell would produce).
	$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
	[System.IO.File]::WriteAllText("$workerDir\worker-agent.py", (($agentScript -replace "`r`n", "`n") + "`n"), $utf8NoBom)
	Write-Ok "Created worker-agent.py"

	# ── Step 6: Launch! ─────────────────────────────────────
	# Optionally starts the compose project. Any answer other than "n"
	# (case-insensitive) starts it.
	Write-Step "Starting MAC worker..."
	Write-Host "`n This will pull the vLLM Docker image (~8GB) and the model." -ForegroundColor Yellow
	Write-Host " First run may take 15-30 minutes depending on internet speed.`n" -ForegroundColor Yellow

	$startNow = Read-Host "Start the worker now? (Y/n)"
	if ("$startNow".Trim() -ne "n") {
	    Push-Location $workerDir
	    try {
	        docker compose up -d
	        # Native commands do not raise PowerShell errors on failure, so the
	        # exit code has to be checked explicitly.
	        $composeExit = $LASTEXITCODE
	    } finally {
	        # Always restore the caller's working directory.
	        Pop-Location
	    }

	    if ($composeExit -ne 0) {
	        Write-Host "ERROR: 'docker compose up -d' failed (exit code $composeExit)." -ForegroundColor Red
	        Write-Host "Fix the problem, then run it again from: $workerDir" -ForegroundColor Yellow
	        exit 1
	    }

	    Write-Host "`n" -NoNewline
	    Write-Ok "Worker is starting! Monitor with:"
	    Write-Host " cd $workerDir" -ForegroundColor White
	    Write-Host " docker compose logs -f # Watch all logs" -ForegroundColor White
	    Write-Host " docker compose logs -f vllm # Watch model loading" -ForegroundColor White
	    Write-Host " docker compose logs -f worker-agent # Watch enrollment" -ForegroundColor White
	} else {
	    Write-Ok "Setup complete. When ready, run:"
	    Write-Host " cd $workerDir" -ForegroundColor White
	    Write-Host " docker compose up -d" -ForegroundColor White
	}

	# Closing summary banner. The closing "@ must be the first characters on
	# its line, otherwise PowerShell does not recognise it as the terminator.
	Write-Host @"

	╔══════════════════════════════════════════════════╗
	║ Setup complete! Worker: $NodeName
	║ Model: $Model
	║ Control: http://${ControlNodeIP}:8000
	║ ║
	║ The worker will auto-enroll with the control ║
	║ node and start sending heartbeats. ║
	╚══════════════════════════════════════════════════╝

"@ -ForegroundColor Green