# AntiAtropos / deploy / aws / deploy-all.ps1
# div18
# feat: implement Kubernetes executor for automated cluster scaling and infrastructure management
# cf2697b
# AntiAtropos - One-Run Deploy Script
# Deploys entire AWS infrastructure: EKS cluster, workloads, AMP, Prometheus, Grafana
# Treat PowerShell-level errors as terminating for the whole script.
$ErrorActionPreference = "Stop"

# $PSNativeCommandUseErrorActionPreference only exists on newer PowerShell releases.
# When it is present, turn it off so native commands (aws, eksctl, kubectl, helm) are
# not subject to $ErrorActionPreference; this script inspects $LASTEXITCODE itself.
if (Get-Variable -Name PSNativeCommandUseErrorActionPreference -ErrorAction SilentlyContinue) {
    $PSNativeCommandUseErrorActionPreference = $false
}

# Fixed deployment targets.
$ClusterName = "antiatropos"
$Region = "ap-south-1"

# Directory containing this script; the YAML/values files are resolved relative to it.
$AwsDir = Split-Path -Path $MyInvocation.MyCommand.Path -Parent

# Requested Grafana mode comes from ANTIATROPOS_GRAFANA_MODE (trimmed, lower-cased);
# an unset or blank value means "auto".
$GrafanaMode = "auto"
if (-not [string]::IsNullOrWhiteSpace($env:ANTIATROPOS_GRAFANA_MODE)) {
    $GrafanaMode = $env:ANTIATROPOS_GRAFANA_MODE.Trim().ToLowerInvariant()
}

# Effective mode; recomputed once the nodegroup instance type is known.
$GrafanaModeResolved = "cluster"
function Invoke-CheckedCommand {
    <#
    .SYNOPSIS
    Runs a script block and throws if the native command inside it exits non-zero.

    .DESCRIPTION
    The script block is invoked with $ErrorActionPreference temporarily set to
    "Continue", so error records produced while it runs do not abort the script;
    success is decided solely by $LASTEXITCODE afterwards.

    .PARAMETER Command
    Script block to execute. Anything it writes to the output stream is passed through.

    .PARAMETER ErrorMessage
    Message thrown when $LASTEXITCODE is non-zero after the script block finishes.
    #>
    param(
        [ScriptBlock]$Command,
        [string]$ErrorMessage
    )

    if ($null -eq $Command) {
        throw "Invoke-CheckedCommand requires a -Command script block"
    }

    # $LASTEXITCODE is only written when a native executable runs. Reset it so an exit
    # code left over from an earlier command is never attributed to this one.
    $global:LASTEXITCODE = 0

    $previousErrorActionPreference = $ErrorActionPreference
    $ErrorActionPreference = "Continue"
    try {
        & $Command
    } finally {
        $ErrorActionPreference = $previousErrorActionPreference
    }

    if ($LASTEXITCODE -ne 0) {
        throw $ErrorMessage
    }
}
function Get-EksClusterStatus {
    <#
    .SYNOPSIS
    Returns the EKS cluster status string, or $null when it cannot be obtained.

    .PARAMETER Name
    Cluster name passed to `aws eks describe-cluster --name`.

    .PARAMETER AwsRegion
    Region passed to `aws eks describe-cluster --region`.

    .OUTPUTS
    The trimmed `cluster.status` text, or $null if the call throws, exits non-zero,
    or yields blank / "None" output.
    #>
    param(
        [string]$Name,
        [string]$AwsRegion
    )

    $rawStatus = $null
    try {
        $rawStatus = aws eks describe-cluster --name $Name --region $AwsRegion --query 'cluster.status' --output text 2>$null
    } catch {
        # Any error raised while invoking the CLI is treated as "status unknown".
        return $null
    }

    $callFailed = $LASTEXITCODE -ne 0
    $valueMissing = [string]::IsNullOrWhiteSpace($rawStatus) -or $rawStatus -eq "None"
    if ($callFailed -or $valueMissing) {
        return $null
    }

    return $rawStatus.Trim()
}
function Test-EksNodegroupExists {
    <#
    .SYNOPSIS
    Reports whether `aws eks describe-nodegroup` succeeds for the given nodegroup.

    .PARAMETER Cluster
    Cluster name passed as --cluster-name.

    .PARAMETER Nodegroup
    Nodegroup name passed as --nodegroup-name.

    .PARAMETER AwsRegion
    Region passed as --region.

    .OUTPUTS
    $true when the CLI exits with code 0; $false when it exits non-zero or an
    error is raised while invoking it.
    #>
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    $found = $false
    try {
        # Output is irrelevant here; only the exit code is inspected.
        $null = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.nodegroupName' --output text 2>$null
        $found = ($LASTEXITCODE -eq 0)
    } catch {
        $found = $false
    }
    return $found
}
function Get-EksNodegroupInstanceType {
    <#
    .SYNOPSIS
    Returns the first instance type listed for an EKS nodegroup, or $null.

    .PARAMETER Cluster
    Cluster name passed as --cluster-name.

    .PARAMETER Nodegroup
    Nodegroup name passed as --nodegroup-name.

    .PARAMETER AwsRegion
    Region passed as --region.

    .OUTPUTS
    The trimmed `nodegroup.instanceTypes[0]` text, or $null if the call throws,
    exits non-zero, or yields blank / "None" output.
    #>
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    $rawType = $null
    try {
        $rawType = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.instanceTypes[0]' --output text 2>$null
    } catch {
        return $null
    }

    $callFailed = $LASTEXITCODE -ne 0
    $valueMissing = [string]::IsNullOrWhiteSpace($rawType) -or $rawType -eq "None"
    if ($callFailed -or $valueMissing) {
        return $null
    }

    return $rawType.Trim()
}
function Get-NodegroupSubnetSelection {
    <#
    .SYNOPSIS
    Chooses the subnets to use when creating a nodegroup for a cluster.

    .DESCRIPTION
    Reads the cluster's subnet IDs, asks EC2 which of them have MapPublicIpOnLaunch
    enabled, and prefers those "public" subnets. When none qualify, the remaining
    subnets are returned as "private" with private networking requested.

    .PARAMETER Cluster
    Cluster name passed to `aws eks describe-cluster --name`.

    .PARAMETER AwsRegion
    Region used for both the EKS and the EC2 call.

    .OUTPUTS
    PSCustomObject with SubnetCsv (comma-joined IDs), UsePrivateNetworking (bool)
    and SubnetType ("public" or "private").

    .NOTES
    Throws when the subnet list cannot be read, cannot be classified, or is empty.
    #>
    param(
        [string]$Cluster,
        [string]$AwsRegion
    )

    try {
        $allSubnetIds = aws eks describe-cluster --name $Cluster --region $AwsRegion --query 'cluster.resourcesVpcConfig.subnetIds' --output text 2>$null
    } catch {
        throw "Failed to read cluster subnet IDs"
    }
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($allSubnetIds)) {
        throw "Failed to read cluster subnet IDs"
    }

    # Text output is whitespace separated; drop empty fragments.
    $subnetArray = @($allSubnetIds -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    if ($subnetArray.Count -eq 0) {
        throw "No subnets found for cluster '$Cluster' in region '$AwsRegion'"
    }

    # JMESPath literals must be wrapped in backticks. A bare `true` is parsed as a field
    # name, which makes the filter match nothing and hides every public subnet.
    # Single quotes keep the backticks literal in PowerShell.
    $publicSubnetQuery = 'Subnets[?MapPublicIpOnLaunch==`true`].SubnetId'

    $describeSubnetArgs = @('ec2', 'describe-subnets', '--region', $AwsRegion, '--subnet-ids')
    $describeSubnetArgs += $subnetArray
    $describeSubnetArgs += @('--query', $publicSubnetQuery, '--output', 'text')

    try {
        $publicSubnetIdsText = & aws @describeSubnetArgs 2>$null
    } catch {
        throw "Failed to classify cluster subnets"
    }
    if ($LASTEXITCODE -ne 0) {
        throw "Failed to classify cluster subnets"
    }

    $publicSubnetIds = @($publicSubnetIdsText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) -and $_ -ne "None" })
    if ($publicSubnetIds.Count -gt 0) {
        return [PSCustomObject]@{
            SubnetCsv            = ($publicSubnetIds -join ',')
            UsePrivateNetworking = $false
            SubnetType           = "public"
        }
    }

    # No public subnets: everything the cluster has is treated as private.
    $privateSubnetIds = @($subnetArray | Where-Object { $publicSubnetIds -notcontains $_ })
    if ($privateSubnetIds.Count -gt 0) {
        return [PSCustomObject]@{
            SubnetCsv            = ($privateSubnetIds -join ',')
            UsePrivateNetworking = $true
            SubnetType           = "private"
        }
    }

    throw "Could not determine valid subnets for nodegroup creation"
}
function Get-ReadyNodeCount {
    <#
    .SYNOPSIS
    Counts the nodes that `kubectl get nodes` currently reports as Ready.

    .OUTPUTS
    Number of output lines containing " Ready " surrounded by whitespace. Lines showing
    NotReady or "Ready,SchedulingDisabled" do not match that pattern. Returns 0 when
    kubectl produces no output or an error is raised while invoking it.
    #>
    try {
        # The request timeout keeps a single poll from blocking on an unreachable API server.
        $nodeLines = kubectl get nodes --no-headers --request-timeout=10s 2>$null
    } catch {
        # Callers poll in a loop, so a failed invocation counts as "nothing ready yet"
        # instead of aborting the wait.
        return 0
    }
    if (-not $nodeLines) {
        return 0
    }
    return (@($nodeLines | Select-String -Pattern '\sReady\s').Count)
}
function Wait-ForReadyNodes {
    <#
    .SYNOPSIS
    Polls Get-ReadyNodeCount until enough nodes are Ready or the time budget is spent.

    .PARAMETER MinimumReadyNodes
    Number of Ready nodes that ends the wait.

    .PARAMETER TimeoutSeconds
    Overall budget in seconds (default 600). It is divided into 10-second polls,
    rounded up.

    .NOTES
    Throws "Timed out waiting for <n> Ready nodes" when the budget is exhausted.
    #>
    param(
        [int]$MinimumReadyNodes,
        [int]$TimeoutSeconds = 600
    )

    $pollsLeft = [Math]::Ceiling($TimeoutSeconds / 10)
    while ($pollsLeft -gt 0) {
        $readyNow = Get-ReadyNodeCount
        Write-Host "Nodes ready: $readyNow (target: $MinimumReadyNodes)"
        if ($readyNow -ge $MinimumReadyNodes) {
            return
        }
        Start-Sleep -Seconds 10
        $pollsLeft--
    }

    throw "Timed out waiting for $MinimumReadyNodes Ready nodes"
}
# Start-up banner showing the deployment target.
$bannerRule = "=========================================="
Write-Host ""
Write-Host $bannerRule -ForegroundColor Cyan
Write-Host " AntiAtropos AWS Infrastructure Deploy" -ForegroundColor Cyan
Write-Host $bannerRule -ForegroundColor Cyan
Write-Host "Region: $Region"
Write-Host "Cluster: $ClusterName"
Write-Host ""

# Check prerequisites: every CLI the script shells out to must be resolvable.
$missing = @(
    @("aws", "eksctl", "kubectl", "helm") |
        Where-Object { -not (Get-Command $_ -ErrorAction SilentlyContinue) }
)
if ($missing.Count -gt 0) {
    Write-Host "ERROR: Missing: $($missing -join ', ')" -ForegroundColor Red
    exit 1
}
# Phase 1: Create EKS Cluster
# Creates the control plane only (node groups are handled in Phase 2), or waits for an
# existing cluster to settle, then points kubectl at it.
Write-Host ">>> Phase 1: Creating EKS cluster..." -ForegroundColor Yellow
$clusterStatus = Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region
if ($clusterStatus -eq "DELETING") {
    Write-Host "Cluster is currently deleting. Waiting for deletion to complete..." -ForegroundColor Yellow
    Invoke-CheckedCommand -Command { aws eks wait cluster-deleted --name $ClusterName --region $Region } -ErrorMessage "Failed while waiting for cluster deletion"
    $clusterStatus = $null
}
if (-not $clusterStatus) {
    $TempConfig = Join-Path $AwsDir "eksctl-cluster-only.yaml"
    $ClusterYaml = Get-Content (Join-Path $AwsDir "eksctl-cluster.yaml") -Raw
    # Drop everything from "managedNodeGroups:" to the end of the file so eksctl
    # creates the cluster without node groups.
    $ClusterOnlyYaml = $ClusterYaml -replace '(?s)(managedNodeGroups:.*)', ''
    try {
        $ClusterOnlyYaml | Out-File -FilePath $TempConfig -Encoding utf8
        Invoke-CheckedCommand -Command { eksctl create cluster -f $TempConfig } -ErrorMessage "Failed to create EKS cluster"
    } finally {
        # Remove the generated config even when creation fails; stay quiet if it is
        # already gone so a cleanup error cannot mask the real failure.
        Remove-Item $TempConfig -Force -ErrorAction SilentlyContinue
    }
    Write-Host "Cluster created" -ForegroundColor Green
} else {
    if ($clusterStatus -eq "CREATING") {
        Write-Host "Cluster creation in progress. Waiting until ACTIVE..." -ForegroundColor Yellow
        Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster did not become active"
        # The waiter only returns successfully once the cluster is ACTIVE, so report
        # that instead of the stale CREATING value read before the wait.
        $clusterStatus = "ACTIVE"
    }
    Write-Host "Cluster already exists (status: $clusterStatus)" -ForegroundColor Green
}
Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster is not active"
Invoke-CheckedCommand -Command { aws eks update-kubeconfig --name $ClusterName --region $Region | Out-Null } -ErrorMessage "Failed to update kubeconfig"
# Phase 2: Create Nodegroup
# Creates the managed nodegroup when it does not exist yet, then waits until it is active.
Write-Host ""
Write-Host ">>> Phase 2: Ensuring compute nodegroup..." -ForegroundColor Yellow
$NodegroupName = "linux-nodes"
$PreferredInstanceType = "t3.micro"
$ngExists = Test-EksNodegroupExists -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
if (-not $ngExists) {
    $SubnetSelection = Get-NodegroupSubnetSelection -Cluster $ClusterName -AwsRegion $Region
    $SubnetCsv = $SubnetSelection.SubnetCsv
    $UsePrivateNetworking = [bool]$SubnetSelection.UsePrivateNetworking
    Write-Host "Using $($SubnetSelection.SubnetType) subnets: $SubnetCsv"

    # Use a dedicated variable for the argument list: $args is an automatic variable
    # and should not be assigned to.
    $createNodegroupArgs = @(
        'create', 'nodegroup',
        '--cluster', $ClusterName,
        '--region', $Region,
        '--name', $NodegroupName,
        '--node-type', $PreferredInstanceType,
        '--nodes', '4',
        '--nodes-min', '2',
        '--nodes-max', '8',
        '--node-volume-size', '20',
        '--subnet-ids', $SubnetCsv
    )
    if ($UsePrivateNetworking) {
        $createNodegroupArgs += '--node-private-networking'
    }
    Invoke-CheckedCommand -Command { eksctl @createNodegroupArgs } -ErrorMessage "Failed to create nodegroup '$NodegroupName'"
    Write-Host "Nodegroup created" -ForegroundColor Green
} else {
    $existingInstanceType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    Write-Host "Nodegroup already exists ($existingInstanceType)" -ForegroundColor Green
}
Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active"
# Resolve the effective Grafana mode. "cluster" is the fallback for any value that is
# not handled below.
$externalGrafanaModes = @("external", "local", "hf")
$GrafanaModeResolved = "cluster"
if ($GrafanaMode -in @("auto", "")) {
    # Auto mode: t3.micro nodes get external Grafana, any other type runs it in-cluster.
    $effectiveNodeType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    if ($effectiveNodeType -eq "t3.micro") {
        $GrafanaModeResolved = "external"
    }
} elseif ($GrafanaMode -in $externalGrafanaModes) {
    $GrafanaModeResolved = "external"
}
Write-Host "Grafana mode: $GrafanaModeResolved" -ForegroundColor Cyan
# Wait (up to 60 polls, 10 seconds apart) until at least one node reports Ready.
# Seeing node lines is not enough: freshly registered nodes start out NotReady.
Write-Host "Waiting for nodes..."
$readyCount = 0
for ($i = 0; $i -lt 60; $i++) {
    $nodes = $null
    try {
        $nodes = kubectl get nodes --no-headers --request-timeout=10s 2>$null
    } catch {
        # Treat a failed invocation like an empty answer and poll again.
        $nodes = $null
    }
    if ($nodes) {
        $readyCount = @($nodes | Select-String -Pattern '\sReady\s').Count
    }
    if ($readyCount -gt 0) {
        Write-Host "Nodes ready: $readyCount" -ForegroundColor Green
        break
    }
    Start-Sleep -Seconds 10
}
if ($readyCount -eq 0) {
    # Deployment stays best-effort, but make the timeout visible instead of silent.
    Write-Host "WARNING: no node reported Ready within the wait window; continuing anyway" -ForegroundColor Yellow
}
# Phase 3: Deploy Workloads
# Ensures the prod-sre namespace exists and applies the workload manifest. Both steps
# are exit-code checked so "Workloads deployed" is only printed on success.
Write-Host ""
Write-Host ">>> Phase 3: Deploying workloads..." -ForegroundColor Yellow
Invoke-CheckedCommand -Command { kubectl create namespace prod-sre --dry-run=client -o yaml | kubectl apply -f - | Out-Null } -ErrorMessage "Failed to ensure namespace 'prod-sre'"
$WorkloadsManifest = Join-Path $AwsDir "k8s-workloads.yaml"
Invoke-CheckedCommand -Command { kubectl apply -f $WorkloadsManifest | Out-Null } -ErrorMessage "Failed to apply workloads manifest"
Write-Host "Workloads deployed" -ForegroundColor Green
# Phase 4: Create AMP Workspace
# Reuses the workspace whose alias is exactly "antiatropos-metrics", or creates one.
Write-Host ""
Write-Host ">>> Phase 4: Creating AMP workspace..." -ForegroundColor Yellow
$AmpAlias = "antiatropos-metrics"
$AmpWsId = $null
try {
    # --alias is a prefix filter, so the JMESPath query additionally requires an exact
    # alias match before taking the first hit.
    $AmpWsId = aws amp list-workspaces --alias $AmpAlias --region $Region --query "workspaces[?alias=='$AmpAlias'] | [0].workspaceId" --output text 2>$null
    if ($LASTEXITCODE -ne 0 -or $AmpWsId -eq "None") { $AmpWsId = $null }
} catch {
    # Lookup is best-effort: fall through to creation when it cannot be completed.
    $AmpWsId = $null
}
if ([string]::IsNullOrWhiteSpace($AmpWsId)) {
    $AmpWsId = aws amp create-workspace --alias $AmpAlias --region $Region --query 'workspaceId' --output text
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($AmpWsId) -or $AmpWsId -eq "None") {
        # Without a workspace ID the remote-write URL below would be unusable.
        throw "Failed to create AMP workspace '$AmpAlias'"
    }
}
$AmpWsId = "$AmpWsId".Trim()
$AmpUrl = "https://aps-workspaces.$Region.amazonaws.com/workspaces/$AmpWsId"
Write-Host "AMP: $AmpWsId" -ForegroundColor Green
# Phase 5: Install Prometheus
# Installs/upgrades the prometheus-community chart as release "prometheus-agent" in the
# monitoring namespace, remote-writing to the AMP workspace from Phase 4.
Write-Host ""
Write-Host ">>> Phase 5: Installing Prometheus..." -ForegroundColor Yellow
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - | Out-Null
Invoke-CheckedCommand -Command { helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>$null | Out-Null } -ErrorMessage "Failed to add prometheus helm repo"
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"

$promValuesYaml = Join-Path $AwsDir "prometheus-agent-values.yaml"
$remoteWriteUrl = "$AmpUrl/api/v1/remote_write"

# Chart overrides applied on top of the values file, in the order they are passed.
$prometheusOverrides = @(
    "alertmanager.enabled=false",
    "kube-state-metrics.enabled=false",
    "prometheus-node-exporter.enabled=false",
    "pushgateway.enabled=false",
    "server.enabled=true",
    "server.persistentVolume.enabled=false",
    "server.resources.requests.cpu=50m",
    "server.resources.requests.memory=128Mi",
    "server.resources.limits.cpu=300m",
    "server.resources.limits.memory=384Mi",
    "server.global.scrape_interval=15s",
    "server.remoteWrite[0].url=$remoteWriteUrl"
)

$prometheusHelmArgs = @(
    'upgrade', '--install', 'prometheus-agent', 'prometheus-community/prometheus',
    '--namespace', 'monitoring',
    '--reset-values',
    '-f', $promValuesYaml
)
foreach ($override in $prometheusOverrides) {
    $prometheusHelmArgs += @('--set', $override)
}

Invoke-CheckedCommand -Command { helm @prometheusHelmArgs 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Prometheus"
Write-Host "Prometheus installed" -ForegroundColor Green
# Phase 6: Install Grafana
# "cluster" mode installs the grafana Helm chart into the monitoring namespace and waits
# for its rollout, with one scale-up-and-retry attempt when the pod cannot be scheduled.
# Any other mode removes an existing in-cluster Grafana release instead.
Write-Host ""
if ($GrafanaModeResolved -eq "cluster") {
Write-Host ">>> Phase 6: Installing Grafana in-cluster..." -ForegroundColor Yellow
Invoke-CheckedCommand -Command { helm repo add grafana https://grafana.github.io/helm-charts 2>$null | Out-Null } -ErrorMessage "Failed to add grafana helm repo"
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"
$GrafanaValuesYaml = Join-Path $AwsDir "grafana-values.yaml"
Invoke-CheckedCommand -Command { helm upgrade --install grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Grafana"
Write-Host "Waiting for Grafana..."
try {
# First attempt: give the deployment 120 seconds to roll out.
Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out"
} catch {
# Rollout failed: look for a Pending Grafana pod and pull scheduling-related lines
# out of its `kubectl describe` output to decide whether scaling can help.
$pendingGrafanaPod = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana --field-selector=status.phase=Pending --no-headers 2>$null | Select-Object -First 1
$pendingReason = ""
if ($pendingGrafanaPod) {
# The first whitespace-separated column of the pod listing is the pod name.
$pendingGrafanaPodName = ($pendingGrafanaPod -split '\s+')[0]
$pendingReason = kubectl describe pod $pendingGrafanaPodName -n monitoring 2>$null | Select-String -Pattern "FailedScheduling|Insufficient memory|Too many pods|unbound" -Context 0,2 | Out-String
if (-not [string]::IsNullOrWhiteSpace($pendingReason)) {
Write-Host "Grafana is pending due to scheduler constraints:" -ForegroundColor Yellow
Write-Host $pendingReason -ForegroundColor Yellow
}
}
# Only pod-count or memory pressure triggers a scale-up; any other reason (or no
# Pending pod at all) falls through to the throw in the else branch.
$shouldScale = $pendingReason -match "Too many pods|Insufficient memory"
if ($shouldScale) {
Write-Host "Scaling nodegroup to 8 nodes and retrying Grafana rollout..." -ForegroundColor Yellow
Invoke-CheckedCommand -Command { eksctl scale nodegroup --cluster $ClusterName --region $Region --name $NodegroupName --nodes 8 } -ErrorMessage "Failed to scale nodegroup"
Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active after scaling"
Write-Host "Waiting for newly scaled nodes to become Ready..." -ForegroundColor Yellow
Wait-ForReadyNodes -MinimumReadyNodes 8 -TimeoutSeconds 900
# Delete a still-Pending pod before the retry; the delete's exit code is not checked.
# NOTE(review): presumably this is to have the deployment create a replacement pod
# that is scheduled onto the new nodes - confirm.
$pendingGrafanaPodAfterScale = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana --field-selector=status.phase=Pending --no-headers 2>$null | Select-Object -First 1
if ($pendingGrafanaPodAfterScale) {
$pendingGrafanaPodNameAfterScale = ($pendingGrafanaPodAfterScale -split '\s+')[0]
kubectl delete pod $pendingGrafanaPodNameAfterScale -n monitoring 2>$null | Out-Null
}
# Second and final attempt, with a longer 600-second timeout.
Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=600s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out after scaling"
} else {
throw "Grafana rollout failed. Check: kubectl -n monitoring get pods ; kubectl -n monitoring describe pod -l app.kubernetes.io/name=grafana"
}
}
# NOTE(review): the credentials printed here are presumably the ones configured in
# grafana-values.yaml - confirm they stay in sync with that file.
Write-Host "Grafana installed (admin/antiatropos)" -ForegroundColor Green
} else {
Write-Host ">>> Phase 6: Skipping in-cluster Grafana (external mode)..." -ForegroundColor Yellow
# Detect a Helm release named exactly "grafana" in the monitoring namespace.
$grafanaRelease = ""
try {
$grafanaRelease = helm list -n monitoring --filter '^grafana$' --short 2>$null
} catch {
$grafanaRelease = ""
}
if (-not [string]::IsNullOrWhiteSpace($grafanaRelease)) {
# Best-effort cleanup: neither exit code is checked before the success message.
helm uninstall grafana -n monitoring 2>$null | Out-Null
kubectl delete pvc grafana -n monitoring 2>$null | Out-Null
Write-Host "Removed existing in-cluster Grafana release to save resources" -ForegroundColor Green
}
}
# Phase 7: Install Cluster Autoscaler
# Installs/upgrades the cluster-autoscaler chart in kube-system using the values file
# that sits next to this script.
Write-Host ""
Write-Host ">>> Phase 7: Installing Cluster Autoscaler..." -ForegroundColor Yellow
Invoke-CheckedCommand -Command { helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>$null | Out-Null } -ErrorMessage "Failed to add autoscaler helm repo"
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"

$autoscalerValues = Join-Path $AwsDir "cluster-autoscaler-values.yaml"
$autoscalerHelmArgs = @(
    'upgrade', '--install', 'cluster-autoscaler', 'autoscaler/cluster-autoscaler',
    '--namespace', 'kube-system',
    '-f', $autoscalerValues
)
Invoke-CheckedCommand -Command { helm @autoscalerHelmArgs 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Cluster Autoscaler"
Write-Host "Cluster Autoscaler installed" -ForegroundColor Green
# Phase 8: Generate Kubeconfig
# Writes a standalone kubeconfig next to this script. Authentication is delegated to
# `aws eks get-token` through an exec credential plugin entry, so the file itself holds
# only the cluster endpoint and CA bundle.
Write-Host ""
Write-Host ">>> Phase 8: Generating kubeconfig..." -ForegroundColor Yellow

$ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterEndpoint) -or $ClusterEndpoint -eq "None") {
    throw "Failed to read cluster endpoint for '$ClusterName'"
}
$ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterCa) -or $ClusterCa -eq "None") {
    throw "Failed to read cluster certificate authority data for '$ClusterName'"
}
$ClusterEndpoint = "$ClusterEndpoint".Trim()
$ClusterCa = "$ClusterCa".Trim()

$output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"

# YAML nesting is significant: each level below is indented by two more spaces.
$kubeconfigLines = @(
    "apiVersion: v1",
    "kind: Config",
    "clusters:",
    "  - cluster:",
    "      certificate-authority-data: $ClusterCa",
    "      server: $ClusterEndpoint",
    "    name: $ClusterName",
    "contexts:",
    "  - context:",
    "      cluster: $ClusterName",
    "      user: antiatropos-hf-user",
    "    name: $ClusterName",
    "current-context: $ClusterName",
    "preferences: {}",
    "users:",
    "  - name: antiatropos-hf-user",
    "    user:",
    "      exec:",
    "        apiVersion: client.authentication.k8s.io/v1beta1",
    "        command: aws",
    "        args:",
    "          - eks",
    "          - get-token",
    "          - --region",
    "          - $Region",
    "          - --cluster-name",
    "          - $ClusterName",
    "        env:",
    "          - name: AWS_STS_REGIONAL_ENDPOINTS",
    "            value: regional",
    "          - name: AWS_DEFAULT_REGION",
    "            value: $Region",
    "        interactiveMode: IfAvailable"
)
# Join with LF and terminate the final line.
$kubeconfig = ($kubeconfigLines -join "`n") + "`n"
$kubeconfig | Out-File -FilePath $output -Encoding utf8 -Force
Write-Host "Kubeconfig: $output" -ForegroundColor Green
# Done: print a closing banner followed by the key outputs of the run.
$summaryRule = "=========================================="
Write-Host ""
foreach ($bannerLine in @($summaryRule, " Deployment Complete!", $summaryRule)) {
    Write-Host $bannerLine -ForegroundColor Cyan
}
Write-Host ""
Write-Host "AMP: $AmpWsId" -ForegroundColor Yellow

# Grafana access hints depend on where Grafana runs.
$grafanaHints = @(
    "Grafana: external/local mode enabled (recommended for free-tier nodes)",
    "Use AMP endpoint as Prometheus datasource with SigV4 auth"
)
if ($GrafanaModeResolved -eq "cluster") {
    $grafanaHints = @(
        "Grafana: kubectl port-forward svc/grafana 3000 -n monitoring",
        "Login: admin / antiatropos"
    )
}
foreach ($hint in $grafanaHints) {
    Write-Host $hint -ForegroundColor Yellow
}

Write-Host "Kubeconfig: $output" -ForegroundColor Yellow
Write-Host ""