div18

feat: implement Kubernetes executor for automated cluster scaling and infrastructure management

cf2697b 23 days ago

7.72 kB

	#!/usr/bin/env bash
	# AntiAtropos AWS Infrastructure Deploy Script
	#
	# Deploys: EKS cluster, sample workloads, AMP workspace, an IRSA service
	# account plus Prometheus Agent, self-hosted Grafana (on EKS), Cluster
	# Autoscaler, and generates kubeconfig for HF Spaces.
	#
	# The AntiAtropos FastAPI server runs on Hugging Face Spaces, NOT on AWS.
	# This script only sets up the infrastructure that HF Spaces connects to.
	#
	# Prerequisites: aws cli, eksctl, kubectl, helm
	#
	# Usage:
	# chmod +x deploy/aws/deploy.sh
	# ./deploy/aws/deploy.sh
	#
	# Environment variables:
	# AWS_REGION - AWS region (default: ap-south-1)
	# CLUSTER_NAME - EKS cluster name (default: antiatropos)

	# Strict mode: abort on command failure, on use of an unset variable, and when
	# any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Deployment settings; each falls back to its default when the corresponding
	# environment variable is unset or empty.
	REGION=${AWS_REGION:-ap-south-1}
	CLUSTER_NAME=${CLUSTER_NAME:-antiatropos}

	# Absolute path of the directory containing this script; the manifests, Helm
	# values files and helper script used below are referenced relative to it.
	script_parent=$(dirname "$0")
	AWS_DIR=$(cd "$script_parent" && pwd)

	# Startup banner (trailing empty argument prints the blank separator line).
	printf '%s\n' \
	  "=== AntiAtropos AWS Infrastructure Deployment ===" \
	  "Region: $REGION" \
	  "Cluster: $CLUSTER_NAME" \
	  "FastAPI: Runs on HF Spaces (not deployed here)" \
	  ""

	# --- Check prerequisites ---
	# Verify that every required CLI is on PATH before touching any infrastructure.
	# Every missing tool is reported (on stderr, so the message is not lost when
	# stdout is redirected) before exiting, so they can all be installed in one go.
	missing_prereq=0
	for cmd in aws eksctl kubectl helm; do
	  if ! command -v "$cmd" &>/dev/null; then
	    echo "ERROR: $cmd is not installed. Please install it first." >&2
	    missing_prereq=1
	  fi
	done
	if [ "$missing_prereq" -ne 0 ]; then
	  exit 1
	fi

	# --- Phase 1: Create EKS Cluster ---
	printf '\n%s\n' ">>> Phase 1: Creating EKS cluster..."
	# Create the cluster from the config file next to this script only when
	# eksctl cannot find a cluster with this name in the selected region.
	if ! eksctl get cluster --name "$CLUSTER_NAME" --region "$REGION" &>/dev/null; then
	  eksctl create cluster -f "$AWS_DIR/eksctl-cluster.yaml"
	  echo "Cluster created."
	else
	  echo "Cluster $CLUSTER_NAME already exists, skipping creation."
	fi

	# Refresh the local kubeconfig entry for this cluster.
	aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$REGION"
	echo "kubeconfig updated."

	# --- Phase 2: Deploy Sample Workloads ---
	printf '\n%s\n' ">>> Phase 2: Deploying sample workloads (payments, checkout, catalog, cart, auth)..."
	# Apply the workload manifest that sits next to this script, then list the
	# pods in the prod-sre namespace.
	workloads_manifest="$AWS_DIR/k8s-workloads.yaml"
	kubectl apply -f "$workloads_manifest"
	echo "Workloads deployed."
	kubectl get pods --namespace prod-sre

	# --- Phase 3: Create AMP Workspace ---
	# Look up the Amazon Managed Prometheus workspace by alias and create it when
	# absent. Leaves the workspace ID in AMP_WS_ID and its base URL in AMP_URL.
	echo ""
	echo ">>> Phase 3: Creating Amazon Managed Prometheus workspace..."
	AMP_ALIAS="antiatropos-metrics"

	# --alias filters by prefix, so the JMESPath query additionally demands an
	# exact alias match; otherwise a workspace such as "<alias>-dev" could be
	# picked up. A failed lookup is deliberately treated as "not found".
	AMP_WS_ID=$(aws amp list-workspaces \
	  --alias "$AMP_ALIAS" \
	  --region "$REGION" \
	  --query "workspaces[?alias=='$AMP_ALIAS'] | [0].workspaceId" \
	  --output text 2>/dev/null || echo "")

	# With --output text the CLI prints the literal "None" for a null result.
	if [[ -z "$AMP_WS_ID" || "$AMP_WS_ID" == "None" ]]; then
	  AMP_WS_ID=$(aws amp create-workspace \
	    --alias "$AMP_ALIAS" \
	    --region "$REGION" \
	    --query 'workspaceId' \
	    --output text)
	  echo "AMP workspace created: $AMP_WS_ID"
	else
	  echo "AMP workspace already exists: $AMP_WS_ID"
	fi

	AMP_URL="https://aps-workspaces.$REGION.amazonaws.com/workspaces/$AMP_WS_ID"
	echo "AMP URL: $AMP_URL"

	# --- Phase 4: Set up IRSA for Prometheus Agent ---
	# Creates the prometheus-sa service account in the monitoring namespace with
	# the AmazonPrometheusRemoteWriteAccess managed policy attached, unless the
	# service account is already present.
	# NOTE(review): this step presumably relies on the cluster's IAM OIDC provider
	# being enabled by eksctl-cluster.yaml — confirm.
	echo ""
	echo ">>> Phase 4: Setting up IRSA for Prometheus Agent..."
	if kubectl get serviceaccount prometheus-sa -n monitoring &>/dev/null; then
	  echo "prometheus-sa already exists."
	else
	  # --region is passed explicitly so eksctl targets the same region as every
	  # other step, even when AWS_REGION is unset and REGION came from the default.
	  eksctl create iamserviceaccount \
	    --cluster "$CLUSTER_NAME" \
	    --region "$REGION" \
	    --namespace monitoring \
	    --name prometheus-sa \
	    --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
	    --approve \
	    --override-existing-serviceaccounts
	  echo "prometheus-sa created."
	fi

	# --- Phase 5: Install Prometheus Agent ---
	# Installs the prometheus-agent Helm release into the monitoring namespace, or
	# upgrades it when the release already exists, pointing remote write at AMP_URL.
	echo ""
	echo ">>> Phase 5: Installing Prometheus Agent (remote-writes to AMP)..."
	# Failures of `helm repo add` are deliberately ignored (best effort).
	helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
	helm repo update

	# Options shared by the install and the upgrade path.
	# NOTE(review): the prometheus-community/prometheus chart appears to expose
	# remote write under server.remoteWrite, while prometheus.prometheusSpec.* is
	# the kube-prometheus-stack layout — confirm that this --set key takes effect.
	PROM_AGENT_ARGS=(
	  --namespace monitoring
	  -f "$AWS_DIR/prometheus-agent-values.yaml"
	  --set "prometheus.prometheusSpec.remoteWrite[0].url=$AMP_URL/api/v1/remote_write"
	)

	if helm status prometheus-agent -n monitoring &>/dev/null; then
	  echo "prometheus-agent already installed, upgrading..."
	  helm upgrade prometheus-agent prometheus-community/prometheus "${PROM_AGENT_ARGS[@]}"
	else
	  helm install prometheus-agent prometheus-community/prometheus \
	    --create-namespace "${PROM_AGENT_ARGS[@]}"
	  echo "prometheus-agent installed."
	fi

	# --- Phase 6: Install Self-Hosted Grafana on EKS ---
	# Publishes the dashboard JSON files as a Secret, installs (or upgrades) the
	# grafana Helm release in the monitoring namespace, waits briefly for the
	# rollout and prints access hints.
	echo ""
	echo ">>> Phase 6: Installing self-hosted Grafana on EKS..."

	# Add Grafana Helm repo; failures of `helm repo add` are deliberately ignored.
	helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
	helm repo update

	# Create a secret with the dashboard JSON files for Grafana to import.
	# The manifest is rendered client-side and piped into `kubectl apply`, so
	# re-running the script updates the existing secret.
	DASHBOARDS_DIR="$AWS_DIR/../../grafana/provisioning/dashboards/json"
	if [[ -d "$DASHBOARDS_DIR" ]]; then
	  echo "Creating dashboard secret from $DASHBOARDS_DIR..."
	  kubectl create secret generic antiatropos-grafana-dashboards \
	    --from-file=antiatropos-overview.json="$DASHBOARDS_DIR/antiatropos-overview.json" \
	    --from-file=antiatropos-live.json="$DASHBOARDS_DIR/antiatropos-live.json" \
	    --namespace monitoring \
	    --dry-run=client -o yaml \
	    | kubectl apply -f -
	  echo "Dashboard secret created."
	else
	  echo "Dashboard JSON directory not found at $DASHBOARDS_DIR, skipping."
	fi

	# Install Grafana
	GRAFANA_VALUES="$AWS_DIR/grafana-values.yaml"

	if helm status grafana -n monitoring &>/dev/null; then
	  echo "Grafana already installed, upgrading..."
	  helm upgrade grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
	else
	  helm install grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
	  echo "Grafana installed."
	fi

	# Wait for Grafana pod to be ready. Best effort: a timeout must not abort the
	# remaining phases.
	echo "Waiting for Grafana pod to be ready..."
	kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>/dev/null || true

	# Also best effort: without the fallback, a failed lookup (for example no pod
	# scheduled yet) would terminate the script under `set -e`.
	GRAFANA_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana \
	  -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
	echo "Grafana pod: $GRAFANA_POD"
	echo "To access Grafana: kubectl port-forward svc/grafana 3000 -n monitoring"
	# NOTE(review): these credentials are presumably configured in
	# grafana-values.yaml — confirm, and avoid a well-known password on shared clusters.
	echo "Login: admin / antiatropos"

	# --- Phase 7: Install Cluster Autoscaler ---
	# Installs the cluster-autoscaler Helm release into kube-system, or upgrades it
	# when the release already exists.
	echo ""
	echo ">>> Phase 7: Installing Cluster Autoscaler..."
	# Failures of `helm repo add` are deliberately ignored (best effort).
	helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>/dev/null || true
	helm repo update

	# Options shared by the install and the upgrade path.
	AUTOSCALER_ARGS=(
	  --namespace kube-system
	  -f "$AWS_DIR/cluster-autoscaler-values.yaml"
	)

	if helm status cluster-autoscaler -n kube-system &>/dev/null; then
	  echo "cluster-autoscaler already installed, upgrading..."
	  helm upgrade cluster-autoscaler autoscaler/cluster-autoscaler "${AUTOSCALER_ARGS[@]}"
	else
	  helm install cluster-autoscaler autoscaler/cluster-autoscaler "${AUTOSCALER_ARGS[@]}"
	  echo "cluster-autoscaler installed."
	fi

	# --- Phase 8: Generate Kubeconfig for HF Spaces ---
	printf '\n%s\n' ">>> Phase 8: Generating kubeconfig for HF Spaces..."
	# Hand over to the helper script that lives next to this one. It is executed
	# directly, so it must carry the executable bit.
	kubeconfig_generator="$AWS_DIR/generate-kubeconfig.sh"
	"$kubeconfig_generator"

	# --- Done ---
	# Final summary: AMP identifiers, Grafana access hints, the kubeconfig path,
	# and the settings to configure on the HF Space. One argument per output line.
	printf '%s\n' \
	  "" \
	  "==========================================" \
	  " AntiAtropos AWS Infrastructure Ready!" \
	  "==========================================" \
	  "" \
	  "AMP Workspace ID: $AMP_WS_ID" \
	  "AMP URL: $AMP_URL" \
	  "" \
	  "Grafana: Self-hosted on EKS (monitoring namespace)" \
	  " Access: kubectl port-forward svc/grafana 3000 -n monitoring" \
	  " Login: admin / antiatropos" \
	  " URL: http://localhost:3000" \
	  "" \
	  "Kubeconfig saved: $AWS_DIR/kubeconfig-antiatropos.yaml" \
	  "" \
	  "Next steps — configure your HF Space:" \
	  " 1. Set secret KUBECONFIG_CONTENT = base64 of kubeconfig-antiatropos.yaml" \
	  " 2. Set env var PROMETHEUS_URL = $AMP_URL" \
	  " 3. Set env var KUBECONFIG = /app/kubeconfig.yaml" \
	  " 4. Set env var ANTIATROPOS_ENV_MODE = live" \
	  " 5. Set env var ANTIATROPOS_MAX_REPLICAS = 6" \
	  " 6. Set env var ANTIATROPOS_WORKLOAD_MAP = (see OPERATIONS.md)" \
	  " 7. Add kubeconfig decode to deploy/entrypoint.sh (see OPERATIONS.md)"