div18 committed on
Commit ·
2244d0f
1
Parent(s): 923f89f
droplet
Browse files- deploy-local.ps1 +91 -0
- deploy/do/README.md +50 -0
- deploy/do/deploy-droplet-one-shot.sh +150 -0
- deploy/grafana-datasource-local.yaml +11 -0
- deploy/grafana-helm-values.yaml +46 -0
- deploy/local-laptop.yaml +10 -10
- deploy/prometheus-helm-values.yaml +13 -0
- deploy/prometheus-local.yml +35 -0
- simulator.py +4 -1
- start-grafana.ps1 +12 -9
- teardown-local.ps1 +28 -0
deploy-local.ps1
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos Local Cluster Deploy
|
| 2 |
+
# Deploys workloads, Prometheus, and Grafana on the Kind cluster.
|
| 3 |
+
# Grafana port-forward starts automatically at the end.
|
| 4 |
+
|
| 5 |
+
param(
|
| 6 |
+
[switch]$SkipPortForward,
|
| 7 |
+
[int]$GrafanaPort = 3000
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
Write-Host "=== AntiAtropos Local Deploy ===" -ForegroundColor Cyan
|
| 11 |
+
Write-Host ""
|
| 12 |
+
|
| 13 |
+
# --- 1. Check cluster ---
|
| 14 |
+
Write-Host "[1/5] Checking Kind cluster..." -ForegroundColor Yellow
|
| 15 |
+
$cluster = kubectl config current-context 2>$null
|
| 16 |
+
if ($cluster -notmatch "antiatropos") {
|
| 17 |
+
Write-Host "WARNING: Current context is '$cluster', expected 'kind-antiatropos-local'. Proceed anyway? [Y/n]"
|
| 18 |
+
$r = Read-Host
|
| 19 |
+
if ($r -eq 'n') { exit 1 }
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
# --- 2. Deploy workload pods ---
|
| 23 |
+
Write-Host "[2/5] Deploying workload pods..." -ForegroundColor Yellow
|
| 24 |
+
kubectl create ns prod-sre 2>&1 | Out-Null
|
| 25 |
+
kubectl create ns monitoring 2>&1 | Out-Null
|
| 26 |
+
kubectl apply -f "$PSScriptRoot\deploy\local-laptop.yaml"
|
| 27 |
+
Write-Host " Waiting for workloads to be ready..."
|
| 28 |
+
kubectl wait --for=condition=ready pod -l app --all -n prod-sre --timeout=120s 2>$null
|
| 29 |
+
Write-Host " Workloads ready."
|
| 30 |
+
|
| 31 |
+
# --- 3. Deploy Prometheus ---
|
| 32 |
+
Write-Host "[3/5] Deploying Prometheus..." -ForegroundColor Yellow
|
| 33 |
+
$promRelease = helm list -n monitoring -q 2>$null | Select-String "prometheus"
|
| 34 |
+
if ($promRelease) {
|
| 35 |
+
helm upgrade prometheus prometheus-community/prometheus -n monitoring -f "$PSScriptRoot\deploy\prometheus-helm-values.yaml"
|
| 36 |
+
} else {
|
| 37 |
+
helm install prometheus prometheus-community/prometheus -n monitoring -f "$PSScriptRoot\deploy\prometheus-helm-values.yaml"
|
| 38 |
+
}
|
| 39 |
+
Write-Host " Waiting for Prometheus server..."
|
| 40 |
+
kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=prometheus" -n monitoring --timeout=120s 2>$null
|
| 41 |
+
Write-Host " Prometheus ready."
|
| 42 |
+
|
| 43 |
+
# --- 4. Deploy Grafana ---
|
| 44 |
+
Write-Host "[4/5] Deploying Grafana..." -ForegroundColor Yellow
|
| 45 |
+
# Update dashboard ConfigMap
|
| 46 |
+
kubectl delete configmap grafana-dashboards -n monitoring 2>$null
|
| 47 |
+
kubectl create configmap grafana-dashboards -n monitoring --from-file="$PSScriptRoot\deploy\grafana\provisioning\dashboards\json\"
|
| 48 |
+
|
| 49 |
+
$grafRelease = helm list -n monitoring -q 2>$null | Select-String "grafana"
|
| 50 |
+
if ($grafRelease) {
|
| 51 |
+
helm upgrade grafana grafana/grafana -n monitoring -f "$PSScriptRoot\deploy\grafana-helm-values.yaml"
|
| 52 |
+
} else {
|
| 53 |
+
helm install grafana grafana/grafana -n monitoring -f "$PSScriptRoot\deploy\grafana-helm-values.yaml"
|
| 54 |
+
}
|
| 55 |
+
Write-Host " Waiting for Grafana..."
|
| 56 |
+
kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=grafana" -n monitoring --timeout=120s 2>$null
|
| 57 |
+
Write-Host " Grafana ready."
|
| 58 |
+
|
| 59 |
+
# --- 5. Start Grafana port-forward ---
|
| 60 |
+
Write-Host "[5/5] Grafana port-forward..." -ForegroundColor Yellow
|
| 61 |
+
if (-not $SkipPortForward) {
|
| 62 |
+
# Kill any existing port-forward on the same port
|
| 63 |
+
$existing = Get-NetTCPConnection -LocalPort $GrafanaPort -ErrorAction SilentlyContinue 2>$null
|
| 64 |
+
if ($existing) {
|
| 65 |
+
$ownerPid = $existing.OwningProcess  # not $pid: that is PowerShell's read-only automatic variable holding this session's own process ID
|
| 66 |
+
Stop-Process -Id $ownerPid -Force -ErrorAction SilentlyContinue 2>$null
|
| 67 |
+
Start-Sleep -Seconds 1
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
Write-Host " Starting port-forward on localhost:$GrafanaPort..."
|
| 71 |
+
$proc = Start-Process -PassThru -NoNewWindow kubectl -ArgumentList "port-forward","-n","monitoring","svc/grafana","${GrafanaPort}:80"
|
| 72 |
+
|
| 73 |
+
Start-Sleep -Seconds 2
|
| 74 |
+
# Verify the port-forward is alive
|
| 75 |
+
try {
|
| 76 |
+
$null = Invoke-WebRequest -Uri "http://localhost:$GrafanaPort/api/health" -UseBasicParsing -TimeoutSec 5
|
| 77 |
+
Write-Host ""
|
| 78 |
+
Write-Host "=== Deploy Complete ===" -ForegroundColor Green
|
| 79 |
+
Write-Host " Grafana: http://localhost:$GrafanaPort (admin / antiatropos)"
|
| 80 |
+
Write-Host " Dashboards: AntiAtropos Overview, AntiAtropos Live Control Plane"
|
| 81 |
+
Write-Host " Port-forward PID: $($proc.Id)"
|
| 82 |
+
Write-Host ""
|
| 83 |
+
Write-Host "To stop port-forward: Stop-Process -Id $($proc.Id)"
|
| 84 |
+
} catch {
|
| 85 |
+
Write-Host "WARNING: Port-forward started but Grafana not reachable yet. Try: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
|
| 86 |
+
}
|
| 87 |
+
} else {
|
| 88 |
+
Write-Host ""
|
| 89 |
+
Write-Host "=== Deploy Complete ===" -ForegroundColor Green
|
| 90 |
+
Write-Host " To access Grafana: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
|
| 91 |
+
}
|
deploy/do/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DigitalOcean Droplet one-shot deploy
|
| 2 |
+
|
| 3 |
+
This deploy flow is for a single Ubuntu Droplet running:
|
| 4 |
+
- k3s (single-node Kubernetes)
|
| 5 |
+
- AntiAtropos sample workloads (`prod-sre`)
|
| 6 |
+
- Prometheus + Grafana (`monitoring`)
|
| 7 |
+
- FastAPI control server (`antiatropos-fastapi` systemd service)
|
| 8 |
+
|
| 9 |
+
## Run
|
| 10 |
+
|
| 11 |
+
From repository root on the Droplet:
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
sudo bash deploy/do/deploy-droplet-one-shot.sh
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
Optional overrides:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## What the script configures
|
| 24 |
+
|
| 25 |
+
- k3s kubelet with `max-pods=250`
|
| 26 |
+
- Env file at `.env.droplet` (created only if it does not already exist) with:
|
| 27 |
+
- `ANTIATROPOS_ENV_MODE=live`
|
| 28 |
+
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
| 29 |
+
- `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
|
| 30 |
+
- Systemd service:
|
| 31 |
+
- Name: `antiatropos-fastapi`
|
| 32 |
+
- Exec: `uvicorn server.app:app --host 0.0.0.0 --port 8000` (defaults; override with `FASTAPI_HOST` / `FASTAPI_PORT`)
|
| 33 |
+
|
| 34 |
+
## Verify
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
systemctl status antiatropos-fastapi --no-pager
|
| 38 |
+
curl http://127.0.0.1:8000/config/runtime
|
| 39 |
+
kubectl get deploy -n prod-sre
|
| 40 |
+
kubectl get pods -n monitoring
|
| 41 |
+
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Agent call example
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
curl -X POST http://127.0.0.1:8000/step \
|
| 48 |
+
-H "Content-Type: application/json" \
|
| 49 |
+
-d '{"action_type":"SCALE_UP","target_node_id":"node-3","parameter":0.6}'
|
| 50 |
+
```
|
deploy/do/deploy-droplet-one-shot.sh
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# One-shot deploy for a single DigitalOcean Droplet:
|
| 5 |
+
# - Installs k3s with kubelet max-pods=250
|
| 6 |
+
# - Deploys workloads + Prometheus + Grafana
|
| 7 |
+
# - Creates env file for live Kubernetes scaling
|
| 8 |
+
# - Starts FastAPI server via systemd (antiatropos-fastapi)
|
| 9 |
+
|
| 10 |
+
if [[ "${EUID}" -ne 0 ]]; then
|
| 11 |
+
echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
|
| 12 |
+
exit 1
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
|
| 16 |
+
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
|
| 17 |
+
FASTAPI_PORT="${FASTAPI_PORT:-8000}"
|
| 18 |
+
FASTAPI_HOST="${FASTAPI_HOST:-0.0.0.0}"
|
| 19 |
+
K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
|
| 20 |
+
MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
|
| 21 |
+
PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
|
| 22 |
+
ENV_FILE="${ENV_FILE:-${REPO_DIR}/.env.droplet}"
|
| 23 |
+
MIN_REPLICAS="${MIN_REPLICAS:-1}"
|
| 24 |
+
MAX_REPLICAS="${MAX_REPLICAS:-250}"
|
| 25 |
+
SCALE_STEP="${SCALE_STEP:-3}"
|
| 26 |
+
# Default node -> Deployment mapping, kept in its own single-quoted variable.
# Bash ends a ${var:-word} expansion at the first unquoted '}', so inlining
# JSON as the default truncates it there and appends the remainder literally
# (which also corrupts any caller-supplied WORKLOAD_MAP).
DEFAULT_WORKLOAD_MAP='{"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}'
WORKLOAD_MAP="${WORKLOAD_MAP:-${DEFAULT_WORKLOAD_MAP}}"
|
| 27 |
+
|
| 28 |
+
echo "=== AntiAtropos Droplet One-Shot Deploy ==="
|
| 29 |
+
echo "Repo: ${REPO_DIR}"
|
| 30 |
+
echo "Kubeconfig: ${KUBECONFIG_PATH}"
|
| 31 |
+
echo "FastAPI: ${FASTAPI_HOST}:${FASTAPI_PORT}"
|
| 32 |
+
echo ""
|
| 33 |
+
|
| 34 |
+
if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
|
| 35 |
+
echo "ERROR: deploy/local-laptop.yaml not found. Run from AntiAtropos checkout."
|
| 36 |
+
exit 1
|
| 37 |
+
fi
|
| 38 |
+
|
| 39 |
+
export DEBIAN_FRONTEND=noninteractive
|
| 40 |
+
apt-get update
|
| 41 |
+
apt-get install -y curl ca-certificates gnupg lsb-release python3 python3-venv python3-pip
|
| 42 |
+
|
| 43 |
+
if ! command -v kubectl >/dev/null 2>&1; then
|
| 44 |
+
echo "Installing k3s (includes kubectl)..."
|
| 45 |
+
curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg=max-pods=250
|
| 46 |
+
else
|
| 47 |
+
echo "k3s/kubectl already present; skipping k3s install."
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
if ! command -v helm >/dev/null 2>&1; then
|
| 51 |
+
echo "Installing Helm..."
|
| 52 |
+
curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
+
export KUBECONFIG="${KUBECONFIG_PATH}"
|
| 56 |
+
|
| 57 |
+
echo "Waiting for Kubernetes node to be Ready..."
|
| 58 |
+
kubectl wait --for=condition=Ready node --all --timeout=180s
|
| 59 |
+
|
| 60 |
+
kubectl create ns "${K8S_NAMESPACE}" >/dev/null 2>&1 || true
|
| 61 |
+
kubectl create ns "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
|
| 62 |
+
|
| 63 |
+
echo "Deploying AntiAtropos workloads..."
|
| 64 |
+
kubectl apply -f "${REPO_DIR}/deploy/local-laptop.yaml"
|
| 65 |
+
|
| 66 |
+
echo "Installing/upgrading Prometheus + Grafana..."
|
| 67 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
|
| 68 |
+
helm repo add grafana https://grafana.github.io/helm-charts >/dev/null 2>&1 || true
|
| 69 |
+
helm repo update
|
| 70 |
+
|
| 71 |
+
helm upgrade --install prometheus prometheus-community/prometheus \
|
| 72 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 73 |
+
-f "${REPO_DIR}/deploy/prometheus-helm-values.yaml"
|
| 74 |
+
|
| 75 |
+
if [[ -d "${REPO_DIR}/deploy/grafana/provisioning/dashboards/json" ]]; then
|
| 76 |
+
kubectl delete configmap grafana-dashboards -n "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
|
| 77 |
+
kubectl create configmap grafana-dashboards \
|
| 78 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 79 |
+
--from-file="${REPO_DIR}/deploy/grafana/provisioning/dashboards/json/"
|
| 80 |
+
fi
|
| 81 |
+
|
| 82 |
+
helm upgrade --install grafana grafana/grafana \
|
| 83 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 84 |
+
-f "${REPO_DIR}/deploy/grafana-helm-values.yaml"
|
| 85 |
+
|
| 86 |
+
# Write the service env file once; an existing file is kept so manual edits survive re-runs. KUBECONFIG and the namespace follow KUBECONFIG_PATH / K8S_NAMESPACE so overrides reach the service.
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 87 |
+
cat > "${ENV_FILE}" <<EOF
|
| 88 |
+
ANTIATROPOS_ENV_MODE=live
|
| 89 |
+
KUBECONFIG=${KUBECONFIG_PATH}
|
| 90 |
+
ANTIATROPOS_K8S_NAMESPACE=${K8S_NAMESPACE}
|
| 91 |
+
ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
|
| 92 |
+
ANTIATROPOS_MAX_REPLICAS=${MAX_REPLICAS}
|
| 93 |
+
ANTIATROPOS_SCALE_STEP=${SCALE_STEP}
|
| 94 |
+
ANTIATROPOS_WORKLOAD_MAP=${WORKLOAD_MAP}
|
| 95 |
+
EOF
|
| 96 |
+
echo "Created ${ENV_FILE}"
|
| 97 |
+
else
|
| 98 |
+
echo "Using existing ${ENV_FILE}"
|
| 99 |
+
fi
|
| 100 |
+
|
| 101 |
+
echo "Preparing Python environment..."
|
| 102 |
+
python3 -m venv "${PY_VENV_DIR}"
|
| 103 |
+
"${PY_VENV_DIR}/bin/python" -m pip install --upgrade pip
|
| 104 |
+
"${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
|
| 105 |
+
"${PY_VENV_DIR}/bin/pip" install -e "${REPO_DIR}"
|
| 106 |
+
|
| 107 |
+
cat > /etc/systemd/system/antiatropos-fastapi.service <<EOF
|
| 108 |
+
[Unit]
|
| 109 |
+
Description=AntiAtropos FastAPI Server
|
| 110 |
+
After=network-online.target k3s.service
|
| 111 |
+
Wants=network-online.target
|
| 112 |
+
|
| 113 |
+
[Service]
|
| 114 |
+
Type=simple
|
| 115 |
+
User=root
|
| 116 |
+
WorkingDirectory=${REPO_DIR}
|
| 117 |
+
EnvironmentFile=${ENV_FILE}
|
| 118 |
+
ExecStart=${PY_VENV_DIR}/bin/uvicorn server.app:app --host ${FASTAPI_HOST} --port ${FASTAPI_PORT}
|
| 119 |
+
Restart=always
|
| 120 |
+
RestartSec=3
|
| 121 |
+
|
| 122 |
+
[Install]
|
| 123 |
+
WantedBy=multi-user.target
|
| 124 |
+
EOF
|
| 125 |
+
|
| 126 |
+
systemctl daemon-reload
|
| 127 |
+
systemctl enable --now antiatropos-fastapi
|
| 128 |
+
|
| 129 |
+
echo ""
|
| 130 |
+
echo "Waiting for app readiness..."
|
| 131 |
+
for _ in {1..30}; do
|
| 132 |
+
if curl -fsS "http://127.0.0.1:${FASTAPI_PORT}/config/runtime" >/dev/null 2>&1; then
|
| 133 |
+
break
|
| 134 |
+
fi
|
| 135 |
+
sleep 2
|
| 136 |
+
done
|
| 137 |
+
|
| 138 |
+
echo ""
|
| 139 |
+
echo "=== Deploy Complete ==="
|
| 140 |
+
echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
|
| 141 |
+
echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
|
| 142 |
+
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
| 143 |
+
echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
|
| 144 |
+
echo ""
|
| 145 |
+
echo "Service status command:"
|
| 146 |
+
echo " systemctl status antiatropos-fastapi --no-pager"
|
| 147 |
+
echo ""
|
| 148 |
+
echo "If needed, edit env and restart:"
|
| 149 |
+
echo " ${ENV_FILE}"
|
| 150 |
+
echo " systemctl restart antiatropos-fastapi"
|
deploy/grafana-datasource-local.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana datasource provisioning - points to in-cluster Prometheus
|
| 2 |
+
apiVersion: 1
|
| 3 |
+
|
| 4 |
+
datasources:
|
| 5 |
+
- name: Prometheus
|
| 6 |
+
uid: PBFA97CFB590B2093
|
| 7 |
+
type: prometheus
|
| 8 |
+
access: proxy
|
| 9 |
+
url: http://prometheus-server.monitoring.svc.cluster.local
|
| 10 |
+
isDefault: true
|
| 11 |
+
editable: true
|
deploy/grafana-helm-values.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana self-hosted on Kind - Simplified dashboard + datasource setup
|
| 2 |
+
|
| 3 |
+
adminUser: admin
|
| 4 |
+
adminPassword: antiatropos
|
| 5 |
+
|
| 6 |
+
service:
|
| 7 |
+
type: ClusterIP
|
| 8 |
+
port: 80
|
| 9 |
+
|
| 10 |
+
persistence:
|
| 11 |
+
enabled: false
|
| 12 |
+
|
| 13 |
+
# Datasource provisioning - mount as separate file
|
| 14 |
+
datasources:
|
| 15 |
+
datasources.yaml:
|
| 16 |
+
apiVersion: 1
|
| 17 |
+
datasources:
|
| 18 |
+
- name: Prometheus
|
| 19 |
+
uid: PBFA97CFB590B2093
|
| 20 |
+
type: prometheus
|
| 21 |
+
access: proxy
|
| 22 |
+
url: http://prometheus-server.monitoring.svc.cluster.local
|
| 23 |
+
isDefault: true
|
| 24 |
+
editable: true
|
| 25 |
+
|
| 26 |
+
# Dashboard provider config
|
| 27 |
+
dashboardProviders:
|
| 28 |
+
dashboardproviders.yaml:
|
| 29 |
+
apiVersion: 1
|
| 30 |
+
providers:
|
| 31 |
+
- name: AntiAtropos
|
| 32 |
+
orgId: 1
|
| 33 |
+
folder: AntiAtropos
|
| 34 |
+
type: file
|
| 35 |
+
disableDeletion: false
|
| 36 |
+
editable: true
|
| 37 |
+
updateIntervalSeconds: 30
|
| 38 |
+
options:
|
| 39 |
+
path: /var/lib/grafana/dashboards/antiatropos
|
| 40 |
+
|
| 41 |
+
# Mount dashboard JSONs from ConfigMap
|
| 42 |
+
extraConfigmapMounts:
|
| 43 |
+
- name: grafana-dashboards
|
| 44 |
+
configMap: grafana-dashboards
|
| 45 |
+
mountPath: /var/lib/grafana/dashboards/antiatropos
|
| 46 |
+
readOnly: true
|
deploy/local-laptop.yaml
CHANGED
|
@@ -20,7 +20,7 @@ spec:
|
|
| 20 |
annotations:
|
| 21 |
prometheus.io/scrape: "true"
|
| 22 |
prometheus.io/port: "8080"
|
| 23 |
-
prometheus.io/path: "/metrics"
|
| 24 |
spec:
|
| 25 |
containers:
|
| 26 |
- name: auth
|
|
@@ -71,7 +71,7 @@ spec:
|
|
| 71 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 72 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 73 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 74 |
-
} > /www/metrics
|
| 75 |
sleep 2
|
| 76 |
done
|
| 77 |
---
|
|
@@ -92,7 +92,7 @@ spec:
|
|
| 92 |
annotations:
|
| 93 |
prometheus.io/scrape: "true"
|
| 94 |
prometheus.io/port: "8080"
|
| 95 |
-
prometheus.io/path: "/metrics"
|
| 96 |
spec:
|
| 97 |
containers:
|
| 98 |
- name: cart
|
|
@@ -143,7 +143,7 @@ spec:
|
|
| 143 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 144 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 145 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 146 |
-
} > /www/metrics
|
| 147 |
sleep 2
|
| 148 |
done
|
| 149 |
---
|
|
@@ -164,7 +164,7 @@ spec:
|
|
| 164 |
annotations:
|
| 165 |
prometheus.io/scrape: "true"
|
| 166 |
prometheus.io/port: "8080"
|
| 167 |
-
prometheus.io/path: "/metrics"
|
| 168 |
spec:
|
| 169 |
containers:
|
| 170 |
- name: catalog
|
|
@@ -215,7 +215,7 @@ spec:
|
|
| 215 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 216 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 217 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 218 |
-
} > /www/metrics
|
| 219 |
sleep 2
|
| 220 |
done
|
| 221 |
---
|
|
@@ -236,7 +236,7 @@ spec:
|
|
| 236 |
annotations:
|
| 237 |
prometheus.io/scrape: "true"
|
| 238 |
prometheus.io/port: "8080"
|
| 239 |
-
prometheus.io/path: "/metrics"
|
| 240 |
spec:
|
| 241 |
containers:
|
| 242 |
- name: checkout
|
|
@@ -287,7 +287,7 @@ spec:
|
|
| 287 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 288 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 289 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 290 |
-
} > /www/metrics
|
| 291 |
sleep 2
|
| 292 |
done
|
| 293 |
---
|
|
@@ -308,7 +308,7 @@ spec:
|
|
| 308 |
annotations:
|
| 309 |
prometheus.io/scrape: "true"
|
| 310 |
prometheus.io/port: "8080"
|
| 311 |
-
prometheus.io/path: "/metrics"
|
| 312 |
spec:
|
| 313 |
containers:
|
| 314 |
- name: payments
|
|
@@ -359,7 +359,7 @@ spec:
|
|
| 359 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 360 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 361 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 362 |
-
} > /www/metrics
|
| 363 |
sleep 2
|
| 364 |
done
|
| 365 |
|
|
|
|
| 20 |
annotations:
|
| 21 |
prometheus.io/scrape: "true"
|
| 22 |
prometheus.io/port: "8080"
|
| 23 |
+
prometheus.io/path: "/metrics.txt"
|
| 24 |
spec:
|
| 25 |
containers:
|
| 26 |
- name: auth
|
|
|
|
| 71 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 72 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 73 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 74 |
+
} > /www/metrics.txt
|
| 75 |
sleep 2
|
| 76 |
done
|
| 77 |
---
|
|
|
|
| 92 |
annotations:
|
| 93 |
prometheus.io/scrape: "true"
|
| 94 |
prometheus.io/port: "8080"
|
| 95 |
+
prometheus.io/path: "/metrics.txt"
|
| 96 |
spec:
|
| 97 |
containers:
|
| 98 |
- name: cart
|
|
|
|
| 143 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 144 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 145 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 146 |
+
} > /www/metrics.txt
|
| 147 |
sleep 2
|
| 148 |
done
|
| 149 |
---
|
|
|
|
| 164 |
annotations:
|
| 165 |
prometheus.io/scrape: "true"
|
| 166 |
prometheus.io/port: "8080"
|
| 167 |
+
prometheus.io/path: "/metrics.txt"
|
| 168 |
spec:
|
| 169 |
containers:
|
| 170 |
- name: catalog
|
|
|
|
| 215 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 216 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 217 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 218 |
+
} > /www/metrics.txt
|
| 219 |
sleep 2
|
| 220 |
done
|
| 221 |
---
|
|
|
|
| 236 |
annotations:
|
| 237 |
prometheus.io/scrape: "true"
|
| 238 |
prometheus.io/port: "8080"
|
| 239 |
+
prometheus.io/path: "/metrics.txt"
|
| 240 |
spec:
|
| 241 |
containers:
|
| 242 |
- name: checkout
|
|
|
|
| 287 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 288 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 289 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 290 |
+
} > /www/metrics.txt
|
| 291 |
sleep 2
|
| 292 |
done
|
| 293 |
---
|
|
|
|
| 308 |
annotations:
|
| 309 |
prometheus.io/scrape: "true"
|
| 310 |
prometheus.io/port: "8080"
|
| 311 |
+
prometheus.io/path: "/metrics.txt"
|
| 312 |
spec:
|
| 313 |
containers:
|
| 314 |
- name: payments
|
|
|
|
| 359 |
echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
|
| 360 |
echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
|
| 361 |
echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
|
| 362 |
+
} > /www/metrics.txt
|
| 363 |
sleep 2
|
| 364 |
done
|
| 365 |
|
deploy/prometheus-helm-values.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
server:
|
| 2 |
+
global:
|
| 3 |
+
scrape_interval: 15s
|
| 4 |
+
evaluation_interval: 15s
|
| 5 |
+
|
| 6 |
+
extraScrapeConfigs: |
|
| 7 |
+
- job_name: antiatropos-fastapi
|
| 8 |
+
scrape_protocols:
|
| 9 |
+
- PrometheusText1.0.0
|
| 10 |
+
- PrometheusText0.0.4
|
| 11 |
+
metrics_path: /metrics
|
| 12 |
+
static_configs:
|
| 13 |
+
- targets: ['host.docker.internal:8000']
|
deploy/prometheus-local.yml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global:
|
| 2 |
+
scrape_interval: 15s
|
| 3 |
+
evaluation_interval: 15s
|
| 4 |
+
scrape_protocols:
|
| 5 |
+
- OpenMetricsText1.0.0
|
| 6 |
+
- OpenMetricsText0.0.1
|
| 7 |
+
- PrometheusText1.0.0
|
| 8 |
+
- PrometheusText0.0.4
|
| 9 |
+
|
| 10 |
+
scrape_configs:
|
| 11 |
+
- job_name: kubernetes-pods
|
| 12 |
+
scrape_protocols:
|
| 13 |
+
- PrometheusText1.0.0
|
| 14 |
+
- PrometheusText0.0.4
|
| 15 |
+
kubernetes_sd_configs:
|
| 16 |
+
- role: pod
|
| 17 |
+
relabel_configs:
|
| 18 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
| 19 |
+
action: keep
|
| 20 |
+
regex: true
|
| 21 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
| 22 |
+
action: replace
|
| 23 |
+
target_label: __metrics_path__
|
| 24 |
+
regex: (.+)
|
| 25 |
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
| 26 |
+
action: replace
|
| 27 |
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 28 |
+
replacement: $1:$2
|
| 29 |
+
target_label: __address__
|
| 30 |
+
- source_labels: [__meta_kubernetes_namespace]
|
| 31 |
+
target_label: namespace
|
| 32 |
+
- source_labels: [__meta_kubernetes_pod_name]
|
| 33 |
+
target_label: pod
|
| 34 |
+
- source_labels: [__meta_kubernetes_pod_label_app]
|
| 35 |
+
target_label: app
|
simulator.py
CHANGED
|
@@ -64,8 +64,11 @@ CRITICAL_NODES: list[str] = ["node-0", "node-1", "node-2"]
|
|
| 64 |
|
| 65 |
# VIP / business-critical node weights.
|
| 66 |
# node-0 is the payment portal, so its queue growth or failure matters more.
|
|
|
|
|
|
|
|
|
|
| 67 |
VIP_NODE_WEIGHTS: dict[str, float] = {
|
| 68 |
-
"node-0":
|
| 69 |
}
|
| 70 |
|
| 71 |
|
|
|
|
| 64 |
|
| 65 |
# VIP / business-critical node weights.
|
| 66 |
# node-0 is the payment portal, so its queue growth or failure matters more.
|
| 67 |
+
# Reduced from 4.0 → 2.0 to prevent reward gradient from creating
|
| 68 |
+
# a local optimum where the agent only scales node-0.
|
| 69 |
+
# At 2×, node-0 is still prioritized but other nodes remain viable targets.
|
| 70 |
VIP_NODE_WEIGHTS: dict[str, float] = {
|
| 71 |
+
"node-0": 2.0,
|
| 72 |
}
|
| 73 |
|
| 74 |
|
start-grafana.ps1
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
|
| 6 |
-
|
| 7 |
-
-v "$PWD\deploy\grafana\provisioning:/etc/grafana/provisioning:ro" `
|
| 8 |
-
-e GF_AUTH_ANONYMOUS_ENABLED=true `
|
| 9 |
-
-e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin `
|
| 10 |
-
grafana/grafana:latest | Out-Null
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Start Grafana port-forward from Kind cluster
|
| 2 |
+
# Run 'deploy-local.ps1' first to ensure Grafana is deployed.
|
| 3 |
|
| 4 |
+
$port = 3000
|
| 5 |
|
| 6 |
+
Write-Host "Starting Grafana port-forward on localhost:$port..."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
# Kill any existing on that port
|
| 9 |
+
$existing = Get-NetTCPConnection -LocalPort $port -ErrorAction SilentlyContinue
|
| 10 |
+
if ($existing) {
|
| 11 |
+
Stop-Process -Id $existing.OwningProcess -Force -ErrorAction SilentlyContinue
|
| 12 |
+
Start-Sleep -Seconds 1
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
kubectl port-forward -n monitoring svc/grafana ${port}:80
|
teardown-local.ps1
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos Local Cluster Teardown
|
| 2 |
+
# Removes workloads, Prometheus, and Grafana. Stops port-forwards.
|
| 3 |
+
|
| 4 |
+
Write-Host "=== AntiAtropos Local Teardown ===" -ForegroundColor Cyan
|
| 5 |
+
|
| 6 |
+
# --- Stop port-forward ---
|
| 7 |
+
Write-Host "[1/3] Stopping port-forwards..." -ForegroundColor Yellow
|
| 8 |
+
$connections = Get-NetTCPConnection -LocalPort 3000 -ErrorAction SilentlyContinue
|
| 9 |
+
if ($connections) {
|
| 10 |
+
$connections | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }
|
| 11 |
+
Write-Host " Stopped port-forward on :3000"
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
# --- Uninstall Helm releases ---
|
| 15 |
+
Write-Host "[2/3] Uninstalling Helm releases..." -ForegroundColor Yellow
|
| 16 |
+
helm uninstall grafana -n monitoring 2>&1 | Out-Null
|
| 17 |
+
Write-Host " Grafana uninstalled."
|
| 18 |
+
helm uninstall prometheus -n monitoring 2>&1 | Out-Null
|
| 19 |
+
Write-Host " Prometheus uninstalled."
|
| 20 |
+
|
| 21 |
+
# --- Delete namespaces ---
|
| 22 |
+
Write-Host "[3/3] Deleting namespaces..." -ForegroundColor Yellow
|
| 23 |
+
kubectl delete ns prod-sre monitoring 2>&1 | Out-Null
|
| 24 |
+
|
| 25 |
+
Start-Sleep -Seconds 3
|
| 26 |
+
Write-Host ""
|
| 27 |
+
Write-Host "=== Teardown Complete ===" -ForegroundColor Green
|
| 28 |
+
kubectl get ns
|