div18 committed on
Commit
2244d0f
·
1 Parent(s): 923f89f
deploy-local.ps1 ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos Local Cluster Deploy
2
+ # Deploys workloads, Prometheus, and Grafana on the Kind cluster.
3
+ # Grafana port-forward starts automatically at the end.
4
+
5
+ param(
6
+ [switch]$SkipPortForward,
7
+ [int]$GrafanaPort = 3000
8
+ )
9
+
10
+ Write-Host "=== AntiAtropos Local Deploy ===" -ForegroundColor Cyan
11
+ Write-Host ""
12
+
13
+ # --- 1. Check cluster ---
14
+ Write-Host "[1/5] Checking Kind cluster..." -ForegroundColor Yellow
15
+ $cluster = kubectl config current-context 2>$null
16
+ if ($cluster -notmatch "antiatropos") {
17
+ Write-Host "WARNING: Current context is '$cluster', expected 'kind-antiatropos-local'. Proceed anyway? [Y/n]"
18
+ $r = Read-Host
19
+ if ($r -eq 'n') { exit 1 }
20
+ }
21
+
22
+ # --- 2. Deploy workload pods ---
23
+ Write-Host "[2/5] Deploying workload pods..." -ForegroundColor Yellow
24
+ kubectl create ns prod-sre 2>&1 | Out-Null
25
+ kubectl create ns monitoring 2>&1 | Out-Null
26
+ kubectl apply -f "$PSScriptRoot\deploy\local-laptop.yaml"
27
+ Write-Host " Waiting for workloads to be ready..."
28
+ kubectl wait --for=condition=ready pod -l app -n prod-sre --timeout=120s 2>$null  # select by label only: kubectl rejects a selector combined with --all
29
+ if ($LASTEXITCODE -eq 0) { Write-Host " Workloads ready." } else { Write-Host " WARNING: workloads not ready after 120s (kubectl wait exit code $LASTEXITCODE)." -ForegroundColor Red }  # only claim readiness when the wait succeeded
30
+
31
+ # --- 3. Deploy Prometheus ---
32
+ Write-Host "[3/5] Deploying Prometheus..." -ForegroundColor Yellow
33
+ $promRelease = helm list -n monitoring -q 2>$null | Select-String "prometheus"
34
+ if ($promRelease) {
35
+ helm upgrade prometheus prometheus-community/prometheus -n monitoring -f "$PSScriptRoot\deploy\prometheus-helm-values.yaml"
36
+ } else {
37
+ helm install prometheus prometheus-community/prometheus -n monitoring -f "$PSScriptRoot\deploy\prometheus-helm-values.yaml"
38
+ }
39
+ Write-Host " Waiting for Prometheus server..."
40
+ kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=prometheus" -n monitoring --timeout=120s 2>$null
41
+ if ($LASTEXITCODE -eq 0) { Write-Host " Prometheus ready." } else { Write-Host " WARNING: Prometheus pods not ready after 120s (kubectl wait exit code $LASTEXITCODE)." -ForegroundColor Red }  # only claim readiness when the wait succeeded
42
+
43
+ # --- 4. Deploy Grafana ---
44
+ Write-Host "[4/5] Deploying Grafana..." -ForegroundColor Yellow
45
+ # Update dashboard ConfigMap
46
+ kubectl delete configmap grafana-dashboards -n monitoring 2>$null
47
+ kubectl create configmap grafana-dashboards -n monitoring --from-file="$PSScriptRoot\deploy\grafana\provisioning\dashboards\json\"
48
+
49
+ $grafRelease = helm list -n monitoring -q 2>$null | Select-String "grafana"
50
+ if ($grafRelease) {
51
+ helm upgrade grafana grafana/grafana -n monitoring -f "$PSScriptRoot\deploy\grafana-helm-values.yaml"
52
+ } else {
53
+ helm install grafana grafana/grafana -n monitoring -f "$PSScriptRoot\deploy\grafana-helm-values.yaml"
54
+ }
55
+ Write-Host " Waiting for Grafana..."
56
+ kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=grafana" -n monitoring --timeout=120s 2>$null
57
+ if ($LASTEXITCODE -eq 0) { Write-Host " Grafana ready." } else { Write-Host " WARNING: Grafana pods not ready after 120s (kubectl wait exit code $LASTEXITCODE)." -ForegroundColor Red }  # only claim readiness when the wait succeeded
58
+
59
+ # --- 5. Start Grafana port-forward ---
60
+ Write-Host "[5/5] Grafana port-forward..." -ForegroundColor Yellow
61
+ if (-not $SkipPortForward) {
62
+ # Kill any existing port-forward on the same port
63
+ $existing = Get-NetTCPConnection -LocalPort $GrafanaPort -ErrorAction SilentlyContinue 2>$null
64
+ if ($existing) {
65
+ $ownerIds = $existing.OwningProcess | Sort-Object -Unique  # do not use $pid here: it is the read-only automatic variable $PID (this script's own process)
66
+ Stop-Process -Id $ownerIds -Force -ErrorAction SilentlyContinue 2>$null
67
+ Start-Sleep -Seconds 1
68
+ }
69
+
70
+ Write-Host " Starting port-forward on localhost:$GrafanaPort..."
71
+ $proc = Start-Process -PassThru -NoNewWindow kubectl -ArgumentList "port-forward","-n","monitoring","svc/grafana","${GrafanaPort}:80"
72
+
73
+ Start-Sleep -Seconds 2
74
+ # Verify the port-forward is alive
75
+ try {
76
+ $null = Invoke-WebRequest -Uri "http://localhost:$GrafanaPort/api/health" -UseBasicParsing -TimeoutSec 5
77
+ Write-Host ""
78
+ Write-Host "=== Deploy Complete ===" -ForegroundColor Green
79
+ Write-Host " Grafana: http://localhost:$GrafanaPort (admin / antiatropos)"
80
+ Write-Host " Dashboards: AntiAtropos Overview, AntiAtropos Live Control Plane"
81
+ Write-Host " Port-forward PID: $($proc.Id)"
82
+ Write-Host ""
83
+ Write-Host "To stop port-forward: Stop-Process -Id $($proc.Id)"
84
+ } catch {
85
+ Write-Host "WARNING: Port-forward started but Grafana not reachable yet. Try: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
86
+ }
87
+ } else {
88
+ Write-Host ""
89
+ Write-Host "=== Deploy Complete ===" -ForegroundColor Green
90
+ Write-Host " To access Grafana: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
91
+ }
deploy/do/README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DigitalOcean Droplet one-shot deploy
2
+
3
+ This deploy flow is for a single Ubuntu Droplet running:
4
+ - k3s (single-node Kubernetes)
5
+ - AntiAtropos sample workloads (`prod-sre`)
6
+ - Prometheus + Grafana (`monitoring`)
7
+ - FastAPI control server (`antiatropos-fastapi` systemd service)
8
+
9
+ ## Run
10
+
11
+ From repository root on the Droplet:
12
+
13
+ ```bash
14
+ sudo bash deploy/do/deploy-droplet-one-shot.sh
15
+ ```
16
+
17
+ Optional overrides:
18
+
19
+ ```bash
20
+ sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
21
+ ```
22
+
23
+ ## What the script configures
24
+
25
+ - k3s kubelet with `max-pods=250`
26
+ - Env file at `.env.droplet` with:
27
+ - `ANTIATROPOS_ENV_MODE=live`
28
+ - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
29
+ - `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
30
+ - Systemd service:
31
+ - Name: `antiatropos-fastapi`
32
+ - Exec: `uvicorn server.app:app --host 0.0.0.0 --port 8000` (defaults; host and port follow `FASTAPI_HOST` / `FASTAPI_PORT`)
33
+
34
+ ## Verify
35
+
36
+ ```bash
37
+ systemctl status antiatropos-fastapi --no-pager
38
+ curl http://127.0.0.1:8000/config/runtime
39
+ kubectl get deploy -n prod-sre
40
+ kubectl get pods -n monitoring
41
+ kubectl -n monitoring port-forward svc/grafana 3000:80
42
+ ```
43
+
44
+ ## Agent call example
45
+
46
+ ```bash
47
+ curl -X POST http://127.0.0.1:8000/step \
48
+ -H "Content-Type: application/json" \
49
+ -d '{"action_type":"SCALE_UP","target_node_id":"node-3","parameter":0.6}'
50
+ ```
deploy/do/deploy-droplet-one-shot.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # One-shot deploy for a single DigitalOcean Droplet:
5
+ # - Installs k3s with kubelet max-pods=250
6
+ # - Deploys workloads + Prometheus + Grafana
7
+ # - Creates env file for live Kubernetes scaling
8
+ # - Starts FastAPI server via systemd (antiatropos-fastapi)
9
+
10
+ if [[ "${EUID}" -ne 0 ]]; then
11
+ echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
12
+ exit 1
13
+ fi
14
+
15
+ REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
16
+ KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
17
+ FASTAPI_PORT="${FASTAPI_PORT:-8000}"
18
+ FASTAPI_HOST="${FASTAPI_HOST:-0.0.0.0}"
19
+ K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
20
+ MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
21
+ PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
22
+ ENV_FILE="${ENV_FILE:-${REPO_DIR}/.env.droplet}"
23
+ MIN_REPLICAS="${MIN_REPLICAS:-1}"
24
+ MAX_REPLICAS="${MAX_REPLICAS:-250}"
25
+ SCALE_STEP="${SCALE_STEP:-3}"
26
+ # Keep the JSON default in its own variable: inside ${VAR:-word} the first "}" ends the expansion, so an inline JSON default is truncated and the rest is appended literally.
+ DEFAULT_WORKLOAD_MAP='{"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}'
+ WORKLOAD_MAP="${WORKLOAD_MAP:-${DEFAULT_WORKLOAD_MAP}}"
27
+
28
+ echo "=== AntiAtropos Droplet One-Shot Deploy ==="
29
+ echo "Repo: ${REPO_DIR}"
30
+ echo "Kubeconfig: ${KUBECONFIG_PATH}"
31
+ echo "FastAPI: ${FASTAPI_HOST}:${FASTAPI_PORT}"
32
+ echo ""
33
+
34
+ if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
35
+ echo "ERROR: deploy/local-laptop.yaml not found. Run from AntiAtropos checkout."
36
+ exit 1
37
+ fi
38
+
39
+ export DEBIAN_FRONTEND=noninteractive
40
+ apt-get update
41
+ apt-get install -y curl ca-certificates gnupg lsb-release python3 python3-venv python3-pip
42
+
43
+ if ! command -v kubectl >/dev/null 2>&1; then
44
+ echo "Installing k3s (includes kubectl)..."
45
+ curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg=max-pods=250
46
+ else
47
+ echo "k3s/kubectl already present; skipping k3s install."
48
+ fi
49
+
50
+ if ! command -v helm >/dev/null 2>&1; then
51
+ echo "Installing Helm..."
52
+ curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
53
+ fi
54
+
55
+ export KUBECONFIG="${KUBECONFIG_PATH}"
56
+
57
+ echo "Waiting for Kubernetes node to be Ready..."
58
+ kubectl wait --for=condition=Ready node --all --timeout=180s
59
+
60
+ kubectl create ns "${K8S_NAMESPACE}" >/dev/null 2>&1 || true
61
+ kubectl create ns "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
62
+
63
+ echo "Deploying AntiAtropos workloads..."
64
+ kubectl apply -f "${REPO_DIR}/deploy/local-laptop.yaml"
65
+
66
+ echo "Installing/upgrading Prometheus + Grafana..."
67
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
68
+ helm repo add grafana https://grafana.github.io/helm-charts >/dev/null 2>&1 || true
69
+ helm repo update
70
+
71
+ helm upgrade --install prometheus prometheus-community/prometheus \
72
+ -n "${MONITORING_NAMESPACE}" \
73
+ -f "${REPO_DIR}/deploy/prometheus-helm-values.yaml"
74
+
75
+ if [[ -d "${REPO_DIR}/deploy/grafana/provisioning/dashboards/json" ]]; then
76
+ kubectl delete configmap grafana-dashboards -n "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
77
+ kubectl create configmap grafana-dashboards \
78
+ -n "${MONITORING_NAMESPACE}" \
79
+ --from-file="${REPO_DIR}/deploy/grafana/provisioning/dashboards/json/"
80
+ fi
81
+
82
+ helm upgrade --install grafana grafana/grafana \
83
+ -n "${MONITORING_NAMESPACE}" \
84
+ -f "${REPO_DIR}/deploy/grafana-helm-values.yaml"
85
+
86
+ # Write the service env file once; an existing file is kept so manual edits survive re-runs.
+ # KUBECONFIG and the namespace come from the same overridable variables the rest of this script uses.
+ if [[ ! -f "${ENV_FILE}" ]]; then
87
+ cat > "${ENV_FILE}" <<EOF
88
+ ANTIATROPOS_ENV_MODE=live
89
+ KUBECONFIG=${KUBECONFIG_PATH}
90
+ ANTIATROPOS_K8S_NAMESPACE=${K8S_NAMESPACE}
91
+ ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
92
+ ANTIATROPOS_MAX_REPLICAS=${MAX_REPLICAS}
93
+ ANTIATROPOS_SCALE_STEP=${SCALE_STEP}
94
+ ANTIATROPOS_WORKLOAD_MAP=${WORKLOAD_MAP}
95
+ EOF
96
+ echo "Created ${ENV_FILE}"
97
+ else
98
+ echo "Using existing ${ENV_FILE}"
99
+ fi
100
+
101
+ echo "Preparing Python environment..."
102
+ python3 -m venv "${PY_VENV_DIR}"
103
+ "${PY_VENV_DIR}/bin/python" -m pip install --upgrade pip
104
+ "${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
105
+ "${PY_VENV_DIR}/bin/pip" install -e "${REPO_DIR}"
106
+
107
+ cat > /etc/systemd/system/antiatropos-fastapi.service <<EOF
108
+ [Unit]
109
+ Description=AntiAtropos FastAPI Server
110
+ After=network-online.target k3s.service
111
+ Wants=network-online.target
112
+
113
+ [Service]
114
+ Type=simple
115
+ User=root
116
+ WorkingDirectory=${REPO_DIR}
117
+ EnvironmentFile=${ENV_FILE}
118
+ ExecStart=${PY_VENV_DIR}/bin/uvicorn server.app:app --host ${FASTAPI_HOST} --port ${FASTAPI_PORT}
119
+ Restart=always
120
+ RestartSec=3
121
+
122
+ [Install]
123
+ WantedBy=multi-user.target
124
+ EOF
125
+
126
+ systemctl daemon-reload
127
+ systemctl enable --now antiatropos-fastapi
128
+
129
+ echo ""
130
+ echo "Waiting for app readiness..."
131
+ # Poll the runtime endpoint (30 tries, 2s apart) and record whether it ever answered.
+ app_ready=0
+ for _ in {1..30}; do
132
+ if curl -fsS "http://127.0.0.1:${FASTAPI_PORT}/config/runtime" >/dev/null 2>&1; then
133
+ app_ready=1
+ break
134
+ fi
135
+ sleep 2
136
+ done
+ # Do not report a completed deploy when the service never came up.
+ if [[ "${app_ready}" -ne 1 ]]; then
+ echo "ERROR: FastAPI did not answer on port ${FASTAPI_PORT}; inspect: journalctl -u antiatropos-fastapi --no-pager" >&2
+ exit 1
+ fi
137
+
138
+ echo ""
139
+ echo "=== Deploy Complete ==="
140
+ echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
141
+ echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
142
+ echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
143
+ echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
144
+ echo ""
145
+ echo "Service status command:"
146
+ echo " systemctl status antiatropos-fastapi --no-pager"
147
+ echo ""
148
+ echo "If needed, edit env and restart:"
149
+ echo " ${ENV_FILE}"
150
+ echo " systemctl restart antiatropos-fastapi"
deploy/grafana-datasource-local.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grafana datasource provisioning - points to in-cluster Prometheus
2
+ apiVersion: 1
3
+
4
+ datasources:
5
+ - name: Prometheus
6
+ uid: PBFA97CFB590B2093
7
+ type: prometheus
8
+ access: proxy
9
+ url: http://prometheus-server.monitoring.svc.cluster.local
10
+ isDefault: true
11
+ editable: true
deploy/grafana-helm-values.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grafana self-hosted on Kind - Simplified dashboard + datasource setup
2
+
3
+ adminUser: admin
4
+ adminPassword: antiatropos
5
+
6
+ service:
7
+ type: ClusterIP
8
+ port: 80
9
+
10
+ persistence:
11
+ enabled: false
12
+
13
+ # Datasource provisioning - mount as separate file
14
+ datasources:
15
+ datasources.yaml:
16
+ apiVersion: 1
17
+ datasources:
18
+ - name: Prometheus
19
+ uid: PBFA97CFB590B2093
20
+ type: prometheus
21
+ access: proxy
22
+ url: http://prometheus-server.monitoring.svc.cluster.local
23
+ isDefault: true
24
+ editable: true
25
+
26
+ # Dashboard provider config
27
+ dashboardProviders:
28
+ dashboardproviders.yaml:
29
+ apiVersion: 1
30
+ providers:
31
+ - name: AntiAtropos
32
+ orgId: 1
33
+ folder: AntiAtropos
34
+ type: file
35
+ disableDeletion: false
36
+ editable: true
37
+ updateIntervalSeconds: 30
38
+ options:
39
+ path: /var/lib/grafana/dashboards/antiatropos
40
+
41
+ # Mount dashboard JSONs from ConfigMap
42
+ extraConfigmapMounts:
43
+ - name: grafana-dashboards
44
+ configMap: grafana-dashboards
45
+ mountPath: /var/lib/grafana/dashboards/antiatropos
46
+ readOnly: true
deploy/local-laptop.yaml CHANGED
@@ -20,7 +20,7 @@ spec:
20
  annotations:
21
  prometheus.io/scrape: "true"
22
  prometheus.io/port: "8080"
23
- prometheus.io/path: "/metrics"
24
  spec:
25
  containers:
26
  - name: auth
@@ -71,7 +71,7 @@ spec:
71
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
72
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
73
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
74
- } > /www/metrics
75
  sleep 2
76
  done
77
  ---
@@ -92,7 +92,7 @@ spec:
92
  annotations:
93
  prometheus.io/scrape: "true"
94
  prometheus.io/port: "8080"
95
- prometheus.io/path: "/metrics"
96
  spec:
97
  containers:
98
  - name: cart
@@ -143,7 +143,7 @@ spec:
143
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
144
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
145
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
146
- } > /www/metrics
147
  sleep 2
148
  done
149
  ---
@@ -164,7 +164,7 @@ spec:
164
  annotations:
165
  prometheus.io/scrape: "true"
166
  prometheus.io/port: "8080"
167
- prometheus.io/path: "/metrics"
168
  spec:
169
  containers:
170
  - name: catalog
@@ -215,7 +215,7 @@ spec:
215
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
216
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
217
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
218
- } > /www/metrics
219
  sleep 2
220
  done
221
  ---
@@ -236,7 +236,7 @@ spec:
236
  annotations:
237
  prometheus.io/scrape: "true"
238
  prometheus.io/port: "8080"
239
- prometheus.io/path: "/metrics"
240
  spec:
241
  containers:
242
  - name: checkout
@@ -287,7 +287,7 @@ spec:
287
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
288
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
289
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
290
- } > /www/metrics
291
  sleep 2
292
  done
293
  ---
@@ -308,7 +308,7 @@ spec:
308
  annotations:
309
  prometheus.io/scrape: "true"
310
  prometheus.io/port: "8080"
311
- prometheus.io/path: "/metrics"
312
  spec:
313
  containers:
314
  - name: payments
@@ -359,7 +359,7 @@ spec:
359
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
360
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
361
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
362
- } > /www/metrics
363
  sleep 2
364
  done
365
 
 
20
  annotations:
21
  prometheus.io/scrape: "true"
22
  prometheus.io/port: "8080"
23
+ prometheus.io/path: "/metrics.txt"
24
  spec:
25
  containers:
26
  - name: auth
 
71
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
72
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
73
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
74
+ } > /www/metrics.txt
75
  sleep 2
76
  done
77
  ---
 
92
  annotations:
93
  prometheus.io/scrape: "true"
94
  prometheus.io/port: "8080"
95
+ prometheus.io/path: "/metrics.txt"
96
  spec:
97
  containers:
98
  - name: cart
 
143
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
144
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
145
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
146
+ } > /www/metrics.txt
147
  sleep 2
148
  done
149
  ---
 
164
  annotations:
165
  prometheus.io/scrape: "true"
166
  prometheus.io/port: "8080"
167
+ prometheus.io/path: "/metrics.txt"
168
  spec:
169
  containers:
170
  - name: catalog
 
215
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
216
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
217
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
218
+ } > /www/metrics.txt
219
  sleep 2
220
  done
221
  ---
 
236
  annotations:
237
  prometheus.io/scrape: "true"
238
  prometheus.io/port: "8080"
239
+ prometheus.io/path: "/metrics.txt"
240
  spec:
241
  containers:
242
  - name: checkout
 
287
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
288
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
289
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
290
+ } > /www/metrics.txt
291
  sleep 2
292
  done
293
  ---
 
308
  annotations:
309
  prometheus.io/scrape: "true"
310
  prometheus.io/port: "8080"
311
+ prometheus.io/path: "/metrics.txt"
312
  spec:
313
  containers:
314
  - name: payments
 
359
  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
360
  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
361
  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
362
+ } > /www/metrics.txt
363
  sleep 2
364
  done
365
 
deploy/prometheus-helm-values.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ server:
2
+ global:
3
+ scrape_interval: 15s
4
+ evaluation_interval: 15s
5
+
6
+ extraScrapeConfigs: |
7
+ - job_name: antiatropos-fastapi
8
+ scrape_protocols:
9
+ - PrometheusText1.0.0
10
+ - PrometheusText0.0.4
11
+ metrics_path: /metrics
12
+ static_configs:
13
+ - targets: ['host.docker.internal:8000']
deploy/prometheus-local.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+ scrape_protocols:
5
+ - OpenMetricsText1.0.0
6
+ - OpenMetricsText0.0.1
7
+ - PrometheusText1.0.0
8
+ - PrometheusText0.0.4
9
+
10
+ scrape_configs:
11
+ - job_name: kubernetes-pods
12
+ scrape_protocols:
13
+ - PrometheusText1.0.0
14
+ - PrometheusText0.0.4
15
+ kubernetes_sd_configs:
16
+ - role: pod
17
+ relabel_configs:
18
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
19
+ action: keep
20
+ regex: true
21
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
22
+ action: replace
23
+ target_label: __metrics_path__
24
+ regex: (.+)
25
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
26
+ action: replace
27
+ regex: ([^:]+)(?::\d+)?;(\d+)
28
+ replacement: $1:$2
29
+ target_label: __address__
30
+ - source_labels: [__meta_kubernetes_namespace]
31
+ target_label: namespace
32
+ - source_labels: [__meta_kubernetes_pod_name]
33
+ target_label: pod
34
+ - source_labels: [__meta_kubernetes_pod_label_app]
35
+ target_label: app
simulator.py CHANGED
@@ -64,8 +64,11 @@ CRITICAL_NODES: list[str] = ["node-0", "node-1", "node-2"]
64
 
65
  # VIP / business-critical node weights.
66
  # node-0 is the payment portal, so its queue growth or failure matters more.
 
 
 
67
  VIP_NODE_WEIGHTS: dict[str, float] = {
68
- "node-0": 4.0,
69
  }
70
 
71
 
 
64
 
65
  # VIP / business-critical node weights.
66
  # node-0 is the payment portal, so its queue growth or failure matters more.
67
+ # Reduced from 4.0 → 2.0 to prevent reward gradient from creating
68
+ # a local optimum where the agent only scales node-0.
69
+ # At 2×, node-0 is still prioritized but other nodes remain viable targets.
70
  VIP_NODE_WEIGHTS: dict[str, float] = {
71
+ "node-0": 2.0,
72
  }
73
 
74
 
start-grafana.ps1 CHANGED
@@ -1,12 +1,15 @@
1
- docker stop antiatropos-grafana 2>$null
2
- docker rm antiatropos-grafana 2>$null
3
 
4
- Write-Host "Starting local Grafana (datasource -> host.docker.internal:9090)..."
5
 
6
- docker run -d --name antiatropos-grafana -p 3000:3000 `
7
- -v "$PWD\deploy\grafana\provisioning:/etc/grafana/provisioning:ro" `
8
- -e GF_AUTH_ANONYMOUS_ENABLED=true `
9
- -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin `
10
- grafana/grafana:latest | Out-Null
11
 
12
- Write-Host "Grafana is running at http://localhost:3000"
 
 
 
 
 
 
 
 
1
+ # Start Grafana port-forward from Kind cluster
2
+ # Run 'deploy-local.ps1' first to ensure Grafana is deployed.
3
 
4
+ $port = 3000
5
 
6
+ Write-Host "Starting Grafana port-forward on localhost:$port..."
 
 
 
 
7
 
8
+ # Kill any existing on that port
9
+ $existing = Get-NetTCPConnection -LocalPort $port -ErrorAction SilentlyContinue
10
+ if ($existing) {
11
+ Stop-Process -Id $existing.OwningProcess -Force -ErrorAction SilentlyContinue
12
+ Start-Sleep -Seconds 1
13
+ }
14
+
15
+ kubectl port-forward -n monitoring svc/grafana ${port}:80
teardown-local.ps1 ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos Local Cluster Teardown
2
+ # Removes workloads, Prometheus, and Grafana. Stops port-forwards.
3
+
4
+ Write-Host "=== AntiAtropos Local Teardown ===" -ForegroundColor Cyan
5
+
6
+ # --- Stop port-forward ---
7
+ Write-Host "[1/3] Stopping port-forwards..." -ForegroundColor Yellow
8
+ $connections = Get-NetTCPConnection -LocalPort 3000 -ErrorAction SilentlyContinue
9
+ if ($connections) {
10
+ $connections | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }
11
+ Write-Host " Stopped port-forward on :3000"
12
+ }
13
+
14
+ # --- Uninstall Helm releases ---
15
+ Write-Host "[2/3] Uninstalling Helm releases..." -ForegroundColor Yellow
16
+ helm uninstall grafana -n monitoring 2>&1 | Out-Null
17
+ Write-Host " Grafana uninstalled."
18
+ helm uninstall prometheus -n monitoring 2>&1 | Out-Null
19
+ Write-Host " Prometheus uninstalled."
20
+
21
+ # --- Delete namespaces ---
22
+ Write-Host "[3/3] Deleting namespaces..." -ForegroundColor Yellow
23
+ kubectl delete ns prod-sre monitoring 2>&1 | Out-Null
24
+
25
+ Start-Sleep -Seconds 3
26
+ Write-Host ""
27
+ Write-Host "=== Teardown Complete ===" -ForegroundColor Green
28
+ kubectl get ns