div18 commited on
Commit ·
af7de22
1
Parent(s): d6496d4
Enhance Kubernetes executor and deployment configurations
Browse files- Introduced remote execution capabilities in KubernetesExecutor to delegate actions to a FastAPI control plane.
- Updated .env.example to clarify Prometheus URL usage for local and VM telemetry.
- Modified Prometheus Helm values to expose the service on NodePort 30090 and added a scrape job for annotated pods in the prod-sre namespace.
- Updated deployment scripts and README to reflect new Prometheus configurations and provide clearer instructions for local setup.
- .env.example +2 -1
- control/kubernetes_executor.py +74 -1
- deploy/do/README.md +9 -0
- deploy/do/deploy-droplet-one-shot.sh +7 -0
- deploy/prometheus-helm-values.yaml +23 -7
.env.example
CHANGED
|
@@ -6,7 +6,8 @@ ANTIATROPOS_ENV_MODE=live
|
|
| 6 |
# Reward output to agent
|
| 7 |
ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
|
| 8 |
|
| 9 |
-
#
|
|
|
|
| 10 |
PROMETHEUS_URL=http://localhost:9090
|
| 11 |
ANTIATROPOS_PROM_TIMEOUT_S=5.0
|
| 12 |
ANTIATROPOS_STRICT_REAL=false
|
|
|
|
| 6 |
# Reward output to agent
|
| 7 |
ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
|
| 8 |
|
| 9 |
+
# Prometheus endpoint used by local simulator FastAPI.
|
| 10 |
+
# For VM telemetry, set to droplet Prometheus NodePort, e.g. http://206.189.136.21:30090
|
| 11 |
PROMETHEUS_URL=http://localhost:9090
|
| 12 |
ANTIATROPOS_PROM_TIMEOUT_S=5.0
|
| 13 |
ANTIATROPOS_STRICT_REAL=false
|
control/kubernetes_executor.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
import logging
|
|
|
|
| 5 |
from uuid import uuid4
|
| 6 |
from typing import Optional
|
| 7 |
|
|
@@ -15,7 +16,12 @@ class KubernetesExecutor:
|
|
| 15 |
def __init__(self, kubeconfig: Optional[str] = None):
|
| 16 |
# Use provided path or env var, defaulting to mock if neither is found
|
| 17 |
self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
|
| 18 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
|
| 20 |
self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
|
| 21 |
self.max_replicas = self._parse_max_replicas(os.getenv("ANTIATROPOS_MAX_REPLICAS"))
|
|
@@ -112,6 +118,9 @@ class KubernetesExecutor:
|
|
| 112 |
"""Execute bounded actions on a Kubernetes cluster."""
|
| 113 |
action = self._normalize_action_type(action_type)
|
| 114 |
|
|
|
|
|
|
|
|
|
|
| 115 |
if action == "NO_OP":
|
| 116 |
return "Ack: NO_OP - no cluster mutation"
|
| 117 |
|
|
@@ -163,6 +172,70 @@ class KubernetesExecutor:
|
|
| 163 |
f"in namespace {namespace} scaled {current}->{desired}"
|
| 164 |
)
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
def _get_apps_v1_api(self):
|
| 167 |
if self._apps_v1_api is not None:
|
| 168 |
return self._apps_v1_api
|
|
|
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
import logging
|
| 5 |
+
import requests
|
| 6 |
from uuid import uuid4
|
| 7 |
from typing import Optional
|
| 8 |
|
|
|
|
| 16 |
def __init__(self, kubeconfig: Optional[str] = None):
|
| 17 |
# Use provided path or env var, defaulting to mock if neither is found
|
| 18 |
self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
|
| 19 |
+
self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
|
| 20 |
+
self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
|
| 21 |
+
self.is_mock = (
|
| 22 |
+
not self.remote_control_url
|
| 23 |
+
and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
|
| 24 |
+
)
|
| 25 |
self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
|
| 26 |
self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
|
| 27 |
self.max_replicas = self._parse_max_replicas(os.getenv("ANTIATROPOS_MAX_REPLICAS"))
|
|
|
|
| 118 |
"""Execute bounded actions on a Kubernetes cluster."""
|
| 119 |
action = self._normalize_action_type(action_type)
|
| 120 |
|
| 121 |
+
if self.remote_control_url:
|
| 122 |
+
return self._remote_execution(action, target, parameter)
|
| 123 |
+
|
| 124 |
if action == "NO_OP":
|
| 125 |
return "Ack: NO_OP - no cluster mutation"
|
| 126 |
|
|
|
|
| 172 |
f"in namespace {namespace} scaled {current}->{desired}"
|
| 173 |
)
|
| 174 |
|
| 175 |
+
def _remote_execution(self, action: str, target: str, parameter: float) -> str:
|
| 176 |
+
"""
|
| 177 |
+
Delegate action execution to a remote FastAPI control plane.
|
| 178 |
+
|
| 179 |
+
Expected remote endpoint contract:
|
| 180 |
+
- POST /step
|
| 181 |
+
- Request: {action_type, target_node_id, parameter}
|
| 182 |
+
- Success response includes ack_status and starts with "Ack:"
|
| 183 |
+
"""
|
| 184 |
+
if not self.remote_control_url:
|
| 185 |
+
raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
|
| 186 |
+
|
| 187 |
+
endpoint = f"{self.remote_control_url}/step"
|
| 188 |
+
action_payload = {
|
| 189 |
+
"action_type": action,
|
| 190 |
+
"target_node_id": target,
|
| 191 |
+
"parameter": float(parameter),
|
| 192 |
+
}
|
| 193 |
+
payload = action_payload
|
| 194 |
+
|
| 195 |
+
try:
|
| 196 |
+
response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
|
| 197 |
+
except requests.RequestException as exc:
|
| 198 |
+
raise RuntimeError(f"Remote control-plane request failed: {exc}") from exc
|
| 199 |
+
|
| 200 |
+
if response.status_code == 422:
|
| 201 |
+
# OpenEnv server.app expects {"action": {...}} shape on /step.
|
| 202 |
+
try:
|
| 203 |
+
body = response.json()
|
| 204 |
+
detail = str(body.get("detail", body))
|
| 205 |
+
except Exception:
|
| 206 |
+
detail = response.text.strip()
|
| 207 |
+
if "body" in detail and "action" in detail:
|
| 208 |
+
try:
|
| 209 |
+
response = requests.post(
|
| 210 |
+
endpoint,
|
| 211 |
+
json={"action": action_payload},
|
| 212 |
+
timeout=self.remote_timeout_s,
|
| 213 |
+
)
|
| 214 |
+
except requests.RequestException as exc:
|
| 215 |
+
raise RuntimeError(f"Remote control-plane retry failed: {exc}") from exc
|
| 216 |
+
|
| 217 |
+
if response.status_code >= 400:
|
| 218 |
+
detail = ""
|
| 219 |
+
try:
|
| 220 |
+
body = response.json()
|
| 221 |
+
detail = str(body.get("detail", body))
|
| 222 |
+
except Exception:
|
| 223 |
+
detail = response.text.strip()
|
| 224 |
+
raise RuntimeError(
|
| 225 |
+
f"Remote control-plane rejected action ({response.status_code}): {detail}"
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
try:
|
| 229 |
+
data = response.json()
|
| 230 |
+
except Exception as exc:
|
| 231 |
+
raise RuntimeError("Remote control-plane returned non-JSON response") from exc
|
| 232 |
+
|
| 233 |
+
ack = str(data.get("ack_status", "")).strip()
|
| 234 |
+
if not ack:
|
| 235 |
+
action_id = str(data.get("action_id", "")).strip() or "remote"
|
| 236 |
+
return f"Ack: {action} for {target} via remote control-plane ({action_id})"
|
| 237 |
+
return ack
|
| 238 |
+
|
| 239 |
def _get_apps_v1_api(self):
|
| 240 |
if self._apps_v1_api is not None:
|
| 241 |
return self._apps_v1_api
|
deploy/do/README.md
CHANGED
|
@@ -23,6 +23,8 @@ sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do
|
|
| 23 |
## What the script configures
|
| 24 |
|
| 25 |
- k3s kubelet with `max-pods=250`
|
|
|
|
|
|
|
| 26 |
- Env file at `.env.droplet` with:
|
| 27 |
- `ANTIATROPOS_ENV_MODE=live`
|
| 28 |
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
|
@@ -38,9 +40,16 @@ systemctl status antiatropos-fastapi --no-pager
|
|
| 38 |
curl http://127.0.0.1:8000/config/runtime
|
| 39 |
kubectl get deploy -n prod-sre
|
| 40 |
kubectl get pods -n monitoring
|
|
|
|
| 41 |
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 42 |
```
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
## Agent call example
|
| 45 |
|
| 46 |
```bash
|
|
|
|
| 23 |
## What the script configures
|
| 24 |
|
| 25 |
- k3s kubelet with `max-pods=250`
|
| 26 |
+
- Prometheus service exposed on NodePort `30090`
|
| 27 |
+
- Prometheus scrape job for annotated pods in namespace `prod-sre`
|
| 28 |
- Env file at `.env.droplet` with:
|
| 29 |
- `ANTIATROPOS_ENV_MODE=live`
|
| 30 |
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
|
|
|
| 40 |
curl http://127.0.0.1:8000/config/runtime
|
| 41 |
kubectl get deploy -n prod-sre
|
| 42 |
kubectl get pods -n monitoring
|
| 43 |
+
curl http://127.0.0.1:30090/api/v1/targets
|
| 44 |
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 45 |
```
|
| 46 |
|
| 47 |
+
If your local simulator FastAPI should use VM telemetry, set local `.env`:
|
| 48 |
+
|
| 49 |
+
```env
|
| 50 |
+
PROMETHEUS_URL=http://<droplet-ip>:30090
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
## Agent call example
|
| 54 |
|
| 55 |
```bash
|
deploy/do/deploy-droplet-one-shot.sh
CHANGED
|
@@ -139,11 +139,18 @@ for _ in {1..30}; do
|
|
| 139 |
sleep 2
|
| 140 |
done
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
echo ""
|
| 143 |
echo "=== Deploy Complete ==="
|
| 144 |
echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
|
| 145 |
echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
|
| 146 |
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
|
|
|
| 147 |
echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
|
| 148 |
echo ""
|
| 149 |
echo "Service status command:"
|
|
|
|
| 139 |
sleep 2
|
| 140 |
done
|
| 141 |
|
| 142 |
+
PUBLIC_IP="$(curl -fsS https://api.ipify.org 2>/dev/null || true)"
|
| 143 |
+
if [[ -z "${PUBLIC_IP}" ]]; then
|
| 144 |
+
PUBLIC_IP="$(hostname -I 2>/dev/null | awk '{print $1}')"
|
| 145 |
+
fi
|
| 146 |
+
PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
|
| 147 |
+
|
| 148 |
echo ""
|
| 149 |
echo "=== Deploy Complete ==="
|
| 150 |
echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
|
| 151 |
echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
|
| 152 |
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
| 153 |
+
echo "Prometheus URL: ${PROM_URL_DISPLAY}"
|
| 154 |
echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
|
| 155 |
echo ""
|
| 156 |
echo "Service status command:"
|
deploy/prometheus-helm-values.yaml
CHANGED
|
@@ -2,12 +2,28 @@ server:
|
|
| 2 |
global:
|
| 3 |
scrape_interval: 15s
|
| 4 |
evaluation_interval: 15s
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
extraScrapeConfigs: |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
global:
|
| 3 |
scrape_interval: 15s
|
| 4 |
evaluation_interval: 15s
|
| 5 |
+
service:
|
| 6 |
+
type: NodePort
|
| 7 |
+
nodePort: 30090
|
| 8 |
|
| 9 |
extraScrapeConfigs: |
|
| 10 |
+
# Scrape all annotated workload pods in prod-sre.
|
| 11 |
+
- job_name: antiatropos-prod-sre-pods
|
| 12 |
+
kubernetes_sd_configs:
|
| 13 |
+
- role: pod
|
| 14 |
+
relabel_configs:
|
| 15 |
+
- source_labels: [__meta_kubernetes_namespace]
|
| 16 |
+
action: keep
|
| 17 |
+
regex: prod-sre
|
| 18 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
| 19 |
+
action: keep
|
| 20 |
+
regex: "true"
|
| 21 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
| 22 |
+
action: replace
|
| 23 |
+
target_label: __metrics_path__
|
| 24 |
+
regex: (.+)
|
| 25 |
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
| 26 |
+
action: replace
|
| 27 |
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 28 |
+
replacement: $1:$2
|
| 29 |
+
target_label: __address__
|