div18 commited on
Commit
af7de22
·
1 Parent(s): d6496d4

Enhance Kubernetes executor and deployment configurations

Browse files

- Introduced remote execution capabilities in KubernetesExecutor to delegate actions to a FastAPI control plane.
- Updated .env.example to clarify Prometheus URL usage for local and VM telemetry.
- Modified Prometheus Helm values to expose the service on NodePort 30090 and added a scrape job for annotated pods in the prod-sre namespace.
- Updated deployment scripts and README to reflect new Prometheus configurations and provide clearer instructions for local setup.

.env.example CHANGED
@@ -6,7 +6,8 @@ ANTIATROPOS_ENV_MODE=live
6
  # Reward output to agent
7
  ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
8
 
9
- # Local Prometheus endpoint (use kubectl port-forward to localhost:9090)
 
10
  PROMETHEUS_URL=http://localhost:9090
11
  ANTIATROPOS_PROM_TIMEOUT_S=5.0
12
  ANTIATROPOS_STRICT_REAL=false
 
6
  # Reward output to agent
7
  ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
8
 
9
+ # Prometheus endpoint used by local simulator FastAPI.
10
+ # For VM telemetry, set to droplet Prometheus NodePort, e.g. http://206.189.136.21:30090
11
  PROMETHEUS_URL=http://localhost:9090
12
  ANTIATROPOS_PROM_TIMEOUT_S=5.0
13
  ANTIATROPOS_STRICT_REAL=false
control/kubernetes_executor.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import json
3
  import time
4
  import logging
 
5
  from uuid import uuid4
6
  from typing import Optional
7
 
@@ -15,7 +16,12 @@ class KubernetesExecutor:
15
  def __init__(self, kubeconfig: Optional[str] = None):
16
  # Use provided path or env var, defaulting to mock if neither is found
17
  self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
18
- self.is_mock = not self.kubeconfig or self.kubeconfig.lower() == "mock"
 
 
 
 
 
19
  self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
20
  self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
21
  self.max_replicas = self._parse_max_replicas(os.getenv("ANTIATROPOS_MAX_REPLICAS"))
@@ -112,6 +118,9 @@ class KubernetesExecutor:
112
  """Execute bounded actions on a Kubernetes cluster."""
113
  action = self._normalize_action_type(action_type)
114
 
 
 
 
115
  if action == "NO_OP":
116
  return "Ack: NO_OP - no cluster mutation"
117
 
@@ -163,6 +172,70 @@ class KubernetesExecutor:
163
  f"in namespace {namespace} scaled {current}->{desired}"
164
  )
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def _get_apps_v1_api(self):
167
  if self._apps_v1_api is not None:
168
  return self._apps_v1_api
 
2
  import json
3
  import time
4
  import logging
5
+ import requests
6
  from uuid import uuid4
7
  from typing import Optional
8
 
 
16
  def __init__(self, kubeconfig: Optional[str] = None):
17
  # Use provided path or env var, defaulting to mock if neither is found
18
  self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
19
+ self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
20
+ self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
21
+ self.is_mock = (
22
+ not self.remote_control_url
23
+ and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
24
+ )
25
  self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
26
  self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
27
  self.max_replicas = self._parse_max_replicas(os.getenv("ANTIATROPOS_MAX_REPLICAS"))
 
118
  """Execute bounded actions on a Kubernetes cluster."""
119
  action = self._normalize_action_type(action_type)
120
 
121
+ if self.remote_control_url:
122
+ return self._remote_execution(action, target, parameter)
123
+
124
  if action == "NO_OP":
125
  return "Ack: NO_OP - no cluster mutation"
126
 
 
172
  f"in namespace {namespace} scaled {current}->{desired}"
173
  )
174
 
175
+ def _remote_execution(self, action: str, target: str, parameter: float) -> str:
176
+ """
177
+ Delegate action execution to a remote FastAPI control plane.
178
+
179
+ Expected remote endpoint contract:
180
+ - POST /step
181
+ - Request: {action_type, target_node_id, parameter}
182
+ - Success response includes ack_status and starts with "Ack:"
183
+ """
184
+ if not self.remote_control_url:
185
+ raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
186
+
187
+ endpoint = f"{self.remote_control_url}/step"
188
+ action_payload = {
189
+ "action_type": action,
190
+ "target_node_id": target,
191
+ "parameter": float(parameter),
192
+ }
193
+ payload = action_payload
194
+
195
+ try:
196
+ response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
197
+ except requests.RequestException as exc:
198
+ raise RuntimeError(f"Remote control-plane request failed: {exc}") from exc
199
+
200
+ if response.status_code == 422:
201
+ # OpenEnv server.app expects {"action": {...}} shape on /step.
202
+ try:
203
+ body = response.json()
204
+ detail = str(body.get("detail", body))
205
+ except Exception:
206
+ detail = response.text.strip()
207
+ if "body" in detail and "action" in detail:
208
+ try:
209
+ response = requests.post(
210
+ endpoint,
211
+ json={"action": action_payload},
212
+ timeout=self.remote_timeout_s,
213
+ )
214
+ except requests.RequestException as exc:
215
+ raise RuntimeError(f"Remote control-plane retry failed: {exc}") from exc
216
+
217
+ if response.status_code >= 400:
218
+ detail = ""
219
+ try:
220
+ body = response.json()
221
+ detail = str(body.get("detail", body))
222
+ except Exception:
223
+ detail = response.text.strip()
224
+ raise RuntimeError(
225
+ f"Remote control-plane rejected action ({response.status_code}): {detail}"
226
+ )
227
+
228
+ try:
229
+ data = response.json()
230
+ except Exception as exc:
231
+ raise RuntimeError("Remote control-plane returned non-JSON response") from exc
232
+
233
+ ack = str(data.get("ack_status", "")).strip()
234
+ if not ack:
235
+ action_id = str(data.get("action_id", "")).strip() or "remote"
236
+ return f"Ack: {action} for {target} via remote control-plane ({action_id})"
237
+ return ack
238
+
239
  def _get_apps_v1_api(self):
240
  if self._apps_v1_api is not None:
241
  return self._apps_v1_api
deploy/do/README.md CHANGED
@@ -23,6 +23,8 @@ sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do
23
  ## What the script configures
24
 
25
  - k3s kubelet with `max-pods=250`
 
 
26
  - Env file at `.env.droplet` with:
27
  - `ANTIATROPOS_ENV_MODE=live`
28
  - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
@@ -38,9 +40,16 @@ systemctl status antiatropos-fastapi --no-pager
38
  curl http://127.0.0.1:8000/config/runtime
39
  kubectl get deploy -n prod-sre
40
  kubectl get pods -n monitoring
 
41
  kubectl -n monitoring port-forward svc/grafana 3000:80
42
  ```
43
 
 
 
 
 
 
 
44
  ## Agent call example
45
 
46
  ```bash
 
23
  ## What the script configures
24
 
25
  - k3s kubelet with `max-pods=250`
26
+ - Prometheus service exposed on NodePort `30090`
27
+ - Prometheus scrape job for annotated pods in namespace `prod-sre`
28
  - Env file at `.env.droplet` with:
29
  - `ANTIATROPOS_ENV_MODE=live`
30
  - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
 
40
  curl http://127.0.0.1:8000/config/runtime
41
  kubectl get deploy -n prod-sre
42
  kubectl get pods -n monitoring
43
+ curl http://127.0.0.1:30090/api/v1/targets
44
  kubectl -n monitoring port-forward svc/grafana 3000:80
45
  ```
46
 
47
+ If your local simulator FastAPI should use VM telemetry, set local `.env`:
48
+
49
+ ```env
50
+ PROMETHEUS_URL=http://<droplet-ip>:30090
51
+ ```
52
+
53
  ## Agent call example
54
 
55
  ```bash
deploy/do/deploy-droplet-one-shot.sh CHANGED
@@ -139,11 +139,18 @@ for _ in {1..30}; do
139
  sleep 2
140
  done
141
 
 
 
 
 
 
 
142
  echo ""
143
  echo "=== Deploy Complete ==="
144
  echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
145
  echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
146
  echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
 
147
  echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
148
  echo ""
149
  echo "Service status command:"
 
139
  sleep 2
140
  done
141
 
142
+ PUBLIC_IP="$(curl -fsS https://api.ipify.org 2>/dev/null || true)"
143
+ if [[ -z "${PUBLIC_IP}" ]]; then
144
+ PUBLIC_IP="$(hostname -I 2>/dev/null | awk '{print $1}')"
145
+ fi
146
+ PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
147
+
148
  echo ""
149
  echo "=== Deploy Complete ==="
150
  echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
151
  echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
152
  echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
153
+ echo "Prometheus URL: ${PROM_URL_DISPLAY}"
154
  echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
155
  echo ""
156
  echo "Service status command:"
deploy/prometheus-helm-values.yaml CHANGED
@@ -2,12 +2,28 @@ server:
2
  global:
3
  scrape_interval: 15s
4
  evaluation_interval: 15s
 
 
 
5
 
6
  extraScrapeConfigs: |
7
- - job_name: antiatropos-fastapi
8
- scrape_protocols:
9
- - PrometheusText1.0.0
10
- - PrometheusText0.0.4
11
- metrics_path: /metrics
12
- static_configs:
13
- - targets: ['host.docker.internal:8000']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  global:
3
  scrape_interval: 15s
4
  evaluation_interval: 15s
5
+ service:
6
+ type: NodePort
7
+ nodePort: 30090
8
 
9
  extraScrapeConfigs: |
10
+ # Scrape all annotated workload pods in prod-sre.
11
+ - job_name: antiatropos-prod-sre-pods
12
+ kubernetes_sd_configs:
13
+ - role: pod
14
+ relabel_configs:
15
+ - source_labels: [__meta_kubernetes_namespace]
16
+ action: keep
17
+ regex: prod-sre
18
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
19
+ action: keep
20
+ regex: "true"
21
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
22
+ action: replace
23
+ target_label: __metrics_path__
24
+ regex: (.+)
25
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
26
+ action: replace
27
+ regex: ([^:]+)(?::\d+)?;(\d+)
28
+ replacement: $1:$2
29
+ target_label: __address__