div18 committed on
Commit
504ced2
·
1 Parent(s): af7de22

consolidate

Browse files
.env.example CHANGED
@@ -7,14 +7,16 @@ ANTIATROPOS_ENV_MODE=live
7
  ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
8
 
9
  # Prometheus endpoint used by local simulator FastAPI.
10
- # For VM telemetry, set to droplet Prometheus NodePort, e.g. http://206.189.136.21:30090
11
- PROMETHEUS_URL=http://localhost:9090
12
  ANTIATROPOS_PROM_TIMEOUT_S=5.0
13
  ANTIATROPOS_STRICT_REAL=false
14
  ANTIATROPOS_METRIC_AGGREGATION=sum
15
 
16
  # Kubernetes execution settings
17
  KUBECONFIG=C:/Users/your-user/.kube/config
 
 
18
  ANTIATROPOS_K8S_NAMESPACE=prod-sre
19
  ANTIATROPOS_MIN_REPLICAS=1
20
  ANTIATROPOS_MAX_REPLICAS=
@@ -31,6 +33,7 @@ ANTIATROPOS_GRAFANA_MODE=local
31
  # If GROQ_API_KEY is set and API_BASE_URL is not set, inference.py auto-uses Groq.
32
  GROQ_API_KEY=
33
  MODEL_NAME=llama-3.1-8b-instant
 
34
  ENV_URL=http://localhost:8000
35
  ANTIATROPOS_MODE=live
36
  ANTIATROPOS_LABEL_NODE_MAP={"payments":"node-0","checkout":"node-1","catalog":"node-2","cart":"node-3","auth":"node-4"}
 
7
  ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
8
 
9
  # Prometheus endpoint used by local simulator FastAPI.
10
+ # Consolidated path: droplet Prometheus NodePort.
11
+ PROMETHEUS_URL=http://<droplet-ip>:30090
12
  ANTIATROPOS_PROM_TIMEOUT_S=5.0
13
  ANTIATROPOS_STRICT_REAL=false
14
  ANTIATROPOS_METRIC_AGGREGATION=sum
15
 
16
  # Kubernetes execution settings
17
  KUBECONFIG=C:/Users/your-user/.kube/config
18
+ ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
19
+ ANTIATROPOS_CONTROL_TIMEOUT_S=8.0
20
  ANTIATROPOS_K8S_NAMESPACE=prod-sre
21
  ANTIATROPOS_MIN_REPLICAS=1
22
  ANTIATROPOS_MAX_REPLICAS=
 
33
  # If GROQ_API_KEY is set and API_BASE_URL is not set, inference.py auto-uses Groq.
34
  GROQ_API_KEY=
35
  MODEL_NAME=llama-3.1-8b-instant
36
+ # Local OpenEnv runtime remains authoritative.
37
  ENV_URL=http://localhost:8000
38
  ANTIATROPOS_MODE=live
39
  ANTIATROPOS_LABEL_NODE_MAP={"payments":"node-0","checkout":"node-1","catalog":"node-2","cart":"node-3","auth":"node-4"}
control/kubernetes_executor.py CHANGED
@@ -18,6 +18,8 @@ class KubernetesExecutor:
18
  self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
19
  self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
20
  self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
 
 
21
  self.is_mock = (
22
  not self.remote_control_url
23
  and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
@@ -29,6 +31,8 @@ class KubernetesExecutor:
29
  self._apps_v1_api = None
30
  self._node_workload_map = self._load_node_workload_map()
31
  self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}
 
 
32
 
33
  @staticmethod
34
  def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
@@ -161,10 +165,11 @@ class KubernetesExecutor:
161
  f"(bounds {self.min_replicas}-{upper})"
162
  )
163
 
164
- apps_v1.patch_namespaced_deployment_scale(
165
- name=deployment_name,
 
166
  namespace=namespace,
167
- body={"spec": {"replicas": desired}},
168
  )
169
 
170
  return (
@@ -172,6 +177,34 @@ class KubernetesExecutor:
172
  f"in namespace {namespace} scaled {current}->{desired}"
173
  )
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def _remote_execution(self, action: str, target: str, parameter: float) -> str:
176
  """
177
  Delegate action execution to a remote FastAPI control plane.
@@ -180,6 +213,9 @@ class KubernetesExecutor:
180
  - POST /step
181
  - Request: {action_type, target_node_id, parameter}
182
  - Success response includes ack_status and starts with "Ack:"
 
 
 
183
  """
184
  if not self.remote_control_url:
185
  raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
@@ -192,27 +228,7 @@ class KubernetesExecutor:
192
  }
193
  payload = action_payload
194
 
195
- try:
196
- response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
197
- except requests.RequestException as exc:
198
- raise RuntimeError(f"Remote control-plane request failed: {exc}") from exc
199
-
200
- if response.status_code == 422:
201
- # OpenEnv server.app expects {"action": {...}} shape on /step.
202
- try:
203
- body = response.json()
204
- detail = str(body.get("detail", body))
205
- except Exception:
206
- detail = response.text.strip()
207
- if "body" in detail and "action" in detail:
208
- try:
209
- response = requests.post(
210
- endpoint,
211
- json={"action": action_payload},
212
- timeout=self.remote_timeout_s,
213
- )
214
- except requests.RequestException as exc:
215
- raise RuntimeError(f"Remote control-plane retry failed: {exc}") from exc
216
 
217
  if response.status_code >= 400:
218
  detail = ""
@@ -221,6 +237,12 @@ class KubernetesExecutor:
221
  detail = str(body.get("detail", body))
222
  except Exception:
223
  detail = response.text.strip()
 
 
 
 
 
 
224
  raise RuntimeError(
225
  f"Remote control-plane rejected action ({response.status_code}): {detail}"
226
  )
@@ -236,6 +258,47 @@ class KubernetesExecutor:
236
  return f"Ack: {action} for {target} via remote control-plane ({action_id})"
237
  return ack
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  def _get_apps_v1_api(self):
240
  if self._apps_v1_api is not None:
241
  return self._apps_v1_api
 
18
  self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
19
  self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
20
  self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
21
+ self.remote_retry_count = int(os.getenv("ANTIATROPOS_CONTROL_RETRY_COUNT", "2"))
22
+ self.remote_retry_backoff_s = float(os.getenv("ANTIATROPOS_CONTROL_RETRY_BACKOFF_S", "0.25"))
23
  self.is_mock = (
24
  not self.remote_control_url
25
  and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
 
31
  self._apps_v1_api = None
32
  self._node_workload_map = self._load_node_workload_map()
33
  self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}
34
+ self.k8s_retry_count = int(os.getenv("ANTIATROPOS_K8S_RETRY_COUNT", "2"))
35
+ self.k8s_retry_backoff_s = float(os.getenv("ANTIATROPOS_K8S_RETRY_BACKOFF_S", "0.2"))
36
 
37
  @staticmethod
38
  def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
 
165
  f"(bounds {self.min_replicas}-{upper})"
166
  )
167
 
168
+ self._patch_deployment_scale_with_retry(
169
+ apps_v1=apps_v1,
170
+ deployment_name=deployment_name,
171
  namespace=namespace,
172
+ desired=desired,
173
  )
174
 
175
  return (
 
177
  f"in namespace {namespace} scaled {current}->{desired}"
178
  )
179
 
180
+ def _patch_deployment_scale_with_retry(self, apps_v1, deployment_name: str, namespace: str, desired: int) -> None:
181
+ """
182
+ Patch deployment replicas with retries for transient API server errors.
183
+ """
184
+ from kubernetes.client.rest import ApiException
185
+
186
+ max_attempts = max(1, self.k8s_retry_count + 1)
187
+ for attempt in range(1, max_attempts + 1):
188
+ try:
189
+ apps_v1.patch_namespaced_deployment_scale(
190
+ name=deployment_name,
191
+ namespace=namespace,
192
+ body={"spec": {"replicas": desired}},
193
+ )
194
+ return
195
+ except ApiException as exc:
196
+ retryable = exc.status in (409, 429, 500, 502, 503, 504)
197
+ if (not retryable) or attempt >= max_attempts:
198
+ raise
199
+ sleep_s = self.k8s_retry_backoff_s * (2 ** (attempt - 1))
200
+ logger.warning(
201
+ "Retrying deployment scale patch after ApiException status=%s attempt=%s/%s",
202
+ exc.status,
203
+ attempt,
204
+ max_attempts,
205
+ )
206
+ time.sleep(sleep_s)
207
+
208
  def _remote_execution(self, action: str, target: str, parameter: float) -> str:
209
  """
210
  Delegate action execution to a remote FastAPI control plane.
 
213
  - POST /step
214
  - Request: {action_type, target_node_id, parameter}
215
  - Success response includes ack_status and starts with "Ack:"
216
+
217
+ This contract matches server.local_laptop_control and is the only
218
+ supported remote control-plane format.
219
  """
220
  if not self.remote_control_url:
221
  raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
 
228
  }
229
  payload = action_payload
230
 
231
+ response = self._post_with_retry(endpoint=endpoint, payload=payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  if response.status_code >= 400:
234
  detail = ""
 
237
  detail = str(body.get("detail", body))
238
  except Exception:
239
  detail = response.text.strip()
240
+ if response.status_code == 422 and "action" in detail:
241
+ detail = (
242
+ f"{detail}. Expected lightweight control-plane contract at "
243
+ f"{endpoint}: "
244
+ '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
245
+ )
246
  raise RuntimeError(
247
  f"Remote control-plane rejected action ({response.status_code}): {detail}"
248
  )
 
258
  return f"Ack: {action} for {target} via remote control-plane ({action_id})"
259
  return ack
260
 
261
+ def _post_with_retry(self, endpoint: str, payload: dict) -> requests.Response:
262
+ """
263
+ POST helper with retries for transient HTTP/network failures.
264
+ """
265
+ max_attempts = max(1, self.remote_retry_count + 1)
266
+ last_exc: Optional[Exception] = None
267
+
268
+ for attempt in range(1, max_attempts + 1):
269
+ try:
270
+ response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
271
+ except requests.RequestException as exc:
272
+ last_exc = exc
273
+ if attempt >= max_attempts:
274
+ break
275
+ sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
276
+ logger.warning(
277
+ "Retrying remote control-plane POST after network error attempt=%s/%s: %s",
278
+ attempt,
279
+ max_attempts,
280
+ exc,
281
+ )
282
+ time.sleep(sleep_s)
283
+ continue
284
+
285
+ if response.status_code >= 500 and attempt < max_attempts:
286
+ sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
287
+ logger.warning(
288
+ "Retrying remote control-plane POST after HTTP %s attempt=%s/%s",
289
+ response.status_code,
290
+ attempt,
291
+ max_attempts,
292
+ )
293
+ time.sleep(sleep_s)
294
+ continue
295
+
296
+ return response
297
+
298
+ if last_exc is not None:
299
+ raise RuntimeError(f"Remote control-plane request failed: {last_exc}") from last_exc
300
+ raise RuntimeError("Remote control-plane request failed after retries")
301
+
302
  def _get_apps_v1_api(self):
303
  if self._apps_v1_api is not None:
304
  return self._apps_v1_api
deploy/do/README.md CHANGED
@@ -4,7 +4,12 @@ This deploy flow is for a single Ubuntu Droplet running:
4
  - k3s (single-node Kubernetes)
5
  - AntiAtropos sample workloads (`prod-sre`)
6
  - Prometheus + Grafana (`monitoring`)
7
- - FastAPI control server (`antiatropos-fastapi` systemd service)
 
 
 
 
 
8
 
9
  ## Run
10
 
@@ -17,7 +22,7 @@ sudo bash deploy/do/deploy-droplet-one-shot.sh
17
  Optional overrides:
18
 
19
  ```bash
20
- sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
21
  ```
22
 
23
  ## What the script configures
@@ -26,34 +31,62 @@ sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do
26
  - Prometheus service exposed on NodePort `30090`
27
  - Prometheus scrape job for annotated pods in namespace `prod-sre`
28
  - Env file at `.env.droplet` with:
29
- - `ANTIATROPOS_ENV_MODE=live`
30
  - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
31
  - `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
32
  - Systemd service:
33
- - Name: `antiatropos-fastapi`
34
- - Exec: `uvicorn server.app:app --host 0.0.0.0 --port 8000`
 
 
35
 
36
  ## Verify
37
 
38
  ```bash
39
- systemctl status antiatropos-fastapi --no-pager
40
- curl http://127.0.0.1:8000/config/runtime
41
  kubectl get deploy -n prod-sre
42
  kubectl get pods -n monitoring
43
  curl http://127.0.0.1:30090/api/v1/targets
44
  kubectl -n monitoring port-forward svc/grafana 3000:80
45
  ```
46
 
47
- If your local simulator FastAPI should use VM telemetry, set local `.env`:
48
 
49
  ```env
 
 
50
  PROMETHEUS_URL=http://<droplet-ip>:30090
51
  ```
52
 
53
- ## Agent call example
 
 
54
 
55
  ```bash
56
- curl -X POST http://127.0.0.1:8000/step \
 
 
 
 
 
 
57
  -H "Content-Type: application/json" \
58
- -d '{"action_type":"SCALE_UP","target_node_id":"node-3","parameter":0.6}'
59
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  - k3s (single-node Kubernetes)
5
  - AntiAtropos sample workloads (`prod-sre`)
6
  - Prometheus + Grafana (`monitoring`)
7
+ - lightweight control-plane API (`antiatropos-control` on port `8010`)
8
+
9
+ The OpenEnv runtime (`server.app`) is intentionally **not** run on the droplet.
10
+ The only supported split is:
11
+ - local machine: OpenEnv server + inference loop
12
+ - droplet: Kubernetes executor API + observability stack
13
 
14
  ## Run
15
 
 
22
  Optional overrides:
23
 
24
  ```bash
25
+ sudo REPO_DIR=/opt/AntiAtropos CONTROL_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
26
  ```
27
 
28
  ## What the script configures
 
31
  - Prometheus service exposed on NodePort `30090`
32
  - Prometheus scrape job for annotated pods in namespace `prod-sre`
33
  - Env file at `.env.droplet` with:
 
34
  - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
35
  - `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
36
  - Systemd service:
37
+ - Name: `antiatropos-control`
38
+ - Exec: `uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010`
39
+ - Legacy cleanup:
40
+ - `antiatropos-fastapi` (VM OpenEnv service) is disabled and removed by the default deploy path
41
 
42
  ## Verify
43
 
44
  ```bash
45
+ systemctl status antiatropos-control --no-pager
46
+ curl http://127.0.0.1:8010/health
47
  kubectl get deploy -n prod-sre
48
  kubectl get pods -n monitoring
49
  curl http://127.0.0.1:30090/api/v1/targets
50
  kubectl -n monitoring port-forward svc/grafana 3000:80
51
  ```
52
 
53
+ Set local `.env` to use this consolidated path:
54
 
55
  ```env
56
+ ENV_URL=http://localhost:8000
57
+ ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
58
  PROMETHEUS_URL=http://<droplet-ip>:30090
59
  ```
60
 
61
+ ## Deterministic remote-scaling proof
62
+
63
+ On the droplet, watch the desired replica counts:
64
 
65
  ```bash
66
+ watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas'
67
+ ```
68
+
69
+ From local machine, send one control action:
70
+
71
+ ```bash
72
+ curl -X POST http://<droplet-ip>:8010/step \
73
  -H "Content-Type: application/json" \
74
+ -d '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
75
  ```
76
+
77
+ If the desired replica count for `payments` increases, scaling is happening on the droplet.
78
+
79
+ ## Troubleshooting
80
+
81
+ - **Pods do not move during inference**
82
+ - Verify local env points to droplet control API:
83
+ - `ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010`
84
+ - Check droplet control health:
85
+ - `curl http://127.0.0.1:8010/health`
86
+ - Check service status:
87
+ - `systemctl status antiatropos-control --no-pager`
88
+ - **Connection refused from local to droplet:8010**
89
+ - The control service is not running, or the firewall is blocking port 8010.
90
+ - Start the service and open port 8010 in the firewall if needed.
91
+ - **Need to remove legacy VM OpenEnv service**
92
+ - `sudo bash deploy/do/uninstall-legacy-openenv.sh`
deploy/do/antiatropos-control.service ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Unit]
2
+ Description=AntiAtropos Droplet Control API
3
+ After=network-online.target k3s.service
4
+ Wants=network-online.target
5
+
6
+ [Service]
7
+ Type=simple
8
+ User=root
9
+ WorkingDirectory=/root/Anti-Atropos
10
+ EnvironmentFile=/root/Anti-Atropos/.env.droplet
11
+ ExecStart=/root/Anti-Atropos/.venv-droplet/bin/uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
12
+ Restart=always
13
+ RestartSec=3
14
+
15
+ [Install]
16
+ WantedBy=multi-user.target
deploy/do/deploy-droplet-one-shot.sh CHANGED
@@ -5,7 +5,7 @@ set -euo pipefail
5
  # - Installs k3s with kubelet max-pods=250
6
  # - Deploys workloads + Prometheus + Grafana
7
  # - Creates env file for live Kubernetes scaling
8
- # - Starts FastAPI server via systemd (antiatropos-fastapi)
9
 
10
  if [[ "${EUID}" -ne 0 ]]; then
11
  echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
@@ -14,8 +14,8 @@ fi
14
 
15
  REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
16
  KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
17
- FASTAPI_PORT="${FASTAPI_PORT:-8000}"
18
- FASTAPI_HOST="${FASTAPI_HOST:-0.0.0.0}"
19
  K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
20
  MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
21
  PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
@@ -28,7 +28,7 @@ WORKLOAD_MAP="${WORKLOAD_MAP:-{\"node-0\":{\"deployment\":\"payments\",\"namespa
28
  echo "=== AntiAtropos Droplet One-Shot Deploy ==="
29
  echo "Repo: ${REPO_DIR}"
30
  echo "Kubeconfig: ${KUBECONFIG_PATH}"
31
- echo "FastAPI: ${FASTAPI_HOST}:${FASTAPI_PORT}"
32
  echo ""
33
 
34
  if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
@@ -85,7 +85,6 @@ helm upgrade --install grafana grafana/grafana \
85
 
86
  if [[ ! -f "${ENV_FILE}" ]]; then
87
  cat > "${ENV_FILE}" <<EOF
88
- ANTIATROPOS_ENV_MODE=live
89
  KUBECONFIG=/etc/rancher/k3s/k3s.yaml
90
  ANTIATROPOS_K8S_NAMESPACE=prod-sre
91
  ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
@@ -108,9 +107,16 @@ else
108
  "${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
109
  fi
110
 
111
- cat > /etc/systemd/system/antiatropos-fastapi.service <<EOF
 
 
 
 
 
 
 
112
  [Unit]
113
- Description=AntiAtropos FastAPI Server
114
  After=network-online.target k3s.service
115
  Wants=network-online.target
116
 
@@ -119,7 +125,7 @@ Type=simple
119
  User=root
120
  WorkingDirectory=${REPO_DIR}
121
  EnvironmentFile=${ENV_FILE}
122
- ExecStart=${PY_VENV_DIR}/bin/uvicorn server.app:app --host ${FASTAPI_HOST} --port ${FASTAPI_PORT}
123
  Restart=always
124
  RestartSec=3
125
 
@@ -128,12 +134,12 @@ WantedBy=multi-user.target
128
  EOF
129
 
130
  systemctl daemon-reload
131
- systemctl enable --now antiatropos-fastapi
132
 
133
  echo ""
134
- echo "Waiting for app readiness..."
135
  for _ in {1..30}; do
136
- if curl -fsS "http://127.0.0.1:${FASTAPI_PORT}/config/runtime" >/dev/null 2>&1; then
137
  break
138
  fi
139
  sleep 2
@@ -147,15 +153,18 @@ PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
147
 
148
  echo ""
149
  echo "=== Deploy Complete ==="
150
- echo "FastAPI runtime: http://127.0.0.1:${FASTAPI_PORT}/config/runtime"
151
- echo "FastAPI health: http://127.0.0.1:${FASTAPI_PORT}/state"
152
  echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
153
  echo "Prometheus URL: ${PROM_URL_DISPLAY}"
154
  echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
155
  echo ""
156
  echo "Service status command:"
157
- echo " systemctl status antiatropos-fastapi --no-pager"
158
  echo ""
159
- echo "If needed, edit env and restart:"
160
  echo " ${ENV_FILE}"
161
- echo " systemctl restart antiatropos-fastapi"
 
 
 
 
5
  # - Installs k3s with kubelet max-pods=250
6
  # - Deploys workloads + Prometheus + Grafana
7
  # - Creates env file for live Kubernetes scaling
8
+ # - Starts lightweight control-plane API via systemd (antiatropos-control)
9
 
10
  if [[ "${EUID}" -ne 0 ]]; then
11
  echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
 
14
 
15
  REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
16
  KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
17
+ CONTROL_PORT="${CONTROL_PORT:-8010}"
18
+ CONTROL_HOST="${CONTROL_HOST:-0.0.0.0}"
19
  K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
20
  MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
21
  PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
 
28
  echo "=== AntiAtropos Droplet One-Shot Deploy ==="
29
  echo "Repo: ${REPO_DIR}"
30
  echo "Kubeconfig: ${KUBECONFIG_PATH}"
31
+ echo "Control API: ${CONTROL_HOST}:${CONTROL_PORT}"
32
  echo ""
33
 
34
  if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
 
85
 
86
  if [[ ! -f "${ENV_FILE}" ]]; then
87
  cat > "${ENV_FILE}" <<EOF
 
88
  KUBECONFIG=/etc/rancher/k3s/k3s.yaml
89
  ANTIATROPOS_K8S_NAMESPACE=prod-sre
90
  ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
 
107
  "${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
108
  fi
109
 
110
+ # Hard cleanup: remove legacy VM OpenEnv service if it exists.
111
+ if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
112
+ echo "Disabling legacy service antiatropos-fastapi..."
113
+ systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
114
+ rm -f /etc/systemd/system/antiatropos-fastapi.service
115
+ fi
116
+
117
+ cat > /etc/systemd/system/antiatropos-control.service <<EOF
118
  [Unit]
119
+ Description=AntiAtropos Droplet Control API
120
  After=network-online.target k3s.service
121
  Wants=network-online.target
122
 
 
125
  User=root
126
  WorkingDirectory=${REPO_DIR}
127
  EnvironmentFile=${ENV_FILE}
128
+ ExecStart=${PY_VENV_DIR}/bin/uvicorn server.local_laptop_control:app --host ${CONTROL_HOST} --port ${CONTROL_PORT}
129
  Restart=always
130
  RestartSec=3
131
 
 
134
  EOF
135
 
136
  systemctl daemon-reload
137
+ systemctl enable --now antiatropos-control
138
 
139
  echo ""
140
+ echo "Waiting for control API readiness..."
141
  for _ in {1..30}; do
142
+ if curl -fsS "http://127.0.0.1:${CONTROL_PORT}/health" >/dev/null 2>&1; then
143
  break
144
  fi
145
  sleep 2
 
153
 
154
  echo ""
155
  echo "=== Deploy Complete ==="
156
+ echo "Control health: http://127.0.0.1:${CONTROL_PORT}/health"
157
+ echo "Control step: http://127.0.0.1:${CONTROL_PORT}/step"
158
  echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
159
  echo "Prometheus URL: ${PROM_URL_DISPLAY}"
160
  echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
161
  echo ""
162
  echo "Service status command:"
163
+ echo " systemctl status antiatropos-control --no-pager"
164
  echo ""
165
+ echo "If needed, edit env and restart control service:"
166
  echo " ${ENV_FILE}"
167
+ echo " systemctl restart antiatropos-control"
168
+ echo ""
169
+ echo "Verify remote scaling path:"
170
+ echo " watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas'"
deploy/do/uninstall-legacy-openenv.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # Removes legacy VM OpenEnv service path.
5
+ # This keeps droplet runtime focused on control API + observability only.
6
+
7
+ if [[ "${EUID}" -ne 0 ]]; then
8
+ echo "Run as root: sudo bash deploy/do/uninstall-legacy-openenv.sh"
9
+ exit 1
10
+ fi
11
+
12
+ if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
13
+ echo "Stopping and disabling antiatropos-fastapi..."
14
+ systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
15
+ else
16
+ echo "antiatropos-fastapi service not registered."
17
+ fi
18
+
19
+ if [[ -f /etc/systemd/system/antiatropos-fastapi.service ]]; then
20
+ rm -f /etc/systemd/system/antiatropos-fastapi.service
21
+ echo "Removed /etc/systemd/system/antiatropos-fastapi.service"
22
+ fi
23
+
24
+ systemctl daemon-reload
25
+ echo "Legacy VM OpenEnv service cleanup complete."
deploy/prometheus-helm-values.yaml CHANGED
@@ -27,3 +27,4 @@ extraScrapeConfigs: |
27
  regex: ([^:]+)(?::\d+)?;(\d+)
28
  replacement: $1:$2
29
  target_label: __address__
 
 
27
  regex: ([^:]+)(?::\d+)?;(\d+)
28
  replacement: $1:$2
29
  target_label: __address__
30
+