div18 committed on
Commit ·
504ced2
1
Parent(s): af7de22
consolidate
Browse files- .env.example +5 -2
- control/kubernetes_executor.py +87 -24
- deploy/do/README.md +44 -11
- deploy/do/antiatropos-control.service +16 -0
- deploy/do/deploy-droplet-one-shot.sh +25 -16
- deploy/do/uninstall-legacy-openenv.sh +25 -0
- deploy/prometheus-helm-values.yaml +1 -0
.env.example
CHANGED
|
@@ -7,14 +7,16 @@ ANTIATROPOS_ENV_MODE=live
|
|
| 7 |
ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
|
| 8 |
|
| 9 |
# Prometheus endpoint used by local simulator FastAPI.
|
| 10 |
-
#
|
| 11 |
-
PROMETHEUS_URL=http://
|
| 12 |
ANTIATROPOS_PROM_TIMEOUT_S=5.0
|
| 13 |
ANTIATROPOS_STRICT_REAL=false
|
| 14 |
ANTIATROPOS_METRIC_AGGREGATION=sum
|
| 15 |
|
| 16 |
# Kubernetes execution settings
|
| 17 |
KUBECONFIG=C:/Users/your-user/.kube/config
|
|
|
|
|
|
|
| 18 |
ANTIATROPOS_K8S_NAMESPACE=prod-sre
|
| 19 |
ANTIATROPOS_MIN_REPLICAS=1
|
| 20 |
ANTIATROPOS_MAX_REPLICAS=
|
|
@@ -31,6 +33,7 @@ ANTIATROPOS_GRAFANA_MODE=local
|
|
| 31 |
# If GROQ_API_KEY is set and API_BASE_URL is not set, inference.py auto-uses Groq.
|
| 32 |
GROQ_API_KEY=
|
| 33 |
MODEL_NAME=llama-3.1-8b-instant
|
|
|
|
| 34 |
ENV_URL=http://localhost:8000
|
| 35 |
ANTIATROPOS_MODE=live
|
| 36 |
ANTIATROPOS_LABEL_NODE_MAP={"payments":"node-0","checkout":"node-1","catalog":"node-2","cart":"node-3","auth":"node-4"}
|
|
|
|
| 7 |
ANTIATROPOS_REWARD_OUTPUT_MODE=normalized
|
| 8 |
|
| 9 |
# Prometheus endpoint used by local simulator FastAPI.
|
| 10 |
+
# Consolidated path: droplet Prometheus NodePort.
|
| 11 |
+
PROMETHEUS_URL=http://<droplet-ip>:30090
|
| 12 |
ANTIATROPOS_PROM_TIMEOUT_S=5.0
|
| 13 |
ANTIATROPOS_STRICT_REAL=false
|
| 14 |
ANTIATROPOS_METRIC_AGGREGATION=sum
|
| 15 |
|
| 16 |
# Kubernetes execution settings
|
| 17 |
KUBECONFIG=C:/Users/your-user/.kube/config
|
| 18 |
+
ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
|
| 19 |
+
ANTIATROPOS_CONTROL_TIMEOUT_S=8.0
|
| 20 |
ANTIATROPOS_K8S_NAMESPACE=prod-sre
|
| 21 |
ANTIATROPOS_MIN_REPLICAS=1
|
| 22 |
ANTIATROPOS_MAX_REPLICAS=
|
|
|
|
| 33 |
# If GROQ_API_KEY is set and API_BASE_URL is not set, inference.py auto-uses Groq.
|
| 34 |
GROQ_API_KEY=
|
| 35 |
MODEL_NAME=llama-3.1-8b-instant
|
| 36 |
+
# Local OpenEnv runtime remains authoritative.
|
| 37 |
ENV_URL=http://localhost:8000
|
| 38 |
ANTIATROPOS_MODE=live
|
| 39 |
ANTIATROPOS_LABEL_NODE_MAP={"payments":"node-0","checkout":"node-1","catalog":"node-2","cart":"node-3","auth":"node-4"}
|
control/kubernetes_executor.py
CHANGED
|
@@ -18,6 +18,8 @@ class KubernetesExecutor:
|
|
| 18 |
self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
|
| 19 |
self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
|
| 20 |
self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
|
|
|
|
|
|
|
| 21 |
self.is_mock = (
|
| 22 |
not self.remote_control_url
|
| 23 |
and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
|
|
@@ -29,6 +31,8 @@ class KubernetesExecutor:
|
|
| 29 |
self._apps_v1_api = None
|
| 30 |
self._node_workload_map = self._load_node_workload_map()
|
| 31 |
self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}
|
|
|
|
|
|
|
| 32 |
|
| 33 |
@staticmethod
|
| 34 |
def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
|
|
@@ -161,10 +165,11 @@ class KubernetesExecutor:
|
|
| 161 |
f"(bounds {self.min_replicas}-{upper})"
|
| 162 |
)
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
|
|
|
| 166 |
namespace=namespace,
|
| 167 |
-
|
| 168 |
)
|
| 169 |
|
| 170 |
return (
|
|
@@ -172,6 +177,34 @@ class KubernetesExecutor:
|
|
| 172 |
f"in namespace {namespace} scaled {current}->{desired}"
|
| 173 |
)
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
def _remote_execution(self, action: str, target: str, parameter: float) -> str:
|
| 176 |
"""
|
| 177 |
Delegate action execution to a remote FastAPI control plane.
|
|
@@ -180,6 +213,9 @@ class KubernetesExecutor:
|
|
| 180 |
- POST /step
|
| 181 |
- Request: {action_type, target_node_id, parameter}
|
| 182 |
- Success response includes ack_status and starts with "Ack:"
|
|
|
|
|
|
|
|
|
|
| 183 |
"""
|
| 184 |
if not self.remote_control_url:
|
| 185 |
raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
|
|
@@ -192,27 +228,7 @@ class KubernetesExecutor:
|
|
| 192 |
}
|
| 193 |
payload = action_payload
|
| 194 |
|
| 195 |
-
|
| 196 |
-
response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
|
| 197 |
-
except requests.RequestException as exc:
|
| 198 |
-
raise RuntimeError(f"Remote control-plane request failed: {exc}") from exc
|
| 199 |
-
|
| 200 |
-
if response.status_code == 422:
|
| 201 |
-
# OpenEnv server.app expects {"action": {...}} shape on /step.
|
| 202 |
-
try:
|
| 203 |
-
body = response.json()
|
| 204 |
-
detail = str(body.get("detail", body))
|
| 205 |
-
except Exception:
|
| 206 |
-
detail = response.text.strip()
|
| 207 |
-
if "body" in detail and "action" in detail:
|
| 208 |
-
try:
|
| 209 |
-
response = requests.post(
|
| 210 |
-
endpoint,
|
| 211 |
-
json={"action": action_payload},
|
| 212 |
-
timeout=self.remote_timeout_s,
|
| 213 |
-
)
|
| 214 |
-
except requests.RequestException as exc:
|
| 215 |
-
raise RuntimeError(f"Remote control-plane retry failed: {exc}") from exc
|
| 216 |
|
| 217 |
if response.status_code >= 400:
|
| 218 |
detail = ""
|
|
@@ -221,6 +237,12 @@ class KubernetesExecutor:
|
|
| 221 |
detail = str(body.get("detail", body))
|
| 222 |
except Exception:
|
| 223 |
detail = response.text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
raise RuntimeError(
|
| 225 |
f"Remote control-plane rejected action ({response.status_code}): {detail}"
|
| 226 |
)
|
|
@@ -236,6 +258,47 @@ class KubernetesExecutor:
|
|
| 236 |
return f"Ack: {action} for {target} via remote control-plane ({action_id})"
|
| 237 |
return ack
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
def _get_apps_v1_api(self):
|
| 240 |
if self._apps_v1_api is not None:
|
| 241 |
return self._apps_v1_api
|
|
|
|
| 18 |
self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
|
| 19 |
self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
|
| 20 |
self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
|
| 21 |
+
self.remote_retry_count = int(os.getenv("ANTIATROPOS_CONTROL_RETRY_COUNT", "2"))
|
| 22 |
+
self.remote_retry_backoff_s = float(os.getenv("ANTIATROPOS_CONTROL_RETRY_BACKOFF_S", "0.25"))
|
| 23 |
self.is_mock = (
|
| 24 |
not self.remote_control_url
|
| 25 |
and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
|
|
|
|
| 31 |
self._apps_v1_api = None
|
| 32 |
self._node_workload_map = self._load_node_workload_map()
|
| 33 |
self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}
|
| 34 |
+
self.k8s_retry_count = int(os.getenv("ANTIATROPOS_K8S_RETRY_COUNT", "2"))
|
| 35 |
+
self.k8s_retry_backoff_s = float(os.getenv("ANTIATROPOS_K8S_RETRY_BACKOFF_S", "0.2"))
|
| 36 |
|
| 37 |
@staticmethod
|
| 38 |
def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
|
|
|
|
| 165 |
f"(bounds {self.min_replicas}-{upper})"
|
| 166 |
)
|
| 167 |
|
| 168 |
+
self._patch_deployment_scale_with_retry(
|
| 169 |
+
apps_v1=apps_v1,
|
| 170 |
+
deployment_name=deployment_name,
|
| 171 |
namespace=namespace,
|
| 172 |
+
desired=desired,
|
| 173 |
)
|
| 174 |
|
| 175 |
return (
|
|
|
|
| 177 |
f"in namespace {namespace} scaled {current}->{desired}"
|
| 178 |
)
|
| 179 |
|
| 180 |
+
def _patch_deployment_scale_with_retry(self, apps_v1, deployment_name: str, namespace: str, desired: int) -> None:
|
| 181 |
+
"""
|
| 182 |
+
Patch deployment replicas with retries for transient API server errors.
|
| 183 |
+
"""
|
| 184 |
+
from kubernetes.client.rest import ApiException
|
| 185 |
+
|
| 186 |
+
max_attempts = max(1, self.k8s_retry_count + 1)
|
| 187 |
+
for attempt in range(1, max_attempts + 1):
|
| 188 |
+
try:
|
| 189 |
+
apps_v1.patch_namespaced_deployment_scale(
|
| 190 |
+
name=deployment_name,
|
| 191 |
+
namespace=namespace,
|
| 192 |
+
body={"spec": {"replicas": desired}},
|
| 193 |
+
)
|
| 194 |
+
return
|
| 195 |
+
except ApiException as exc:
|
| 196 |
+
retryable = exc.status in (409, 429, 500, 502, 503, 504)
|
| 197 |
+
if (not retryable) or attempt >= max_attempts:
|
| 198 |
+
raise
|
| 199 |
+
sleep_s = self.k8s_retry_backoff_s * (2 ** (attempt - 1))
|
| 200 |
+
logger.warning(
|
| 201 |
+
"Retrying deployment scale patch after ApiException status=%s attempt=%s/%s",
|
| 202 |
+
exc.status,
|
| 203 |
+
attempt,
|
| 204 |
+
max_attempts,
|
| 205 |
+
)
|
| 206 |
+
time.sleep(sleep_s)
|
| 207 |
+
|
| 208 |
def _remote_execution(self, action: str, target: str, parameter: float) -> str:
|
| 209 |
"""
|
| 210 |
Delegate action execution to a remote FastAPI control plane.
|
|
|
|
| 213 |
- POST /step
|
| 214 |
- Request: {action_type, target_node_id, parameter}
|
| 215 |
- Success response includes ack_status and starts with "Ack:"
|
| 216 |
+
|
| 217 |
+
This contract matches server.local_laptop_control and is the only
|
| 218 |
+
supported remote control-plane format.
|
| 219 |
"""
|
| 220 |
if not self.remote_control_url:
|
| 221 |
raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
|
|
|
|
| 228 |
}
|
| 229 |
payload = action_payload
|
| 230 |
|
| 231 |
+
response = self._post_with_retry(endpoint=endpoint, payload=payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
if response.status_code >= 400:
|
| 234 |
detail = ""
|
|
|
|
| 237 |
detail = str(body.get("detail", body))
|
| 238 |
except Exception:
|
| 239 |
detail = response.text.strip()
|
| 240 |
+
if response.status_code == 422 and "action" in detail:
|
| 241 |
+
detail = (
|
| 242 |
+
f"{detail}. Expected lightweight control-plane contract at "
|
| 243 |
+
f"{endpoint}: "
|
| 244 |
+
'{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
|
| 245 |
+
)
|
| 246 |
raise RuntimeError(
|
| 247 |
f"Remote control-plane rejected action ({response.status_code}): {detail}"
|
| 248 |
)
|
|
|
|
| 258 |
return f"Ack: {action} for {target} via remote control-plane ({action_id})"
|
| 259 |
return ack
|
| 260 |
|
| 261 |
+
def _post_with_retry(self, endpoint: str, payload: dict) -> requests.Response:
|
| 262 |
+
"""
|
| 263 |
+
POST helper with retries for transient HTTP/network failures.
|
| 264 |
+
"""
|
| 265 |
+
max_attempts = max(1, self.remote_retry_count + 1)
|
| 266 |
+
last_exc: Optional[Exception] = None
|
| 267 |
+
|
| 268 |
+
for attempt in range(1, max_attempts + 1):
|
| 269 |
+
try:
|
| 270 |
+
response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
|
| 271 |
+
except requests.RequestException as exc:
|
| 272 |
+
last_exc = exc
|
| 273 |
+
if attempt >= max_attempts:
|
| 274 |
+
break
|
| 275 |
+
sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
|
| 276 |
+
logger.warning(
|
| 277 |
+
"Retrying remote control-plane POST after network error attempt=%s/%s: %s",
|
| 278 |
+
attempt,
|
| 279 |
+
max_attempts,
|
| 280 |
+
exc,
|
| 281 |
+
)
|
| 282 |
+
time.sleep(sleep_s)
|
| 283 |
+
continue
|
| 284 |
+
|
| 285 |
+
if response.status_code >= 500 and attempt < max_attempts:
|
| 286 |
+
sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
|
| 287 |
+
logger.warning(
|
| 288 |
+
"Retrying remote control-plane POST after HTTP %s attempt=%s/%s",
|
| 289 |
+
response.status_code,
|
| 290 |
+
attempt,
|
| 291 |
+
max_attempts,
|
| 292 |
+
)
|
| 293 |
+
time.sleep(sleep_s)
|
| 294 |
+
continue
|
| 295 |
+
|
| 296 |
+
return response
|
| 297 |
+
|
| 298 |
+
if last_exc is not None:
|
| 299 |
+
raise RuntimeError(f"Remote control-plane request failed: {last_exc}") from last_exc
|
| 300 |
+
raise RuntimeError("Remote control-plane request failed after retries")
|
| 301 |
+
|
| 302 |
def _get_apps_v1_api(self):
|
| 303 |
if self._apps_v1_api is not None:
|
| 304 |
return self._apps_v1_api
|
deploy/do/README.md
CHANGED
|
@@ -4,7 +4,12 @@ This deploy flow is for a single Ubuntu Droplet running:
|
|
| 4 |
- k3s (single-node Kubernetes)
|
| 5 |
- AntiAtropos sample workloads (`prod-sre`)
|
| 6 |
- Prometheus + Grafana (`monitoring`)
|
| 7 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
## Run
|
| 10 |
|
|
@@ -17,7 +22,7 @@ sudo bash deploy/do/deploy-droplet-one-shot.sh
|
|
| 17 |
Optional overrides:
|
| 18 |
|
| 19 |
```bash
|
| 20 |
-
sudo REPO_DIR=/opt/AntiAtropos
|
| 21 |
```
|
| 22 |
|
| 23 |
## What the script configures
|
|
@@ -26,34 +31,62 @@ sudo REPO_DIR=/opt/AntiAtropos FASTAPI_PORT=8010 MAX_REPLICAS=200 bash deploy/do
|
|
| 26 |
- Prometheus service exposed on NodePort `30090`
|
| 27 |
- Prometheus scrape job for annotated pods in namespace `prod-sre`
|
| 28 |
- Env file at `.env.droplet` with:
|
| 29 |
-
- `ANTIATROPOS_ENV_MODE=live`
|
| 30 |
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
| 31 |
- `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
|
| 32 |
- Systemd service:
|
| 33 |
-
- Name: `antiatropos-
|
| 34 |
-
- Exec: `uvicorn server.
|
|
|
|
|
|
|
| 35 |
|
| 36 |
## Verify
|
| 37 |
|
| 38 |
```bash
|
| 39 |
-
systemctl status antiatropos-
|
| 40 |
-
curl http://127.0.0.1:
|
| 41 |
kubectl get deploy -n prod-sre
|
| 42 |
kubectl get pods -n monitoring
|
| 43 |
curl http://127.0.0.1:30090/api/v1/targets
|
| 44 |
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 45 |
```
|
| 46 |
|
| 47 |
-
|
| 48 |
|
| 49 |
```env
|
|
|
|
|
|
|
| 50 |
PROMETHEUS_URL=http://<droplet-ip>:30090
|
| 51 |
```
|
| 52 |
|
| 53 |
-
##
|
|
|
|
|
|
|
| 54 |
|
| 55 |
```bash
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
-H "Content-Type: application/json" \
|
| 58 |
-
-d '{"action_type":"SCALE_UP","target_node_id":"node-
|
| 59 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
- k3s (single-node Kubernetes)
|
| 5 |
- AntiAtropos sample workloads (`prod-sre`)
|
| 6 |
- Prometheus + Grafana (`monitoring`)
|
| 7 |
+
- lightweight control-plane API (`antiatropos-control` on port `8010`)
|
| 8 |
+
|
| 9 |
+
The OpenEnv runtime (`server.app`) is intentionally **not** run on the droplet.
|
| 10 |
+
The only supported split is:
|
| 11 |
+
- local machine: OpenEnv server + inference loop
|
| 12 |
+
- droplet: Kubernetes executor API + observability stack
|
| 13 |
|
| 14 |
## Run
|
| 15 |
|
|
|
|
| 22 |
Optional overrides:
|
| 23 |
|
| 24 |
```bash
|
| 25 |
+
sudo REPO_DIR=/opt/AntiAtropos CONTROL_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
|
| 26 |
```
|
| 27 |
|
| 28 |
## What the script configures
|
|
|
|
| 31 |
- Prometheus service exposed on NodePort `30090`
|
| 32 |
- Prometheus scrape job for annotated pods in namespace `prod-sre`
|
| 33 |
- Env file at `.env.droplet` with:
|
|
|
|
| 34 |
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
| 35 |
- `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
|
| 36 |
- Systemd service:
|
| 37 |
+
- Name: `antiatropos-control`
|
| 38 |
+
- Exec: `uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010`
|
| 39 |
+
- Legacy cleanup:
|
| 40 |
+
- `antiatropos-fastapi` (the legacy VM OpenEnv service) is disabled and its unit file removed by the default deploy path
|
| 41 |
|
| 42 |
## Verify
|
| 43 |
|
| 44 |
```bash
|
| 45 |
+
systemctl status antiatropos-control --no-pager
|
| 46 |
+
curl http://127.0.0.1:8010/health
|
| 47 |
kubectl get deploy -n prod-sre
|
| 48 |
kubectl get pods -n monitoring
|
| 49 |
curl http://127.0.0.1:30090/api/v1/targets
|
| 50 |
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 51 |
```
|
| 52 |
|
| 53 |
+
Set local `.env` to use this consolidated path:
|
| 54 |
|
| 55 |
```env
|
| 56 |
+
ENV_URL=http://localhost:8000
|
| 57 |
+
ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
|
| 58 |
PROMETHEUS_URL=http://<droplet-ip>:30090
|
| 59 |
```
|
| 60 |
|
| 61 |
+
## Deterministic remote-scaling proof
|
| 62 |
+
|
| 63 |
+
On the droplet, watch the desired replica counts:
|
| 64 |
|
| 65 |
```bash
|
| 66 |
+
watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas'
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
From local machine, send one control action:
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
curl -X POST http://<droplet-ip>:8010/step \
|
| 73 |
-H "Content-Type: application/json" \
|
| 74 |
+
-d '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
|
| 75 |
```
|
| 76 |
+
|
| 77 |
+
If the desired replica count of the `payments` deployment increases, scaling is happening on the droplet.
|
| 78 |
+
|
| 79 |
+
## Troubleshooting
|
| 80 |
+
|
| 81 |
+
- **Pods do not move during inference**
|
| 82 |
+
- Verify local env points to droplet control API:
|
| 83 |
+
- `ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010`
|
| 84 |
+
- Check droplet control health:
|
| 85 |
+
- `curl http://127.0.0.1:8010/health`
|
| 86 |
+
- Check service status:
|
| 87 |
+
- `systemctl status antiatropos-control --no-pager`
|
| 88 |
+
- **Connection refused from local to droplet:8010**
|
| 89 |
+
- The control service is not running, or a firewall is blocking port 8010.
|
| 90 |
+
- Start the service and open port 8010 in the firewall if needed.
|
| 91 |
+
- **Need to remove legacy VM OpenEnv service**
|
| 92 |
+
- `sudo bash deploy/do/uninstall-legacy-openenv.sh`
|
deploy/do/antiatropos-control.service
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Unit]
|
| 2 |
+
Description=AntiAtropos Droplet Control API
|
| 3 |
+
After=network-online.target k3s.service
|
| 4 |
+
Wants=network-online.target
|
| 5 |
+
|
| 6 |
+
[Service]
|
| 7 |
+
Type=simple
|
| 8 |
+
User=root
|
| 9 |
+
WorkingDirectory=/root/Anti-Atropos
|
| 10 |
+
EnvironmentFile=/root/Anti-Atropos/.env.droplet
|
| 11 |
+
ExecStart=/root/Anti-Atropos/.venv-droplet/bin/uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
|
| 12 |
+
Restart=always
|
| 13 |
+
RestartSec=3
|
| 14 |
+
|
| 15 |
+
[Install]
|
| 16 |
+
WantedBy=multi-user.target
|
deploy/do/deploy-droplet-one-shot.sh
CHANGED
|
@@ -5,7 +5,7 @@ set -euo pipefail
|
|
| 5 |
# - Installs k3s with kubelet max-pods=250
|
| 6 |
# - Deploys workloads + Prometheus + Grafana
|
| 7 |
# - Creates env file for live Kubernetes scaling
|
| 8 |
-
# - Starts
|
| 9 |
|
| 10 |
if [[ "${EUID}" -ne 0 ]]; then
|
| 11 |
echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
|
|
@@ -14,8 +14,8 @@ fi
|
|
| 14 |
|
| 15 |
REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
|
| 16 |
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
|
| 17 |
-
|
| 18 |
-
|
| 19 |
K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
|
| 20 |
MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
|
| 21 |
PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
|
|
@@ -28,7 +28,7 @@ WORKLOAD_MAP="${WORKLOAD_MAP:-{\"node-0\":{\"deployment\":\"payments\",\"namespa
|
|
| 28 |
echo "=== AntiAtropos Droplet One-Shot Deploy ==="
|
| 29 |
echo "Repo: ${REPO_DIR}"
|
| 30 |
echo "Kubeconfig: ${KUBECONFIG_PATH}"
|
| 31 |
-
echo "
|
| 32 |
echo ""
|
| 33 |
|
| 34 |
if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
|
|
@@ -85,7 +85,6 @@ helm upgrade --install grafana grafana/grafana \
|
|
| 85 |
|
| 86 |
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 87 |
cat > "${ENV_FILE}" <<EOF
|
| 88 |
-
ANTIATROPOS_ENV_MODE=live
|
| 89 |
KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
| 90 |
ANTIATROPOS_K8S_NAMESPACE=prod-sre
|
| 91 |
ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
|
|
@@ -108,9 +107,16 @@ else
|
|
| 108 |
"${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
|
| 109 |
fi
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
[Unit]
|
| 113 |
-
Description=AntiAtropos
|
| 114 |
After=network-online.target k3s.service
|
| 115 |
Wants=network-online.target
|
| 116 |
|
|
@@ -119,7 +125,7 @@ Type=simple
|
|
| 119 |
User=root
|
| 120 |
WorkingDirectory=${REPO_DIR}
|
| 121 |
EnvironmentFile=${ENV_FILE}
|
| 122 |
-
ExecStart=${PY_VENV_DIR}/bin/uvicorn server.
|
| 123 |
Restart=always
|
| 124 |
RestartSec=3
|
| 125 |
|
|
@@ -128,12 +134,12 @@ WantedBy=multi-user.target
|
|
| 128 |
EOF
|
| 129 |
|
| 130 |
systemctl daemon-reload
|
| 131 |
-
systemctl enable --now antiatropos-
|
| 132 |
|
| 133 |
echo ""
|
| 134 |
-
echo "Waiting for
|
| 135 |
for _ in {1..30}; do
|
| 136 |
-
if curl -fsS "http://127.0.0.1:${
|
| 137 |
break
|
| 138 |
fi
|
| 139 |
sleep 2
|
|
@@ -147,15 +153,18 @@ PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
|
|
| 147 |
|
| 148 |
echo ""
|
| 149 |
echo "=== Deploy Complete ==="
|
| 150 |
-
echo "
|
| 151 |
-
echo "
|
| 152 |
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
| 153 |
echo "Prometheus URL: ${PROM_URL_DISPLAY}"
|
| 154 |
echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
|
| 155 |
echo ""
|
| 156 |
echo "Service status command:"
|
| 157 |
-
echo " systemctl status antiatropos-
|
| 158 |
echo ""
|
| 159 |
-
echo "If needed, edit env and restart:"
|
| 160 |
echo " ${ENV_FILE}"
|
| 161 |
-
echo " systemctl restart antiatropos-
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
# - Installs k3s with kubelet max-pods=250
|
| 6 |
# - Deploys workloads + Prometheus + Grafana
|
| 7 |
# - Creates env file for live Kubernetes scaling
|
| 8 |
+
# - Starts lightweight control-plane API via systemd (antiatropos-control)
|
| 9 |
|
| 10 |
if [[ "${EUID}" -ne 0 ]]; then
|
| 11 |
echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
|
|
|
|
| 14 |
|
| 15 |
REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
|
| 16 |
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
|
| 17 |
+
CONTROL_PORT="${CONTROL_PORT:-8010}"
|
| 18 |
+
CONTROL_HOST="${CONTROL_HOST:-0.0.0.0}"
|
| 19 |
K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
|
| 20 |
MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
|
| 21 |
PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
|
|
|
|
| 28 |
echo "=== AntiAtropos Droplet One-Shot Deploy ==="
|
| 29 |
echo "Repo: ${REPO_DIR}"
|
| 30 |
echo "Kubeconfig: ${KUBECONFIG_PATH}"
|
| 31 |
+
echo "Control API: ${CONTROL_HOST}:${CONTROL_PORT}"
|
| 32 |
echo ""
|
| 33 |
|
| 34 |
if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
|
|
|
|
| 85 |
|
| 86 |
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 87 |
cat > "${ENV_FILE}" <<EOF
|
|
|
|
| 88 |
KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
| 89 |
ANTIATROPOS_K8S_NAMESPACE=prod-sre
|
| 90 |
ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
|
|
|
|
| 107 |
"${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
|
| 108 |
fi
|
| 109 |
|
| 110 |
+
# Hard cleanup: remove legacy VM OpenEnv service if it exists.
|
| 111 |
+
if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
|
| 112 |
+
echo "Disabling legacy service antiatropos-fastapi..."
|
| 113 |
+
systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
|
| 114 |
+
rm -f /etc/systemd/system/antiatropos-fastapi.service
|
| 115 |
+
fi
|
| 116 |
+
|
| 117 |
+
cat > /etc/systemd/system/antiatropos-control.service <<EOF
|
| 118 |
[Unit]
|
| 119 |
+
Description=AntiAtropos Droplet Control API
|
| 120 |
After=network-online.target k3s.service
|
| 121 |
Wants=network-online.target
|
| 122 |
|
|
|
|
| 125 |
User=root
|
| 126 |
WorkingDirectory=${REPO_DIR}
|
| 127 |
EnvironmentFile=${ENV_FILE}
|
| 128 |
+
ExecStart=${PY_VENV_DIR}/bin/uvicorn server.local_laptop_control:app --host ${CONTROL_HOST} --port ${CONTROL_PORT}
|
| 129 |
Restart=always
|
| 130 |
RestartSec=3
|
| 131 |
|
|
|
|
| 134 |
EOF
|
| 135 |
|
| 136 |
systemctl daemon-reload
|
| 137 |
+
systemctl enable --now antiatropos-control
|
| 138 |
|
| 139 |
echo ""
|
| 140 |
+
echo "Waiting for control API readiness..."
|
| 141 |
for _ in {1..30}; do
|
| 142 |
+
if curl -fsS "http://127.0.0.1:${CONTROL_PORT}/health" >/dev/null 2>&1; then
|
| 143 |
break
|
| 144 |
fi
|
| 145 |
sleep 2
|
|
|
|
| 153 |
|
| 154 |
echo ""
|
| 155 |
echo "=== Deploy Complete ==="
|
| 156 |
+
echo "Control health: http://127.0.0.1:${CONTROL_PORT}/health"
|
| 157 |
+
echo "Control step: http://127.0.0.1:${CONTROL_PORT}/step"
|
| 158 |
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
| 159 |
echo "Prometheus URL: ${PROM_URL_DISPLAY}"
|
| 160 |
echo "Grafana access: kubectl -n ${MONITORING_NAMESPACE} port-forward svc/grafana 3000:80"
|
| 161 |
echo ""
|
| 162 |
echo "Service status command:"
|
| 163 |
+
echo " systemctl status antiatropos-control --no-pager"
|
| 164 |
echo ""
|
| 165 |
+
echo "If needed, edit env and restart control service:"
|
| 166 |
echo " ${ENV_FILE}"
|
| 167 |
+
echo " systemctl restart antiatropos-control"
|
| 168 |
+
echo ""
|
| 169 |
+
echo "Verify remote scaling path:"
|
| 170 |
+
echo " watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas'"
|
deploy/do/uninstall-legacy-openenv.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# Removes legacy VM OpenEnv service path.
|
| 5 |
+
# This keeps droplet runtime focused on control API + observability only.
|
| 6 |
+
|
| 7 |
+
if [[ "${EUID}" -ne 0 ]]; then
|
| 8 |
+
echo "Run as root: sudo bash deploy/do/uninstall-legacy-openenv.sh"
|
| 9 |
+
exit 1
|
| 10 |
+
fi
|
| 11 |
+
|
| 12 |
+
if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
|
| 13 |
+
echo "Stopping and disabling antiatropos-fastapi..."
|
| 14 |
+
systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
|
| 15 |
+
else
|
| 16 |
+
echo "antiatropos-fastapi service not registered."
|
| 17 |
+
fi
|
| 18 |
+
|
| 19 |
+
if [[ -f /etc/systemd/system/antiatropos-fastapi.service ]]; then
|
| 20 |
+
rm -f /etc/systemd/system/antiatropos-fastapi.service
|
| 21 |
+
echo "Removed /etc/systemd/system/antiatropos-fastapi.service"
|
| 22 |
+
fi
|
| 23 |
+
|
| 24 |
+
systemctl daemon-reload
|
| 25 |
+
echo "Legacy VM OpenEnv service cleanup complete."
|
deploy/prometheus-helm-values.yaml
CHANGED
|
@@ -27,3 +27,4 @@ extraScrapeConfigs: |
|
|
| 27 |
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 28 |
replacement: $1:$2
|
| 29 |
target_label: __address__
|
|
|
|
|
|
| 27 |
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 28 |
replacement: $1:$2
|
| 29 |
target_label: __address__
|
| 30 |
+
|