Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- AGENTS.md +56 -0
- CLAUDE.md +1 -0
- Dockerfile +13 -1
- README.md +1 -1
- __init__.py +25 -25
- agent_smoke.py +80 -0
- client.py +143 -140
- control/__init__.py +2 -2
- control/kubernetes_executor.py +396 -230
- control/validation.py +69 -38
- curriculum.py +131 -0
- deploy-local.ps1 +91 -0
- deploy/LOCAL_LAPTOP_FASTAPI_GUIDE.md +74 -0
- deploy/aws/ARCHITECTURE.md +361 -0
- deploy/aws/FASTAPI_AWS_MODE_GUIDE.md +72 -0
- deploy/aws/OPERATIONS.md +465 -0
- deploy/aws/README.md +361 -0
- deploy/aws/cluster-autoscaler-values.yaml +57 -0
- deploy/aws/deploy-all.ps1 +493 -0
- deploy/aws/deploy.ps1 +369 -0
- deploy/aws/deploy.sh +204 -0
- deploy/aws/eksctl-cluster.yaml +58 -0
- deploy/aws/generate-kubeconfig.ps1 +131 -0
- deploy/aws/generate-kubeconfig.sh +138 -0
- deploy/aws/grafana-trust-policy.json +12 -0
- deploy/aws/grafana-values.yaml +68 -0
- deploy/aws/k8s-workloads.yaml +296 -0
- deploy/aws/kubeconfig-antiatropos.yaml +34 -0
- deploy/aws/prometheus-agent-values.yaml +95 -0
- deploy/aws/teardown-all.ps1 +242 -0
- deploy/do/README.md +92 -0
- deploy/do/antiatropos-control.service +16 -0
- deploy/do/deploy-droplet-one-shot.sh +183 -0
- deploy/do/uninstall-legacy-openenv.sh +25 -0
- deploy/entrypoint.sh +71 -62
- deploy/grafana-datasource-local.yaml +11 -0
- deploy/grafana-helm-values.yaml +46 -0
- deploy/grafana/grafana.ini +21 -21
- deploy/grafana/provisioning/dashboards/dashboard.yaml +12 -12
- deploy/grafana/provisioning/dashboards/json/antiatropos-live.json +334 -334
- deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json +21 -16
- deploy/grafana/provisioning/dashboards/json/antiatropos-workloads.json +436 -0
- deploy/grafana/provisioning/datasources/prometheus.yaml +2 -2
- deploy/index.html +473 -473
- deploy/kind-maxpods-250.yaml +11 -0
- deploy/local-laptop.yaml +365 -0
- deploy/local/datasource-local.yaml +10 -0
- deploy/local/grafana-local-values.yaml +34 -0
- deploy/local/prometheus-local-values.yaml +49 -0
- deploy/nginx.conf +89 -89
AGENTS.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos: The Physics of Autonomous SRE
|
| 2 |
+
|
| 3 |
+
> **"Infrastructure is not a static set of configurations; it is a dynamic system of energy, flow, and stability."**
|
| 4 |
+
|
| 5 |
+
## The Vision
|
| 6 |
+
AntiAtropos is a next-generation **Autonomous SRE (Site Reliability Engineering) Control Environment**. While traditional DevOps relies on static thresholds (e.g., "if CPU > 80%"), AntiAtropos treats a microservice cluster as a **Physics Engine**.
|
| 7 |
+
|
| 8 |
+
Our vision is to move from reactive scripts to **Dynamical System Control**. We are building an environment where AI agents don't just "fix things"—they balance the "Potential Energy" of a cluster to maintain equilibrium under extreme pressure.
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 1. The Physics Engine Concept
|
| 13 |
+
Traditional observability measures metrics; we measure **Stability**. We have modeled our 5-node cluster using **Fluid Queue Dynamics**, treating request flow like water and nodes like reservoirs.
|
| 14 |
+
|
| 15 |
+
### The Lyapunov Potential ($V$)
|
| 16 |
+
The "North Star" of our environment is the **Lyapunov Energy Function**:
|
| 17 |
+
$$V(s) = \sum_{i=1}^{N} w_i \cdot Q_i^2$$
|
| 18 |
+
* **$Q_i$ (Queue Depth):** The "Potential Energy" or mass accumulated in a service.
|
| 19 |
+
* **$w_i$ (Weight):** The "Gravity" or business importance (node-0 is the VIP Payment Gateway).
|
| 20 |
+
* **Cascading Failures:** Our physics engine models "Backlog Pressure," where one failing node can trigger a chain reaction across its neighbors.
|
| 21 |
+
|
| 22 |
+
### Advanced Latency Dynamics (M/M/1)
|
| 23 |
+
We move beyond linear latency models. AntiAtropos implements a **"Hockey-Stick" Latency Curve**. As utilization $\rho$ approaches 100%, latency grows without bound (in an M/M/1 queue the mean response time scales as $1/(1-\rho)$), modeling the "Point of No Return" that real-world on-call engineers fear.
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## 2. Training Strategy: The Professional Loop
|
| 28 |
+
To build a hackathon-winning agent, we use a complex training pipeline coordinated between **Google Colab** and **Hugging Face**:
|
| 29 |
+
|
| 30 |
+
### Progressive Curriculum Learning
|
| 31 |
+
Agents are not trained at random. They follow a **Curriculum** (`curriculum.py`) that graduates them through increasingly difficult stages:
|
| 32 |
+
1. **Stage 1-3:** Capacity Ramping (Learning to scale).
|
| 33 |
+
2. **Stage 4-5:** Fault Tolerance (Learning to reroute).
|
| 34 |
+
3. **Stage 6-8:** Surge Stability (Learning to balance competing pressures).
|
| 35 |
+
4. **Finals:** Sustained protection under cascading failure conditions.
|
| 36 |
+
|
| 37 |
+
### Episodic Replay Buffer
|
| 38 |
+
Using `replay.py`, our agents maintain a "Long-term Memory" of **Key Transitions**. Instead of relearning from scratch, the model uses **Few-Shot Demonstrations** to see how successful previous strategies were executed.
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
## 3. Upcoming & Unconfirmed Roadmap
|
| 43 |
+
> [!IMPORTANT]
|
| 44 |
+
> **DISCLAIMER:** The following features are in the research phase and are NOT yet finalized or confirmed. Please consult with the core team before assuming implementation details.
|
| 45 |
+
|
| 46 |
+
* **Multi-Token Attention for SRE:** Investigating the use of frequency-selective transformation to capture "cluster breathiness" (p99 jitter) rather than just global averages.
|
| 47 |
+
* **Graph Neural Network (GNN) Control:** Potential pivot toward modeling the cluster as a dynamic graph to directly manage the "topology of stress."
|
| 48 |
+
* **Cross-Cluster Generalization:** Testing models trained on 5 nodes against 10 and 20 node environments.
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## Why This Wins
|
| 53 |
+
AntiAtropos doesn't follow runbooks. It understands the **laws of motion** within a cluster. By training agents to minimize "System Energy," we create infrastructure that is inherently self-healing, cost-efficient, and mathematically stable.
|
| 54 |
+
|
| 55 |
+
---
|
| 56 |
+
*Created for the 2026 AntiAtropos Hackathon.*
|
CLAUDE.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Refer to AGENTS.md for instructions.
|
Dockerfile
CHANGED
|
@@ -6,7 +6,19 @@ ENV DEBIAN_FRONTEND=noninteractive \
|
|
| 6 |
PROMETHEUS_VERSION=3.5.1 \
|
| 7 |
GRAFANA_VERSION=12.3.1 \
|
| 8 |
PROMETHEUS_ARCH=linux-amd64 \
|
| 9 |
-
GRAFANA_ARCH=linux-amd64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 12 |
bash \
|
|
|
|
| 6 |
PROMETHEUS_VERSION=3.5.1 \
|
| 7 |
GRAFANA_VERSION=12.3.1 \
|
| 8 |
PROMETHEUS_ARCH=linux-amd64 \
|
| 9 |
+
GRAFANA_ARCH=linux-amd64 \
|
| 10 |
+
ANTIATROPOS_ENV_MODE=live \
|
| 11 |
+
ANTIATROPOS_REWARD_OUTPUT_MODE=normalized \
|
| 12 |
+
ANTIATROPOS_CONTROL_TIMEOUT_S=8.0 \
|
| 13 |
+
ANTIATROPOS_PROM_TIMEOUT_S=5.0 \
|
| 14 |
+
ANTIATROPOS_STRICT_REAL=false \
|
| 15 |
+
ANTIATROPOS_METRIC_AGGREGATION=sum \
|
| 16 |
+
ANTIATROPOS_K8S_NAMESPACE=prod-sre \
|
| 17 |
+
ANTIATROPOS_MIN_REPLICAS=1 \
|
| 18 |
+
ANTIATROPOS_SCALE_STEP=3 \
|
| 19 |
+
ANTIATROPOS_CONTROL_PLANE_URL=http://206.189.136.21:8010 \
|
| 20 |
+
PROMETHEUS_URL=http://206.189.136.21:30090 \
|
| 21 |
+
ANTIATROPOS_WORKLOAD_MAP={"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}
|
| 22 |
|
| 23 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 24 |
bash \
|
README.md
CHANGED
|
@@ -274,4 +274,4 @@ For fixed-seed studies, use controlled simulator seeding in evaluation harnesses
|
|
| 274 |
| Grader quality | Deterministic, interpretable composite score in `[0, 1]` |
|
| 275 |
| Environment design | Dense Lyapunov-grounded reward, clean reset/step loop, explicit episode boundaries |
|
| 276 |
| Code quality | Typed Pydantic models, modular components, OpenEnv manifest, containerized runtime |
|
| 277 |
-
| Novelty | Lyapunov reward shaping + live K8s control plane + Prometheus telemetry + observability-first design |
|
|
|
|
| 274 |
| Grader quality | Deterministic, interpretable composite score in `[0, 1]` |
|
| 275 |
| Environment design | Dense Lyapunov-grounded reward, clean reset/step loop, explicit episode boundaries |
|
| 276 |
| Code quality | Typed Pydantic models, modular components, OpenEnv manifest, containerized runtime |
|
| 277 |
+
| Novelty | Lyapunov reward shaping + live K8s control plane + Prometheus telemetry + observability-first design |
|
__init__.py
CHANGED
|
@@ -1,25 +1,25 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""AntiAtropos Environment."""
|
| 8 |
-
|
| 9 |
-
from .client import AntiAtroposEnv
|
| 10 |
-
from .models import (
|
| 11 |
-
SREAction,
|
| 12 |
-
ActionType,
|
| 13 |
-
ClusterObservation,
|
| 14 |
-
NodeObservation,
|
| 15 |
-
NodeStatus,
|
| 16 |
-
)
|
| 17 |
-
|
| 18 |
-
__all__ = [
|
| 19 |
-
"AntiAtroposEnv",
|
| 20 |
-
"SREAction",
|
| 21 |
-
"ActionType",
|
| 22 |
-
"ClusterObservation",
|
| 23 |
-
"NodeObservation",
|
| 24 |
-
"NodeStatus",
|
| 25 |
-
]
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""AntiAtropos Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import AntiAtroposEnv
|
| 10 |
+
from .models import (
|
| 11 |
+
SREAction,
|
| 12 |
+
ActionType,
|
| 13 |
+
ClusterObservation,
|
| 14 |
+
NodeObservation,
|
| 15 |
+
NodeStatus,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
__all__ = [
|
| 19 |
+
"AntiAtroposEnv",
|
| 20 |
+
"SREAction",
|
| 21 |
+
"ActionType",
|
| 22 |
+
"ClusterObservation",
|
| 23 |
+
"NodeObservation",
|
| 24 |
+
"NodeStatus",
|
| 25 |
+
]
|
agent_smoke.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Quick autonomous agent smoke test against the running AntiAtropos FastAPI server.
|
| 4 |
+
|
| 5 |
+
This does NOT require an LLM API key.
|
| 6 |
+
It uses a simple heuristic policy to validate end-to-end control-plane + telemetry wiring.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import os
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from AntiAtropos.client import AntiAtroposEnv
|
| 15 |
+
from AntiAtropos.models import SREAction, ActionType
|
| 16 |
+
except ImportError:
|
| 17 |
+
from client import AntiAtroposEnv # type: ignore
|
| 18 |
+
from models import SREAction, ActionType # type: ignore
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class Config:
    """Smoke-test settings; each default is read from the environment once, at class-definition time."""

    # Base URL of the running AntiAtropos FastAPI server.
    env_url: str = os.environ.get("ENV_URL", "http://localhost:8000")
    # Task identifier passed to env.reset().
    task_id: str = os.environ.get("ANTIATROPOS_TASK", "task-1")
    # ANTIATROPOS_MODE wins; otherwise ANTIATROPOS_ENV_MODE; otherwise "aws".
    mode: str = os.environ.get(
        "ANTIATROPOS_MODE",
        os.environ.get("ANTIATROPOS_ENV_MODE", "aws"),
    )
    # Upper bound on the number of env.step() calls.
    max_steps: int = int(os.environ.get("ANTIATROPOS_SMOKE_STEPS", "20"))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def pick_action(obs) -> SREAction:
|
| 30 |
+
# Pick node with highest queue depth as target
|
| 31 |
+
target = max(obs.nodes, key=lambda n: float(getattr(n, "queue_depth", 0.0)))
|
| 32 |
+
|
| 33 |
+
avg_latency = float(getattr(obs, "average_latency_ms", 0.0))
|
| 34 |
+
backlog = float(getattr(obs, "total_queue_backlog", 0.0))
|
| 35 |
+
|
| 36 |
+
# Heuristic policy:
|
| 37 |
+
# - If stressed, scale up busiest node
|
| 38 |
+
# - If very calm, scale down non-VIP node
|
| 39 |
+
# - Otherwise no-op
|
| 40 |
+
if avg_latency > 0.20 or backlog > 0.45:
|
| 41 |
+
return SREAction(action_type=ActionType.SCALE_UP, target_node_id=target.node_id, parameter=0.6)
|
| 42 |
+
|
| 43 |
+
non_vips = [n for n in obs.nodes if not bool(getattr(n, "is_vip", False))]
|
| 44 |
+
if avg_latency < 0.08 and backlog < 0.15 and non_vips:
|
| 45 |
+
down_target = max(non_vips, key=lambda n: float(getattr(n, "capacity", 0.0)))
|
| 46 |
+
return SREAction(action_type=ActionType.SCALE_DOWN, target_node_id=down_target.node_id, parameter=0.4)
|
| 47 |
+
|
| 48 |
+
return SREAction(action_type=ActionType.NO_OP, target_node_id=target.node_id, parameter=0.0)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
async def main() -> None:
    """Run one heuristic-driven episode against the server and print a per-step trace."""
    cfg = Config()
    print(f"[agent-smoke] env={cfg.env_url} task={cfg.task_id} mode={cfg.mode} steps={cfg.max_steps}")

    async with AntiAtroposEnv(cfg.env_url, message_timeout_s=120) as env:
        result = await env.reset(task_id=cfg.task_id, mode=cfg.mode)
        print(f"[reset] step={result.observation.step} latency={result.observation.average_latency_ms:.3f} backlog={result.observation.total_queue_backlog:.3f}")

        rewards = []
        step_no = 1
        while step_no <= cfg.max_steps:
            # Decide from the latest observation, then advance the environment.
            action = pick_action(result.observation)
            result = await env.step(action)

            # A missing (None) reward is recorded as 0.0.
            step_reward = float(result.reward or 0.0)
            rewards.append(step_reward)

            ack = getattr(result.observation, "action_ack_status", "")
            print(
                f"[step {step_no:02d}] {action.action_type.value} {action.target_node_id} p={action.parameter:.2f} "
                f"reward={step_reward:.3f} done={bool(result.done)} ack={ack}"
            )
            if result.done:
                break
            step_no += 1

        if not rewards:
            print("[done] no steps executed")
        else:
            avg_reward = sum(rewards) / len(rewards)
            print(f"[done] steps={len(rewards)} avg_reward={avg_reward:.3f} final_latency={result.observation.average_latency_ms:.3f} final_backlog={result.observation.total_queue_backlog:.3f}")


if __name__ == "__main__":
    asyncio.run(main())
|
client.py
CHANGED
|
@@ -1,140 +1,143 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""AntiAtropos Environment Client."""
|
| 8 |
-
|
| 9 |
-
from typing import Dict
|
| 10 |
-
|
| 11 |
-
from openenv.core import EnvClient
|
| 12 |
-
from openenv.core.client_types import StepResult
|
| 13 |
-
from openenv.core.env_server.types import State
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
...
|
| 35 |
-
... result
|
| 36 |
-
...
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
... client.
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
episode_id
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""AntiAtropos Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
from openenv.core import EnvClient
|
| 12 |
+
from openenv.core.client_types import StepResult
|
| 13 |
+
from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from .models import SREAction, ClusterObservation, NodeObservation, NodeStatus
|
| 17 |
+
except ImportError:
|
| 18 |
+
from models import SREAction, ClusterObservation, NodeObservation, NodeStatus # type: ignore
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AntiAtroposEnv(
    EnvClient[SREAction, ClusterObservation, State]
):
    """
    Client for the AntiAtropos Environment.

    This client maintains a persistent WebSocket connection to the environment server,
    enabling efficient multi-step interactions with lower latency.
    Each client instance has its own dedicated environment session on the server.

    Example (same call style as agent_smoke.py, which awaits reset/step):
        >>> # Connect to a running server
        >>> async with AntiAtroposEnv(base_url="http://localhost:8000") as client:
        ...     result = await client.reset()
        ...     print(result.observation.average_latency_ms)
        ...
        ...     action = SREAction(action_type="SCALE_UP", target_node_id="node-0", parameter=2.0)
        ...     result = await client.step(action)
        ...     print(result.observation.lyapunov_energy)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> # NOTE(review): kept in its original synchronous form; confirm against
        >>> # the installed openenv version whether these calls must be awaited.
        >>> client = AntiAtroposEnv.from_docker_image("AntiAtropos-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(SREAction(action_type="NO_OP"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: SREAction) -> Dict:
        """
        Convert SREAction to JSON payload for step message.

        Args:
            action: SREAction instance

        Returns:
            Dictionary representation suitable for JSON encoding
        """
        return {
            "action_type": action.action_type.value,
            "target_node_id": action.target_node_id,
            "parameter": action.parameter,
        }

    def _parse_result(self, payload: Dict) -> StepResult[ClusterObservation]:
        """
        Parse server response into StepResult[ClusterObservation].

        Missing keys fall back to the defaults below. An explicit JSON ``null``
        for ``observation`` or ``nodes`` is treated the same as a missing key
        instead of raising AttributeError/TypeError.

        Args:
            payload: JSON response data from server

        Returns:
            StepResult with ClusterObservation
        """
        obs_data = payload.get("observation") or {}

        # reward/done live at the top level of the payload and are mirrored
        # onto both the observation and the StepResult.
        reward = payload.get("reward", 0.0)
        done = payload.get("done", False)

        # Parse per-node list into NodeObservation objects
        raw_nodes = obs_data.get("nodes") or []
        node_obs = [
            NodeObservation(
                node_id=n.get("node_id", ""),
                status=NodeStatus(n.get("status", NodeStatus.HEALTHY)),
                is_vip=n.get("is_vip", False),
                queue_depth=n.get("queue_depth", 0),
                latency_ms=n.get("latency_ms", 0.0),
                incoming_request_rate=n.get("incoming_request_rate", 0.0),
                cpu_utilization=n.get("cpu_utilization", 0.0),
                importance_weight=n.get("importance_weight", 1.0),
                done=n.get("done", False),
                reward=n.get("reward", 0.0),
            )
            for n in raw_nodes
        ]

        observation = ClusterObservation(
            cluster_id=obs_data.get("cluster_id", ""),
            task_id=obs_data.get("task_id", "task-1"),
            mode=obs_data.get("mode", "simulated"),
            active_nodes=obs_data.get("active_nodes", 0),
            average_latency_ms=obs_data.get("average_latency_ms", 0.0),
            error_rate=obs_data.get("error_rate", 0.0),
            total_queue_backlog=obs_data.get("total_queue_backlog", 0),
            current_cost_per_hour=obs_data.get("current_cost_per_hour", 0.0),
            lyapunov_energy=obs_data.get("lyapunov_energy", 0.0),
            nodes=node_obs,
            step=obs_data.get("step", 0),
            max_steps=obs_data.get("max_steps", 100),
            sla_violations=obs_data.get("sla_violations", 0),
            invalid_action_count=obs_data.get("invalid_action_count", 0),
            vip_failure_count=obs_data.get("vip_failure_count", 0),
            metric_timestamp=obs_data.get("metric_timestamp", 0.0),
            data_freshness_ms=obs_data.get("data_freshness_ms", 0),
            action_ack_status=obs_data.get("action_ack_status", "success"),
            choke_level=obs_data.get("choke_level", 0.0),
            raw_reward=obs_data.get("raw_reward", 0.0),
            normalized_reward=obs_data.get("normalized_reward", 0.0),
            reward_scale_version=obs_data.get("reward_scale_version", "sigmoid-v1"),
            done=done,
            reward=reward,
        )

        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict) -> State:
        """
        Parse server response into State object.

        Args:
            payload: JSON response from state request

        Returns:
            State object with episode_id and step_count
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
|
control/__init__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
from .kubernetes_executor import KubernetesExecutor
|
| 2 |
-
from .validation import ActionValidator
|
|
|
|
| 1 |
+
from .kubernetes_executor import KubernetesExecutor
|
| 2 |
+
from .validation import ActionValidator
|
control/kubernetes_executor.py
CHANGED
|
@@ -1,230 +1,396 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import json
|
| 3 |
-
import time
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
self.
|
| 19 |
-
self.
|
| 20 |
-
self.
|
| 21 |
-
self.
|
| 22 |
-
self.
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
"""
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
"""
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
"""
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
"""
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
def
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
def
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
)
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import requests
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger("kubernetes_executor")
|
| 10 |
+
|
| 11 |
+
class KubernetesExecutor:
|
| 12 |
+
"""
|
| 13 |
+
Executes high-level SRE actions on a Kubernetes cluster.
|
| 14 |
+
Provides a safe layer between SREAgent and actual infrastructure.
|
| 15 |
+
"""
|
| 16 |
+
def __init__(self, kubeconfig: Optional[str] = None):
    """Read executor settings from the argument and ANTIATROPOS_* environment variables.

    Args:
        kubeconfig: explicit kubeconfig path; falls back to the KUBECONFIG
            environment variable when falsy.
    """
    env = os.environ.get

    # Use provided path or env var, defaulting to mock if neither is found
    self.kubeconfig = kubeconfig or env("KUBECONFIG")

    # Remote control-plane settings; the URL is trimmed and loses trailing "/".
    self.remote_control_url = env("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
    self.remote_timeout_s = float(env("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
    self.remote_retry_count = int(env("ANTIATROPOS_CONTROL_RETRY_COUNT", "2"))
    self.remote_retry_backoff_s = float(env("ANTIATROPOS_CONTROL_RETRY_BACKOFF_S", "0.25"))

    # Mock only when there is no remote URL and the kubeconfig is missing or
    # literally "mock" (case-insensitive).
    kubeconfig_is_mock = not self.kubeconfig or self.kubeconfig.lower() == "mock"
    self.is_mock = (not self.remote_control_url) and kubeconfig_is_mock

    # Scaling bounds and step size.
    self.namespace = env("ANTIATROPOS_K8S_NAMESPACE", "default")
    self.min_replicas = int(env("ANTIATROPOS_MIN_REPLICAS", "1"))
    self.max_replicas = self._parse_max_replicas(env("ANTIATROPOS_MAX_REPLICAS"))
    self.scale_step = int(env("ANTIATROPOS_SCALE_STEP", "3"))

    self._apps_v1_api = None
    self._node_workload_map = self._load_node_workload_map()
    self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}

    # Retry policy for direct Kubernetes API calls.
    self.k8s_retry_count = int(env("ANTIATROPOS_K8S_RETRY_COUNT", "2"))
    self.k8s_retry_backoff_s = float(env("ANTIATROPOS_K8S_RETRY_BACKOFF_S", "0.2"))
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
|
| 39 |
+
"""
|
| 40 |
+
Parse optional max replicas.
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
- int when a positive explicit cap is provided
|
| 44 |
+
- None when scale-up should be unbounded
|
| 45 |
+
"""
|
| 46 |
+
if raw is None:
|
| 47 |
+
return None
|
| 48 |
+
value = str(raw).strip().lower()
|
| 49 |
+
if value in ("", "none", "unbounded", "inf", "infinite"):
|
| 50 |
+
return None
|
| 51 |
+
try:
|
| 52 |
+
parsed = int(value)
|
| 53 |
+
except ValueError:
|
| 54 |
+
return None
|
| 55 |
+
if parsed <= 0:
|
| 56 |
+
return None
|
| 57 |
+
return parsed
|
| 58 |
+
|
| 59 |
+
@staticmethod
|
| 60 |
+
def _normalize_action_type(action_type) -> str:
|
| 61 |
+
if hasattr(action_type, "value"):
|
| 62 |
+
return str(action_type.value)
|
| 63 |
+
return str(action_type)
|
| 64 |
+
|
| 65 |
+
def execute(self, action_type: str, target: str, parameter: float) -> str:
    """
    Run the action via execute_with_metadata() and return only its
    acknowledgement string (the "ack_status" entry).
    """
    outcome = self.execute_with_metadata(action_type, target, parameter)
    return outcome["ack_status"]
|
| 70 |
+
|
| 71 |
+
def execute_with_metadata(self, action_type: str, target: str, parameter: float) -> dict:
    """
    Execute action and return acknowledgement plus executor metadata.

    Returns:
        dict with keys "action_id" (fresh UUID4 string), "ack_status",
        "executor_latency_ms" (wall time of the call, in milliseconds) and
        "executor_error_code" ("", "REJECTED_ACTION" or "EXECUTION_ERROR").
    """
    action_id = str(uuid4())
    t0 = time.perf_counter()
    failure_code = ""

    if self.is_mock:
        ack = self._mock_execution(action_type, target, parameter)
    else:
        try:
            ack = self._real_execution(action_type, target, parameter)
        except Exception as exc:
            # Any executor failure becomes an "Error:" acknowledgement rather
            # than propagating to the caller.
            logger.error(f"Execution failed for {action_type} on {target}: {str(exc)}")
            ack = f"Error: Failed to execute {action_type} on {target}: {str(exc)}"
            failure_code = "EXECUTION_ERROR"

    # Derive the error code from the acknowledgement prefix unless the
    # exception path already set one.
    if not failure_code:
        if ack.startswith("Rejected:"):
            failure_code = "REJECTED_ACTION"
        elif ack.startswith("Error:"):
            failure_code = "EXECUTION_ERROR"

    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    return {
        "action_id": action_id,
        "ack_status": ack,
        "executor_latency_ms": elapsed_ms,
        "executor_error_code": failure_code,
    }
|
| 102 |
+
|
| 103 |
+
def live_enabled_actions(self) -> set[str]:
    """Action types that are actually executable in real live mode.

    In mock mode only "NO_OP" is reported; otherwise a fresh copy of the
    supported-action set is returned.
    """
    return {"NO_OP"} if self.is_mock else set(self._live_supported_actions)
|
| 108 |
+
|
| 109 |
+
def live_capability_error(self, action_type: str) -> Optional[str]:
    """
    Explain why an action cannot run in live mode.

    Returns None when the normalized action is in live_enabled_actions();
    otherwise returns a human-readable rejection reason, worded differently
    for a mock executor (nothing configured) and a real one (action not
    enabled).
    """
    action = self._normalize_action_type(action_type)
    if action in self.live_enabled_actions():
        return None

    if not self.is_mock:
        return f"Live mode rejected {action}: no executor is enabled for this action."

    return (
        f"Live mode rejected {action}: no real Kubernetes executor is configured "
        "(set KUBECONFIG and ANTIATROPOS_WORKLOAD_MAP)."
    )
|
| 120 |
+
|
| 121 |
+
def _real_execution(self, action_type: str, target: str, parameter: float) -> str:
|
| 122 |
+
"""Execute bounded actions on a Kubernetes cluster."""
|
| 123 |
+
action = self._normalize_action_type(action_type)
|
| 124 |
+
|
| 125 |
+
if self.remote_control_url:
|
| 126 |
+
return self._remote_execution(action, target, parameter)
|
| 127 |
+
|
| 128 |
+
if action == "NO_OP":
|
| 129 |
+
return "Ack: NO_OP - no cluster mutation"
|
| 130 |
+
|
| 131 |
+
if action in ("SCALE_UP", "SCALE_DOWN"):
|
| 132 |
+
return self._scale_deployment(action, target, parameter)
|
| 133 |
+
|
| 134 |
+
return f"Rejected: {action} is not enabled for live Kubernetes execution"
|
| 135 |
+
|
| 136 |
+
def _mock_execution(self, action_type: str, target: str, parameter: float) -> str:
|
| 137 |
+
"""Returns mock acknowledgement for actions."""
|
| 138 |
+
# TODO: Add realistic latency simulation for K8s control plane
|
| 139 |
+
action = self._normalize_action_type(action_type)
|
| 140 |
+
return f"Ack: {action} for {target} with value {parameter} - Status: Applied"
|
| 141 |
+
|
| 142 |
+
def _scale_deployment(self, action_type: str, target: str, parameter: float) -> str:
|
| 143 |
+
namespace, deployment_name = self._resolve_workload_target(target)
|
| 144 |
+
apps_v1 = self._get_apps_v1_api()
|
| 145 |
+
|
| 146 |
+
scale_obj = apps_v1.read_namespaced_deployment_scale(
|
| 147 |
+
name=deployment_name,
|
| 148 |
+
namespace=namespace,
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
current = int(scale_obj.spec.replicas or self.min_replicas)
|
| 152 |
+
delta = max(1, int(float(parameter) * self.scale_step))
|
| 153 |
+
if action_type == "SCALE_UP":
|
| 154 |
+
if self.max_replicas is None:
|
| 155 |
+
desired = current + delta
|
| 156 |
+
else:
|
| 157 |
+
desired = min(self.max_replicas, current + delta)
|
| 158 |
+
else:
|
| 159 |
+
desired = max(self.min_replicas, current - delta)
|
| 160 |
+
|
| 161 |
+
if desired == current:
|
| 162 |
+
upper = "unbounded" if self.max_replicas is None else str(self.max_replicas)
|
| 163 |
+
return (
|
| 164 |
+
f"Ack: {action_type} for {target} - replicas unchanged at {current} "
|
| 165 |
+
f"(bounds {self.min_replicas}-{upper})"
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
self._patch_deployment_scale_with_retry(
|
| 169 |
+
apps_v1=apps_v1,
|
| 170 |
+
deployment_name=deployment_name,
|
| 171 |
+
namespace=namespace,
|
| 172 |
+
desired=desired,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
return (
|
| 176 |
+
f"Ack: {action_type} for {target} - deployment {deployment_name} "
|
| 177 |
+
f"in namespace {namespace} scaled {current}->{desired}"
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
def _patch_deployment_scale_with_retry(self, apps_v1, deployment_name: str, namespace: str, desired: int) -> None:
|
| 181 |
+
"""
|
| 182 |
+
Patch deployment replicas with retries for transient API server errors.
|
| 183 |
+
"""
|
| 184 |
+
from kubernetes.client.rest import ApiException
|
| 185 |
+
|
| 186 |
+
max_attempts = max(1, self.k8s_retry_count + 1)
|
| 187 |
+
for attempt in range(1, max_attempts + 1):
|
| 188 |
+
try:
|
| 189 |
+
apps_v1.patch_namespaced_deployment_scale(
|
| 190 |
+
name=deployment_name,
|
| 191 |
+
namespace=namespace,
|
| 192 |
+
body={"spec": {"replicas": desired}},
|
| 193 |
+
)
|
| 194 |
+
return
|
| 195 |
+
except ApiException as exc:
|
| 196 |
+
retryable = exc.status in (409, 429, 500, 502, 503, 504)
|
| 197 |
+
if (not retryable) or attempt >= max_attempts:
|
| 198 |
+
raise
|
| 199 |
+
sleep_s = self.k8s_retry_backoff_s * (2 ** (attempt - 1))
|
| 200 |
+
logger.warning(
|
| 201 |
+
"Retrying deployment scale patch after ApiException status=%s attempt=%s/%s",
|
| 202 |
+
exc.status,
|
| 203 |
+
attempt,
|
| 204 |
+
max_attempts,
|
| 205 |
+
)
|
| 206 |
+
time.sleep(sleep_s)
|
| 207 |
+
|
| 208 |
+
def _remote_execution(self, action: str, target: str, parameter: float) -> str:
|
| 209 |
+
"""
|
| 210 |
+
Delegate action execution to a remote FastAPI control plane.
|
| 211 |
+
|
| 212 |
+
Expected remote endpoint contract:
|
| 213 |
+
- POST /step
|
| 214 |
+
- Request: {action_type, target_node_id, parameter}
|
| 215 |
+
- Success response includes ack_status and starts with "Ack:"
|
| 216 |
+
|
| 217 |
+
This contract matches server.local_laptop_control and is the only
|
| 218 |
+
supported remote control-plane format.
|
| 219 |
+
"""
|
| 220 |
+
if not self.remote_control_url:
|
| 221 |
+
raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")
|
| 222 |
+
|
| 223 |
+
endpoint = f"{self.remote_control_url}/step"
|
| 224 |
+
action_payload = {
|
| 225 |
+
"action_type": action,
|
| 226 |
+
"target_node_id": target,
|
| 227 |
+
"parameter": float(parameter),
|
| 228 |
+
}
|
| 229 |
+
payload = action_payload
|
| 230 |
+
|
| 231 |
+
response = self._post_with_retry(endpoint=endpoint, payload=payload)
|
| 232 |
+
|
| 233 |
+
if response.status_code >= 400:
|
| 234 |
+
detail = ""
|
| 235 |
+
try:
|
| 236 |
+
body = response.json()
|
| 237 |
+
detail = str(body.get("detail", body))
|
| 238 |
+
except Exception:
|
| 239 |
+
detail = response.text.strip()
|
| 240 |
+
if response.status_code == 422 and "action" in detail:
|
| 241 |
+
detail = (
|
| 242 |
+
f"{detail}. Expected lightweight control-plane contract at "
|
| 243 |
+
f"{endpoint}: "
|
| 244 |
+
'{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
|
| 245 |
+
)
|
| 246 |
+
raise RuntimeError(
|
| 247 |
+
f"Remote control-plane rejected action ({response.status_code}): {detail}"
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
try:
|
| 251 |
+
data = response.json()
|
| 252 |
+
except Exception as exc:
|
| 253 |
+
raise RuntimeError("Remote control-plane returned non-JSON response") from exc
|
| 254 |
+
|
| 255 |
+
ack = str(data.get("ack_status", "")).strip()
|
| 256 |
+
if not ack:
|
| 257 |
+
action_id = str(data.get("action_id", "")).strip() or "remote"
|
| 258 |
+
return f"Ack: {action} for {target} via remote control-plane ({action_id})"
|
| 259 |
+
return ack
|
| 260 |
+
|
| 261 |
+
def _post_with_retry(self, endpoint: str, payload: dict) -> requests.Response:
    """
    POST `payload` as JSON to `endpoint`, retrying transient failures.

    Network-level errors (requests.RequestException) and HTTP 5xx responses
    are retried up to self.remote_retry_count times with exponential backoff
    starting at self.remote_retry_backoff_s seconds. A 5xx response on the
    final attempt is returned to the caller as-is; a network error on the
    final attempt raises RuntimeError chained to the last exception.
    """
    budget = max(1, self.remote_retry_count + 1)
    failure: Optional[Exception] = None

    for attempt in range(1, budget + 1):
        is_last = attempt >= budget
        try:
            response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
        except requests.RequestException as exc:
            failure = exc
            if is_last:
                break
            sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
            logger.warning(
                "Retrying remote control-plane POST after network error attempt=%s/%s: %s",
                attempt,
                budget,
                exc,
            )
            time.sleep(sleep_s)
            continue

        # Anything below 500, or any response on the last attempt, is final.
        if response.status_code < 500 or is_last:
            return response

        sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
        logger.warning(
            "Retrying remote control-plane POST after HTTP %s attempt=%s/%s",
            response.status_code,
            attempt,
            budget,
        )
        time.sleep(sleep_s)

    if failure is None:
        raise RuntimeError("Remote control-plane request failed after retries")
    raise RuntimeError(f"Remote control-plane request failed: {failure}") from failure
|
| 301 |
+
|
| 302 |
+
def _get_apps_v1_api(self):
|
| 303 |
+
if self._apps_v1_api is not None:
|
| 304 |
+
return self._apps_v1_api
|
| 305 |
+
|
| 306 |
+
from kubernetes import client, config
|
| 307 |
+
|
| 308 |
+
if self.kubeconfig and self.kubeconfig.lower() not in ("mock", ""):
|
| 309 |
+
config.load_kube_config(config_file=self.kubeconfig)
|
| 310 |
+
else:
|
| 311 |
+
config.load_incluster_config()
|
| 312 |
+
|
| 313 |
+
self._apps_v1_api = client.AppsV1Api()
|
| 314 |
+
return self._apps_v1_api
|
| 315 |
+
|
| 316 |
+
def _load_node_workload_map(self) -> dict[str, dict[str, str]]:
|
| 317 |
+
"""
|
| 318 |
+
Load node->workload mapping.
|
| 319 |
+
|
| 320 |
+
Preferred format (ANTIATROPOS_WORKLOAD_MAP):
|
| 321 |
+
{
|
| 322 |
+
"node-0": {"deployment": "payments", "namespace": "prod-sre"},
|
| 323 |
+
"node-1": {"deployment": "checkout"}
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
Legacy fallback (ANTIATROPOS_NODE_DEPLOYMENT_MAP):
|
| 327 |
+
{
|
| 328 |
+
"node-0": "payments",
|
| 329 |
+
"node-1": "checkout"
|
| 330 |
+
}
|
| 331 |
+
"""
|
| 332 |
+
raw = os.getenv("ANTIATROPOS_WORKLOAD_MAP", "")
|
| 333 |
+
if raw:
|
| 334 |
+
parsed = self._parse_json_mapping(raw)
|
| 335 |
+
if parsed is not None:
|
| 336 |
+
return parsed
|
| 337 |
+
|
| 338 |
+
legacy_raw = os.getenv("ANTIATROPOS_NODE_DEPLOYMENT_MAP", "")
|
| 339 |
+
if legacy_raw:
|
| 340 |
+
legacy = self._parse_legacy_mapping(legacy_raw)
|
| 341 |
+
if legacy is not None:
|
| 342 |
+
return legacy
|
| 343 |
+
|
| 344 |
+
return {}
|
| 345 |
+
|
| 346 |
+
def _parse_json_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
|
| 347 |
+
try:
|
| 348 |
+
data = json.loads(raw)
|
| 349 |
+
except json.JSONDecodeError:
|
| 350 |
+
return None
|
| 351 |
+
|
| 352 |
+
if not isinstance(data, dict):
|
| 353 |
+
return None
|
| 354 |
+
|
| 355 |
+
out: dict[str, dict[str, str]] = {}
|
| 356 |
+
for node_id, workload in data.items():
|
| 357 |
+
if not isinstance(workload, dict):
|
| 358 |
+
return None
|
| 359 |
+
deployment = workload.get("deployment")
|
| 360 |
+
if not deployment:
|
| 361 |
+
return None
|
| 362 |
+
namespace = workload.get("namespace", self.namespace)
|
| 363 |
+
out[str(node_id)] = {
|
| 364 |
+
"deployment": str(deployment),
|
| 365 |
+
"namespace": str(namespace),
|
| 366 |
+
}
|
| 367 |
+
return out
|
| 368 |
+
|
| 369 |
+
def _parse_legacy_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
|
| 370 |
+
try:
|
| 371 |
+
data = json.loads(raw)
|
| 372 |
+
except json.JSONDecodeError:
|
| 373 |
+
return None
|
| 374 |
+
|
| 375 |
+
if not isinstance(data, dict):
|
| 376 |
+
return None
|
| 377 |
+
|
| 378 |
+
out: dict[str, dict[str, str]] = {}
|
| 379 |
+
for node_id, deployment in data.items():
|
| 380 |
+
if not deployment:
|
| 381 |
+
return None
|
| 382 |
+
out[str(node_id)] = {
|
| 383 |
+
"deployment": str(deployment),
|
| 384 |
+
"namespace": self.namespace,
|
| 385 |
+
}
|
| 386 |
+
return out
|
| 387 |
+
|
| 388 |
+
def _resolve_workload_target(self, target: str) -> tuple[str, str]:
|
| 389 |
+
if target not in self._node_workload_map:
|
| 390 |
+
raise ValueError(
|
| 391 |
+
f"Missing workload mapping for target '{target}'. "
|
| 392 |
+
"Set ANTIATROPOS_WORKLOAD_MAP with node->deployment bindings."
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
workload = self._node_workload_map[target]
|
| 396 |
+
return workload["namespace"], workload["deployment"]
|
control/validation.py
CHANGED
|
@@ -1,38 +1,69 @@
|
|
| 1 |
-
from typing import List, Optional
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional, Tuple
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ActionValidator:
    """
    Validates SRE actions to ensure they stay within safety boundaries.
    Prevents destructive operations like load shedding on critical nodes.

    Implements a soft cooldown for scaling actions: instead of hard-rejecting
    a rapid re-scale, the action passes with a penalty signal. The environment
    can use this penalty to reduce the reward, teaching the agent to wait
    without blocking emergency scaling.
    """

    def __init__(self, critical_nodes: Optional[List[str]] = None, cooldown_ticks: int = 3):
        """
        Args:
            critical_nodes: Node ids on which SHED_LOAD is forbidden. None or
                an empty list selects the default ["node-0", "node-1", "node-2"].
            cooldown_ticks: Length of the soft-cooldown window in ticks. A
                value <= 0 disables the cooldown penalty.
        """
        self.critical_nodes = critical_nodes or ["node-0", "node-1", "node-2"]
        self.cooldown_ticks = cooldown_ticks
        # Track last scale action per node: {node_id: (tick, action_type)}
        self._last_scale: dict[str, Tuple[int, str]] = {}
        self._current_tick: int = 0

    def set_tick(self, tick: int) -> None:
        """Update the current tick counter for cooldown tracking."""
        self._current_tick = tick

    def _cooldown_penalty(self, target: str, action: str) -> float:
        """
        Soft-cooldown penalty in [0, 1] for repeating `action` on `target`.

        The penalty decays linearly from 1.0 (same tick) to 0.0 once
        cooldown_ticks ticks have elapsed. It is 0.0 when the cooldown is
        disabled, when the node has no recorded scale action, when the last
        recorded action differs, or when the tick counter has moved backwards.
        """
        if self.cooldown_ticks <= 0:
            return 0.0

        previous = self._last_scale.get(target)
        if previous is None:
            return 0.0

        last_tick, last_action = previous
        if last_action != action:
            return 0.0

        ticks_since = self._current_tick - last_tick
        # A negative gap means the tick counter was rewound (for example the
        # caller restarted counting), so the stored record is stale. Without
        # this guard the penalty would exceed 1.0.
        if ticks_since < 0 or ticks_since >= self.cooldown_ticks:
            return 0.0

        return (self.cooldown_ticks - ticks_since) / self.cooldown_ticks

    def validate(self, action_type: str, target: str, parameter: float, valid_targets: Optional[List[str]] = None) -> Tuple[bool, str, float]:
        """
        Returns (is_valid, error_message, cooldown_penalty).

        cooldown_penalty is in [0, 1]:
          0.0 = no penalty (action is fine)
          >0  = soft penalty for rapid re-scaling (action still executes)
        Hard violations (unknown target, critical shed, out-of-range or NaN
        parameter) reject with penalty=0.

        A scaling action that passes its range checks is recorded as the
        node's latest scale action at the current tick.
        """
        if hasattr(action_type, "value"):
            action = str(action_type.value)
        else:
            action = str(action_type)

        cooldown_penalty = 0.0

        if valid_targets is not None and target not in valid_targets:
            return False, f"Unknown target node: {target}", 0.0

        if action == "SHED_LOAD" and target in self.critical_nodes:
            return False, f"Forbidden: Load shedding on critical node {target}.", 0.0

        if action in ["SCALE_UP", "SCALE_DOWN"]:
            # NaN compares False against both bounds below and would slip
            # through; `x != x` is True only for NaN.
            if parameter != parameter:
                return False, "Scaling parameter must be a number.", 0.0
            if parameter < 0.0:
                return False, "Negative scaling parameters are not allowed.", 0.0
            if parameter > 10.0:
                return False, "Scaling parameter must be <= 10.0.", 0.0

            # Soft cooldown: penalize but don't block rapid re-scaling.
            cooldown_penalty = self._cooldown_penalty(target, action)
            self._last_scale[target] = (self._current_tick, action)

        if action in ["REROUTE_TRAFFIC", "SHED_LOAD"] and not (0.0 <= parameter <= 1.0):
            return False, f"{action} parameter must be in [0.0, 1.0].", 0.0

        if action == "NO_OP" and parameter != 0.0:
            return False, "NO_OP requires parameter=0.0.", 0.0

        return True, "Success", cooldown_penalty
|
curriculum.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AntiAtropos Curriculum Training.
|
| 3 |
+
|
| 4 |
+
Defines progressive difficulty stages that the agent must pass before advancing.
|
| 5 |
+
Failed stages are retried with higher temperature for exploration.
|
| 6 |
+
|
| 7 |
+
Each stage specifies:
|
| 8 |
+
- task: Which task to run
|
| 9 |
+
- max_steps: Episode length (shorter = easier)
|
| 10 |
+
- pass_threshold: Minimum composite score to advance
|
| 11 |
+
- temperature: Suggest LLM temperature for this stage
|
| 12 |
+
- description: Human-readable label
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from typing import List, Optional
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class CurriculumStage:
    """One step of the training curriculum, plus its retry bookkeeping."""

    task: str                 # which task to run
    max_steps: int            # episode length for this stage
    pass_threshold: float     # minimum score needed to advance
    temperature: float = 0.0  # suggested base LLM temperature
    description: str = ""     # human-readable label
    retries: int = 0          # number of failed attempts so far
    max_retries: int = 3      # max retries before advancing anyway

    @property
    def retry_temperature(self) -> float:
        """
        Temperature to use for the next attempt.

        Equals the base temperature before any retry; afterwards it grows by
        0.3 per retry and is capped at 1.0.
        """
        if self.retries != 0:
            return min(1.0, self.temperature + self.retries * 0.3)
        return self.temperature

    @property
    def should_skip(self) -> bool:
        """True once the retry budget for this stage is used up."""
        exhausted = self.retries >= self.max_retries
        return exhausted
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Progressive curriculum: start easy, add complexity
|
| 45 |
+
CURRICULUM: List[CurriculumStage] = [
    CurriculumStage(
        task=task,
        max_steps=steps,
        pass_threshold=threshold,
        temperature=temp,
        description=label,
    )
    # Columns: task, max_steps, pass_threshold, temperature, description
    for task, steps, threshold, temp, label in (
        ("task-1", 40, 0.40, 0.0, "Short ramp — learn basic scaling"),
        ("task-1", 60, 0.50, 0.0, "Standard ramp — scale proactively"),
        ("task-1", 100, 0.55, 0.0, "Full ramp — cost-aware scaling"),
        ("task-2", 40, 0.35, 0.0, "Short fault — learn reroute/scale on failure"),
        ("task-2", 60, 0.45, 0.3, "Standard fault — fast recovery"),
        ("task-3", 40, 0.35, 0.0, "Short surge — protect VIP during spike"),
        ("task-3", 60, 0.45, 0.3, "Standard surge — sustained VIP protection"),
        # Final combined test
        ("task-1", 100, 0.55, 0.0, "Final: full ramp at low temp"),
        ("task-2", 60, 0.50, 0.0, "Final: fault recovery at low temp"),
        ("task-3", 60, 0.50, 0.0, "Final: surge protection at low temp"),
    )
]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class CurriculumTracker:
    """Tracks progress through the curriculum stages."""

    def __init__(self, stages: Optional[List["CurriculumStage"]] = None):
        """
        Args:
            stages: Stages to walk through. Caller-supplied stages are used
                as-is, so retry counts remain visible on the caller's
                objects. None or an empty list selects the default
                CURRICULUM.
        """
        if stages:
            self._stages = stages
        else:
            import copy

            # Work on private copies of the default stages: report_score()
            # mutates `retries`, and aliasing the module-level CURRICULUM
            # would leak retry counts into every other tracker.
            self._stages = [copy.copy(stage) for stage in CURRICULUM]
        self._current_idx: int = 0

    @property
    def current(self) -> "CurriculumStage":
        """The active stage. Raises IndexError once the curriculum is complete."""
        if self.is_complete:
            raise IndexError("Curriculum is complete; there is no current stage.")
        return self._stages[self._current_idx]

    @property
    def current_index(self) -> int:
        """Zero-based index of the active stage (== total_stages when complete)."""
        return self._current_idx

    @property
    def total_stages(self) -> int:
        """Number of stages in this curriculum."""
        return len(self._stages)

    @property
    def is_complete(self) -> bool:
        """True when every stage has been passed or skipped."""
        return self._current_idx >= len(self._stages)

    def report_score(self, score: float) -> bool:
        """
        Report a score for the current stage. Returns True if passed.

        A passing score advances to the next stage. A failing score
        increments the stage's retry counter and advances anyway once the
        stage reports should_skip. Raises IndexError if the curriculum is
        already complete.
        """
        stage = self.current
        if score >= stage.pass_threshold:
            self._current_idx += 1
            return True

        stage.retries += 1
        if stage.should_skip:
            self._current_idx += 1
        return False

    def progress_summary(self) -> str:
        """One-line description of the active stage, or of completion."""
        if self.is_complete:
            # There is no current stage to describe once all stages are done.
            return f"Curriculum complete ({self.total_stages}/{self.total_stages} stages)"

        stage = self.current
        return (
            f"Stage {self._current_idx + 1}/{self.total_stages}: "
            f"{stage.description} "
            f"(task={stage.task}, max_steps={stage.max_steps}, "
            f"threshold={stage.pass_threshold}, retries={stage.retries})"
        )
|
deploy-local.ps1
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos Local Cluster Deploy
# Deploys workloads, Prometheus, and Grafana on the Kind cluster.
# Unless -SkipPortForward is given, a Grafana port-forward is started at the end.

param(
    [switch]$SkipPortForward,
    [int]$GrafanaPort = 3000
)

# Waits up to 120s for pods matching $Selector in $Namespace and reports the
# actual outcome instead of unconditionally printing "ready".
function Wait-PodsReady {
    param(
        [string]$Selector,
        [string]$Namespace,
        [string]$Label
    )
    kubectl wait --for=condition=ready pod -l $Selector -n $Namespace --timeout=120s 2>$null
    if ($LASTEXITCODE -eq 0) {
        Write-Host " $Label ready."
    } else {
        Write-Host " WARNING: $Label not ready after 120s (kubectl wait exit code $LASTEXITCODE)." -ForegroundColor Red
    }
}

Write-Host "=== AntiAtropos Local Deploy ===" -ForegroundColor Cyan
Write-Host ""

# --- 1. Check cluster ---
Write-Host "[1/5] Checking Kind cluster..." -ForegroundColor Yellow
$cluster = kubectl config current-context 2>$null
if ($cluster -notmatch "antiatropos") {
    Write-Host "WARNING: Current context is '$cluster', expected 'kind-antiatropos-local'. Proceed anyway? [Y/n]"
    $r = Read-Host
    if ($r -eq 'n') { exit 1 }
}

# --- 2. Deploy workload pods ---
Write-Host "[2/5] Deploying workload pods..." -ForegroundColor Yellow
# Namespace creation is best-effort: "already exists" errors are expected on re-runs.
kubectl create ns prod-sre 2>&1 | Out-Null
kubectl create ns monitoring 2>&1 | Out-Null
kubectl apply -f (Join-Path $PSScriptRoot "deploy\local-laptop.yaml")
Write-Host " Waiting for workloads to be ready..."
# Selector only: kubectl refuses a label selector combined with --all, and the
# resulting error was previously hidden by the stderr redirect.
Wait-PodsReady -Selector "app" -Namespace "prod-sre" -Label "Workloads"

# --- 3. Deploy Prometheus ---
Write-Host "[3/5] Deploying Prometheus..." -ForegroundColor Yellow
# `upgrade --install` is idempotent and avoids substring-matching release names.
helm upgrade --install prometheus prometheus-community/prometheus -n monitoring -f (Join-Path $PSScriptRoot "deploy\prometheus-helm-values.yaml")
Write-Host " Waiting for Prometheus server..."
Wait-PodsReady -Selector "app.kubernetes.io/name=prometheus" -Namespace "monitoring" -Label "Prometheus"

# --- 4. Deploy Grafana ---
Write-Host "[4/5] Deploying Grafana..." -ForegroundColor Yellow
# Update dashboard ConfigMap
# No trailing backslash on the directory: a backslash directly before the
# closing quote can swallow the quote when the argument reaches a native exe.
$dashboardDir = Join-Path $PSScriptRoot "deploy\grafana\provisioning\dashboards\json"
kubectl delete configmap grafana-dashboards -n monitoring 2>$null
kubectl create configmap grafana-dashboards -n monitoring "--from-file=$dashboardDir"

helm upgrade --install grafana grafana/grafana -n monitoring -f (Join-Path $PSScriptRoot "deploy\grafana-helm-values.yaml")
Write-Host " Waiting for Grafana..."
Wait-PodsReady -Selector "app.kubernetes.io/name=grafana" -Namespace "monitoring" -Label "Grafana"

# --- 5. Start Grafana port-forward ---
Write-Host "[5/5] Grafana port-forward..." -ForegroundColor Yellow
if (-not $SkipPortForward) {
    # Kill any existing listener on the same port.
    # Do not store the id in $pid: $PID is a read-only automatic variable and
    # assigning to it fails.
    $listeners = Get-NetTCPConnection -LocalPort $GrafanaPort -State Listen -ErrorAction SilentlyContinue
    if ($listeners) {
        $ownerIds = $listeners | Select-Object -ExpandProperty OwningProcess -Unique
        foreach ($ownerId in $ownerIds) {
            Stop-Process -Id $ownerId -Force -ErrorAction SilentlyContinue
        }
        Start-Sleep -Seconds 1
    }

    Write-Host " Starting port-forward on localhost:$GrafanaPort..."
    $proc = Start-Process -PassThru -NoNewWindow kubectl -ArgumentList "port-forward","-n","monitoring","svc/grafana","${GrafanaPort}:80"

    Start-Sleep -Seconds 2
    # Verify the port-forward is alive
    try {
        $null = Invoke-WebRequest -Uri "http://localhost:$GrafanaPort/api/health" -UseBasicParsing -TimeoutSec 5
        Write-Host ""
        Write-Host "=== Deploy Complete ===" -ForegroundColor Green
        Write-Host " Grafana: http://localhost:$GrafanaPort (admin / antiatropos)"
        Write-Host " Dashboards: AntiAtropos Overview, AntiAtropos Live Control Plane"
        Write-Host " Port-forward PID: $($proc.Id)"
        Write-Host ""
        Write-Host "To stop port-forward: Stop-Process -Id $($proc.Id)"
    } catch {
        Write-Host "WARNING: Port-forward started but Grafana not reachable yet. Try: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
    }
} else {
    Write-Host ""
    Write-Host "=== Deploy Complete ===" -ForegroundColor Green
    Write-Host " To access Grafana: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
}
|
deploy/LOCAL_LAPTOP_FASTAPI_GUIDE.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local Laptop Kubernetes Control with FastAPI
|
| 2 |
+
|
| 3 |
+
This guide uses your local manifest [deploy/local-laptop.yaml](deploy/local-laptop.yaml) and a lightweight server [server/local_laptop_control.py](server/local_laptop_control.py).
|
| 4 |
+
|
| 5 |
+
## 1) Deploy local workloads
|
| 6 |
+
|
| 7 |
+
```powershell
|
| 8 |
+
kubectl apply -f deploy/local-laptop.yaml
|
| 9 |
+
kubectl get deploy -n prod-sre
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
Expected deployments:
|
| 13 |
+
- `auth`
|
| 14 |
+
- `cart`
|
| 15 |
+
- `catalog`
|
| 16 |
+
- `checkout`
|
| 17 |
+
- `payments`
|
| 18 |
+
|
| 19 |
+
## 2) Set required environment variables
|
| 20 |
+
|
| 21 |
+
The controller requires `KUBECONFIG` and `ANTIATROPOS_WORKLOAD_MAP`.
|
| 22 |
+
|
| 23 |
+
```powershell
|
| 24 |
+
$env:KUBECONFIG = "$HOME/.kube/config"
|
| 25 |
+
$env:ANTIATROPOS_K8S_NAMESPACE = "prod-sre"
|
| 26 |
+
$env:ANTIATROPOS_MIN_REPLICAS = "1"
|
| 27 |
+
$env:ANTIATROPOS_MAX_REPLICAS = "" # empty => unbounded scale-up
|
| 28 |
+
$env:ANTIATROPOS_SCALE_STEP = "3"
|
| 29 |
+
$env:ANTIATROPOS_WORKLOAD_MAP = '{"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}'
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
If you already have these in [.env](.env), load them first.
|
| 33 |
+
|
| 34 |
+
## 3) Start lightweight FastAPI server
|
| 35 |
+
|
| 36 |
+
```powershell
|
| 37 |
+
uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## 4) Validate server health
|
| 41 |
+
|
| 42 |
+
```powershell
|
| 43 |
+
Invoke-RestMethod http://localhost:8010/health
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Check:
|
| 47 |
+
- `is_mock` should be `False`
|
| 48 |
+
- `mapped_targets` should include `node-0`..`node-4`
|
| 49 |
+
|
| 50 |
+
## 5) Let your agent execute actions
|
| 51 |
+
|
| 52 |
+
The server accepts `POST /step` with:
|
| 53 |
+
- `action_type`: `NO_OP` | `SCALE_UP` | `SCALE_DOWN`
|
| 54 |
+
- `target_node_id`: `node-*`
|
| 55 |
+
- `parameter`: float
|
| 56 |
+
|
| 57 |
+
Example:
|
| 58 |
+
|
| 59 |
+
```powershell
|
| 60 |
+
Invoke-RestMethod -Method Post -Uri http://localhost:8010/step -ContentType "application/json" -Body '{"action_type":"SCALE_UP","target_node_id":"node-3","parameter":0.6}'
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## 6) Verify Kubernetes effect
|
| 64 |
+
|
| 65 |
+
```powershell
|
| 66 |
+
kubectl get deploy cart -n prod-sre
|
| 67 |
+
kubectl get deploy -n prod-sre
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Notes
|
| 71 |
+
|
| 72 |
+
- This controller is intentionally minimal and does not provide simulator rewards.
|
| 73 |
+
- It is suitable for direct action execution tests from your agent.
|
| 74 |
+
- If you need OpenEnv-compatible `/reset` + `/step` + reward loop, use [server/app.py](server/app.py) in `aws` mode.
|
deploy/aws/ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos Architecture Guide
|
| 2 |
+
|
| 3 |
+
A complete explanation of how AntiAtropos works across Hugging Face Spaces and AWS, written for someone who is technically strong but new to Kubernetes.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## The Big Picture
|
| 8 |
+
|
| 9 |
+
AntiAtropos trains AI agents to be Site Reliability Engineers (SREs). An SRE agent watches a simulated microservice cluster and decides when to scale services, reroute traffic, or shed load to keep things running smoothly.
|
| 10 |
+
|
| 11 |
+
The system is split across two platforms:
|
| 12 |
+
|
| 13 |
+
```
|
| 14 |
+
Hugging Face Spaces AWS
|
| 15 |
+
===================== ======================
|
| 16 |
+
The "brain" The "muscle"
|
| 17 |
+
|
| 18 |
+
AntiAtropos FastAPI server EKS (Kubernetes cluster)
|
| 19 |
+
- Runs the simulator - Runs the actual microservice pods
|
| 20 |
+
- Runs the SRE agent logic - The agent scales these pods
|
| 21 |
+
- Queries Prometheus for metrics - Prometheus Agent scrapes metrics
|
| 22 |
+
- Sends scale commands to K8s - Metrics flow to AMP
|
| 23 |
+
- Grafana (AMG) visualizes it all
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
Why split? HF Spaces is free/cheap for running the Python server. AWS EKS is where the real infrastructure lives that the agent practices on.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## Kubernetes Concepts You Need
|
| 31 |
+
|
| 32 |
+
### Pod
|
| 33 |
+
|
| 34 |
+
The smallest unit in Kubernetes. A pod is one or more containers that run together. In our case, each pod runs a single nginx container that simulates a microservice (like "payments" or "checkout").
|
| 35 |
+
|
| 36 |
+
Think of it as: one running instance of a service.
|
| 37 |
+
|
| 38 |
+
### Deployment
|
| 39 |
+
|
| 40 |
+
A Deployment is a recipe that tells Kubernetes "keep N copies of this pod running at all times." If a pod dies, the Deployment automatically replaces it.
|
| 41 |
+
|
| 42 |
+
The key field is `spec.replicas` — this is the number the SRE agent changes when it scales a service up or down.
|
| 43 |
+
|
| 44 |
+
```
|
| 45 |
+
Deployment: payments
|
| 46 |
+
replicas: 3 <-- the agent changes this number
|
| 47 |
+
|
|
| 48 |
+
+-- Pod: payments-abc123 (running)
|
| 49 |
+
+-- Pod: payments-def456 (running)
|
| 50 |
+
+-- Pod: payments-ghi789 (running)
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
**The agent scales replicas, not pods.** When it sets `replicas: 5`, Kubernetes makes sure 5 pods are running, creating new ones as needed. When it later sets `replicas: 2`, Kubernetes terminates 3 of those pods.
|
| 54 |
+
|
| 55 |
+
### Service
|
| 56 |
+
|
| 57 |
+
A Service gives pods a stable network name. Instead of connecting to `payments-abc123` directly (which changes when the pod is recreated), you connect to `payments` (the Service), which routes to whichever pods are healthy.
|
| 58 |
+
|
| 59 |
+
### Namespace
|
| 60 |
+
|
| 61 |
+
A namespace is a folder for organizing resources. We use:
|
| 62 |
+
- `prod-sre` — where the 5 microservice Deployments live
|
| 63 |
+
- `monitoring` — where the Prometheus Agent pod lives
|
| 64 |
+
- `kube-system` — where AWS/EKS system pods live
|
| 65 |
+
|
| 66 |
+
### Node
|
| 67 |
+
|
| 68 |
+
A node is one EC2 virtual machine in the EKS cluster. Our cluster has 2-4 nodes. Each node runs multiple pods. When all nodes are full and the agent wants to scale up, the Cluster Autoscaler adds more nodes (up to `maxSize: 4` in our config).
|
| 69 |
+
|
| 70 |
+
```
|
| 71 |
+
EKS Cluster
|
| 72 |
+
Node 1 (t3.medium - 2 vCPU, 4GB RAM)
|
| 73 |
+
Pod: payments-abc123
|
| 74 |
+
Pod: checkout-def456
|
| 75 |
+
Pod: catalog-ghi789
|
| 76 |
+
Pod: prometheus-agent-xyz
|
| 77 |
+
Node 2 (t3.medium - 2 vCPU, 4GB RAM)
|
| 78 |
+
Pod: payments-jkl012 <-- agent scaled payments from 1 to 2
|
| 79 |
+
Pod: cart-mno345
|
| 80 |
+
Pod: auth-pqr678
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
### ResourceQuota
|
| 84 |
+
|
| 85 |
+
A hard limit on how many resources a namespace can use. We set one on `prod-sre` that caps total pods at 30. This is a safety net — even if the Python code cap fails, Kubernetes itself will refuse to create more than 30 pods.
|
| 86 |
+
|
| 87 |
+
---
|
| 88 |
+
|
| 89 |
+
## How the SRE Agent Works
|
| 90 |
+
|
| 91 |
+
### The Loop
|
| 92 |
+
|
| 93 |
+
Every "tick" (one step of the simulation), the agent goes through this cycle:
|
| 94 |
+
|
| 95 |
+
```
|
| 96 |
+
1. OBSERVE -- Read telemetry (CPU, latency, queue depth) from Prometheus
|
| 97 |
+
2. DECIDE -- Choose an action (SCALE_UP, SCALE_DOWN, REROUTE_TRAFFIC, SHED_LOAD, NO_OP)
|
| 98 |
+
3. ACT -- Send the action to KubernetesExecutor
|
| 99 |
+
4. REWARD -- Compute Lyapunov stability reward (was the cluster more or less stable?)
|
| 100 |
+
5. REPEAT
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
### How Each Action Works
|
| 104 |
+
|
| 105 |
+
| Action | What the Agent Decides | What Happens on EKS |
|
| 106 |
+
|---|---|---|
|
| 107 |
+
| `SCALE_UP` | "node-0 needs more capacity" | `KubernetesExecutor` patches `payments` Deployment: `replicas: 2 -> 5` |
|
| 108 |
+
| `SCALE_DOWN` | "node-3 is over-provisioned" | `KubernetesExecutor` patches `cart` Deployment: `replicas: 4 -> 1` |
|
| 109 |
+
| `REROUTE_TRAFFIC` | "Move traffic away from node-2" | Currently simulation-only (no live K8s ingress patching) |
|
| 110 |
+
| `SHED_LOAD` | "Drop 50% of traffic to node-3" | Currently simulation-only (no live K8s traffic shaping) |
|
| 111 |
+
| `NO_OP` | "Do nothing this tick" | Nothing changes on EKS |
|
| 112 |
+
|
| 113 |
+
### The SCALE_UP Flow in Detail
|
| 114 |
+
|
| 115 |
+
Here is exactly what happens when the agent decides to scale up `node-0` (the payments service):
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
HF Spaces AWS EKS
|
| 119 |
+
---------- --------
|
| 120 |
+
|
| 121 |
+
Agent: "SCALE_UP, node-0, parameter=0.5"
|
| 122 |
+
|
|
| 123 |
+
v
|
| 124 |
+
AntiAtroposEnvironment.step()
|
| 125 |
+
|
|
| 126 |
+
v
|
| 127 |
+
KubernetesExecutor.execute_with_metadata()
|
| 128 |
+
|
|
| 129 |
+
v
|
| 130 |
+
_load_node_workload_map()
|
| 131 |
+
reads: node-0 -> {"deployment": "payments", "namespace": "prod-sre"}
|
| 132 |
+
|
|
| 133 |
+
v
|
| 134 |
+
_scale_deployment("SCALE_UP", "node-0", 0.5)
|
| 135 |
+
|
|
| 136 |
+
+-- 1. Read current replicas: apps_v1.read_namespaced_deployment_scale("payments", "prod-sre")
|
| 137 |
+
| Current replicas = 2
|
| 138 |
+
|
|
| 139 |
+
+-- 2. Calculate delta: max(1, int(0.5 * 3)) = 1
|
| 140 |
+
| Desired = min(6, 2 + 1) = 3 <-- max_replicas cap from env var
|
| 141 |
+
|
|
| 142 |
+
+-- 3. Patch: apps_v1.patch_namespaced_deployment_scale("payments", "prod-sre",
|
| 143 |
+
| body={"spec": {"replicas": 3}})
|
| 144 |
+
|
|
| 145 |
+
v +---------------------------+
|
| 146 |
+
Returns: "Ack: SCALE_UP for node-0 - | K8s creates 1 new pod: |
|
| 147 |
+
deployment payments in namespace | payments-newpod-xyz |
|
| 148 |
+
prod-sre scaled 2->3" +---------------------------+
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
### The Telemetry Flow in Detail
|
| 152 |
+
|
| 153 |
+
How the agent reads metrics from the real cluster:
|
| 154 |
+
|
| 155 |
+
```
|
| 156 |
+
EKS Cluster AMP HF Spaces
|
| 157 |
+
----------- --- ----------
|
| 158 |
+
|
| 159 |
+
Workload pods AMP Workspace AntiAtropos
|
| 160 |
+
(payments, checkout...) stores all metrics PrometheusClient
|
| 161 |
+
| ^ |
|
| 162 |
+
| /metrics (scraped every 15s) | |
|
| 163 |
+
v | |
|
| 164 |
+
Prometheus Agent | |
|
| 165 |
+
| | |
|
| 166 |
+
| remote-write (SigV4 auth) | |
|
| 167 |
+
+-------------------------------------------> |
|
| 168 |
+
| |
|
| 169 |
+
| HTTPS query |
|
| 170 |
+
+------------------------>
|
| 171 |
+
(PROMETHEUS_URL env var)
|
| 172 |
+
|
|
| 173 |
+
v
|
| 174 |
+
_fetch_real_metrics()
|
| 175 |
+
runs PromQL like:
|
| 176 |
+
sum(rate(http_requests_total[1m])) by (pod)
|
| 177 |
+
returns: TelemetryRecord for each node
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
## The Three Layers of Scaling Caps
|
| 183 |
+
|
| 184 |
+
This is the most important thing to understand for cost control. There are **three** independent limits:
|
| 185 |
+
|
| 186 |
+
### Layer 1: Python Code Cap (Soft)
|
| 187 |
+
|
| 188 |
+
**Where:** `ANTIATROPOS_MAX_REPLICAS` env var on HF Spaces, read by `kubernetes_executor.py` line 18.
|
| 189 |
+
|
| 190 |
+
**How it works:** The `_scale_deployment()` method calculates `desired = min(self.max_replicas, current + delta)`. If the agent tries to scale above 6, it gets:
|
| 191 |
+
|
| 192 |
+
```
|
| 193 |
+
Ack: SCALE_UP for node-0 - replicas unchanged at 6 (bounds 1-6)
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
**Can it be bypassed?** Yes — by a bug in the code, or by someone running `kubectl scale deployment payments --replicas=50` directly against the cluster.
|
| 197 |
+
|
| 198 |
+
**Set to:** `6` on HF Spaces.
|
| 199 |
+
|
| 200 |
+
### Layer 2: Kubernetes ResourceQuota (Hard)
|
| 201 |
+
|
| 202 |
+
**Where:** `k8s-workloads.yaml` — ResourceQuota on the `prod-sre` namespace.
|
| 203 |
+
|
| 204 |
+
**How it works:** The Kubernetes API server itself rejects the creation of any pod that would exceed the quota (the pod is never created, so it never reaches the scheduler). If the namespace already has 30 pods and something tries to create a 31st:
|
| 205 |
+
|
| 206 |
+
```
|
| 207 |
+
Error from server (Forbidden): pods "payments-new" is forbidden:
|
| 208 |
+
exceeded quota: prod-sre-quota, requested: pods=1, used: pods=30, limited: pods=30
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
**Can it be bypassed?** Only by someone with cluster-admin access who deletes or edits the ResourceQuota.
|
| 212 |
+
|
| 213 |
+
**Set to:** 30 pods total, 8 CPU, 8GB RAM.
|
| 214 |
+
|
| 215 |
+
### Layer 3: EKS Node Group Max Size (Hard)
|
| 216 |
+
|
| 217 |
+
**Where:** `eksctl-cluster.yaml` — `managedNodeGroups[0].maxSize: 4`.
|
| 218 |
+
|
| 219 |
+
**How it works:** The Cluster Autoscaler will never grow the node group beyond 4 nodes in total. Even if there are 100 pending pods, it stops at 4 nodes. Pending pods just wait.
|
| 220 |
+
|
| 221 |
+
**Can it be bypassed?** Only by someone editing the node group in the AWS console.
|
| 222 |
+
|
| 223 |
+
**Set to:** 4 nodes (4 x t3.medium = 8 vCPU, 16GB RAM max).
|
| 224 |
+
|
| 225 |
+
### How the Three Layers Work Together
|
| 226 |
+
|
| 227 |
+
```
|
| 228 |
+
Agent wants to scale all 5 deployments to 20 replicas each:
|
| 229 |
+
|
| 230 |
+
Layer 1 (Python cap): 6 replicas max per deployment -> agent gets "unchanged at 6"
|
| 231 |
+
5 x 6 = 30 pods maximum
|
| 232 |
+
|
| 233 |
+
Layer 2 (ResourceQuota): 30 pods max in namespace -> 31st pod is Forbidden
|
| 234 |
+
|
| 235 |
+
Layer 3 (Node group): 4 nodes max -> if 30 pods don't fit on 4 nodes,
|
| 236 |
+
some stay Pending (no cost)
|
| 237 |
+
|
| 238 |
+
Worst case with all caps: 30 pods on 4 nodes = ~$160/month
|
| 239 |
+
Without any caps: 100 pods on 25 nodes = ~$1,800/month
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## The Mapping: Simulator Nodes to Real Deployments
|
| 245 |
+
|
| 246 |
+
The simulator has 5 abstract nodes (node-0 through node-4). The `ANTIATROPOS_WORKLOAD_MAP` env var tells the system which K8s Deployment each simulator node maps to:
|
| 247 |
+
|
| 248 |
+
```
|
| 249 |
+
Simulator Node K8s Deployment Namespace Notes
|
| 250 |
+
------------- --------------- --------- -----
|
| 251 |
+
node-0 payments prod-sre VIP (4x importance weight)
|
| 252 |
+
node-1 checkout prod-sre Critical (no SHED_LOAD)
|
| 253 |
+
node-2 catalog prod-sre Critical (no SHED_LOAD)
|
| 254 |
+
node-3 cart prod-sre Non-critical (sheddable)
|
| 255 |
+
node-4 auth prod-sre Non-critical (sheddable)
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
When the simulator says "SCALE_UP node-0 by 0.5", the system:
|
| 259 |
+
1. Looks up node-0 in the workload map -> `payments` in `prod-sre`
|
| 260 |
+
2. Calls `patch_namespaced_deployment_scale("payments", "prod-sre", ...)`
|
| 261 |
+
3. Kubernetes creates/destroys pods to match the new replica count
|
| 262 |
+
|
| 263 |
+
---
|
| 264 |
+
|
| 265 |
+
## What Runs Where (Complete List)
|
| 266 |
+
|
| 267 |
+
### On Hugging Face Spaces
|
| 268 |
+
|
| 269 |
+
| Component | What It Does | Port |
|
| 270 |
+
|---|---|---|
|
| 271 |
+
| FastAPI server (`server/app.py`) | HTTP API for the agent | 7860 (via NGINX) |
|
| 272 |
+
| Simulator (`simulator.py`) | 5-node microservice cluster simulation | Internal |
|
| 273 |
+
| PrometheusClient (`telemetry/prometheus_client.py`) | Queries AMP for real metrics | Outbound HTTPS |
|
| 274 |
+
| KubernetesExecutor (`control/kubernetes_executor.py`) | Sends scale commands to EKS | Outbound HTTPS |
|
| 275 |
+
| Prometheus metrics exporter | Serves `/metrics` for HF's monitoring | 8000 |
|
| 276 |
+
| Grafana + local Prometheus | Local dashboards (from the Dockerfile) | 3000, 9090 |
|
| 277 |
+
|
| 278 |
+
### On AWS EKS
|
| 279 |
+
|
| 280 |
+
| Component | Namespace | What It Does |
|
| 281 |
+
|---|---|---|
|
| 282 |
+
| payments Deployment | prod-sre | 2 nginx pods (scales with agent) |
|
| 283 |
+
| checkout Deployment | prod-sre | 1 nginx pod (scales with agent) |
|
| 284 |
+
| catalog Deployment | prod-sre | 1 nginx pod (scales with agent) |
|
| 285 |
+
| cart Deployment | prod-sre | 1 nginx pod (scales with agent) |
|
| 286 |
+
| auth Deployment | prod-sre | 1 nginx pod (scales with agent) |
|
| 287 |
+
| Prometheus Agent | monitoring | Scrapes workload pods, remote-writes to AMP |
|
| 288 |
+
| Cluster Autoscaler | kube-system | Adds/removes EC2 nodes based on demand |
|
| 289 |
+
|
| 290 |
+
### On AWS Managed Services
|
| 291 |
+
|
| 292 |
+
| Service | What It Does |
|
| 293 |
+
|---|---|
|
| 294 |
+
| AMP (Amazon Managed Prometheus) | Stores all metrics. Queried by HF Spaces. |
|
| 295 |
+
| AMG (Amazon Managed Grafana) | Visualizes metrics in dashboards. Accessed via browser. |
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
## The Simulator vs Real Cluster
|
| 300 |
+
|
| 301 |
+
AntiAtropos has three modes controlled by `ANTIATROPOS_ENV_MODE`:
|
| 302 |
+
|
| 303 |
+
### Simulated Mode (`simulated`)
|
| 304 |
+
|
| 305 |
+
Everything is fake. The simulator generates synthetic metrics (random CPU, latency, etc.). No K8s, no Prometheus. The agent practices in a safe sandbox.
|
| 306 |
+
|
| 307 |
+
This is the default on HF Spaces without AWS configured.
|
| 308 |
+
|
| 309 |
+
### Hybrid Mode (`hybrid`)
|
| 310 |
+
|
| 311 |
+
The simulator runs, but it pulls real metrics from AMP to calibrate itself. If AMP says `payments` pods have 80% CPU, the simulator adjusts its internal model to match. The agent can read real data but actions only affect the simulator, not real pods.
|
| 312 |
+
|
| 313 |
+
### Live Mode (`live`)
|
| 314 |
+
|
| 315 |
+
The real deal. The agent reads real metrics from AMP and sends real scale commands to EKS. When it says `SCALE_UP`, actual pods get created on actual EC2 instances that cost actual money.
|
| 316 |
+
|
| 317 |
+
**Set `ANTIATROPOS_ENV_MODE=live` on HF Spaces to enable this.**
|
| 318 |
+
|
| 319 |
+
---
|
| 320 |
+
|
| 321 |
+
## Cost Flow
|
| 322 |
+
|
| 323 |
+
Every pod on EKS costs money. Here is how costs flow based on the agent's actions:
|
| 324 |
+
|
| 325 |
+
```
|
| 326 |
+
Agent action: SCALE_UP node-0
|
| 327 |
+
-> payments Deployment: replicas 2 -> 5
|
| 328 |
+
-> 3 new pods created
|
| 329 |
+
-> If existing nodes are full, Cluster Autoscaler adds a node
|
| 330 |
+
-> New node = another t3.medium EC2 instance = ~$0.04/hr
|
| 331 |
+
-> 3 pods running = 3 x (0.1 CPU + 64MB RAM) from the quota
|
| 332 |
+
|
| 333 |
+
Agent action: SCALE_DOWN node-3
|
| 334 |
+
-> cart Deployment: replicas 4 -> 1
|
| 335 |
+
-> 3 pods terminated
|
| 336 |
+
-> If nodes are now underutilized, Cluster Autoscaler removes a node (after 10 min)
|
| 337 |
+
-> One fewer EC2 instance = saves ~$0.04/hr
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
The Lyapunov reward function penalizes the agent for both instability AND cost, so a well-trained agent should learn to scale efficiently:
|
| 341 |
+
|
| 342 |
+
```
|
| 343 |
+
R_t = -(alpha * delta_V + beta * cost + gamma * SLA_violation)
|
| 344 |
+
^^^^
|
| 345 |
+
beta=0.01 penalizes over-provisioning
|
| 346 |
+
```
|
| 347 |
+
|
| 348 |
+
---
|
| 349 |
+
|
| 350 |
+
## Quick Reference: Key Files
|
| 351 |
+
|
| 352 |
+
| File | Purpose |
|
| 353 |
+
|---|---|
|
| 354 |
+
| `kubernetes_executor.py` | Translates agent actions to K8s API calls |
|
| 355 |
+
| `prometheus_client.py` | Queries AMP for real metrics |
|
| 356 |
+
| `simulator.py` | 5-node fluid-queue simulation |
|
| 357 |
+
| `stability.py` | Lyapunov reward computation |
|
| 358 |
+
| `deploy/aws/k8s-workloads.yaml` | The 5 Deployments + ResourceQuota on EKS |
|
| 359 |
+
| `deploy/aws/eksctl-cluster.yaml` | EKS cluster definition (nodes, caps) |
|
| 360 |
+
| `deploy/aws/prometheus-agent-values.yaml` | Helm config for Prometheus Agent |
|
| 361 |
+
| `deploy/aws/generate-kubeconfig.sh` | Creates kubeconfig for HF Spaces |
|
deploy/aws/FASTAPI_AWS_MODE_GUIDE.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FastAPI AWS Mode + Local Grafana Guide
|
| 2 |
+
|
| 3 |
+
This setup keeps Kubernetes + AMP in AWS, while Grafana runs on your laptop.
|
| 4 |
+
|
| 5 |
+
## 1) Environment file
|
| 6 |
+
|
| 7 |
+
Use [../../.env.example](../../.env.example) as template. A starter [../../.env](../../.env) is already created.
|
| 8 |
+
|
| 9 |
+
Important keys:
|
| 10 |
+
|
| 11 |
+
- `ANTIATROPOS_ENV_MODE=aws`
|
| 12 |
+
- `KUBECONFIG=.../deploy/aws/kubeconfig-antiatropos.yaml`
|
| 13 |
+
- `PROMETHEUS_URL=https://aps-workspaces.<region>.amazonaws.com/workspaces/<workspace-id>`
|
| 14 |
+
- `ANTIATROPOS_WORKLOAD_MAP=...`
|
| 15 |
+
- `ANTIATROPOS_GRAFANA_MODE=external`
|
| 16 |
+
|
| 17 |
+
## 2) Load .env in PowerShell
|
| 18 |
+
|
| 19 |
+
From workspace root:
|
| 20 |
+
|
| 21 |
+
```powershell
|
| 22 |
+
Get-Content .env | ForEach-Object {
|
| 23 |
+
if ($_ -match '^\s*#' -or $_ -match '^\s*$') { return }
|
| 24 |
+
$name, $value = $_ -split '=', 2
|
| 25 |
+
[System.Environment]::SetEnvironmentVariable($name, $value, 'Process')
|
| 26 |
+
}
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## 3) Start FastAPI server
|
| 30 |
+
|
| 31 |
+
```powershell
|
| 32 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## 4) Verify runtime wiring
|
| 36 |
+
|
| 37 |
+
Check runtime endpoint:
|
| 38 |
+
|
| 39 |
+
- [server/app.py](../../server/app.py) exposes `GET /config/runtime`
|
| 40 |
+
- Example URL: `http://localhost:8000/config/runtime`
|
| 41 |
+
|
| 42 |
+
You should see:
|
| 43 |
+
|
| 44 |
+
- `env_mode: "aws"`
|
| 45 |
+
- `prometheus_url_configured: true`
|
| 46 |
+
- `kubeconfig_configured: true`
|
| 47 |
+
- `workload_map_configured: true`
|
| 48 |
+
|
| 49 |
+
## 5) Reset environment in AWS mode
|
| 50 |
+
|
| 51 |
+
Use reset with `mode="aws"`, or omit mode and rely on `ANTIATROPOS_ENV_MODE=aws`.
|
| 52 |
+
|
| 53 |
+
## 6) Run Grafana locally (not in EKS)
|
| 54 |
+
|
| 55 |
+
```powershell
|
| 56 |
+
docker run -d --name antiatropos-grafana -p 3000:3000 grafana/grafana:latest
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
Open `http://localhost:3000` and add AMP as Prometheus datasource:
|
| 60 |
+
|
| 61 |
+
- URL: `https://aps-workspaces.<region>.amazonaws.com/workspaces/<workspace-id>`
|
| 62 |
+
- Auth: SigV4 enabled
|
| 63 |
+
- Region: your AWS region (for example `ap-south-1`)
|
| 64 |
+
|
| 65 |
+
Import dashboards:
|
| 66 |
+
|
| 67 |
+
- [../grafana/provisioning/dashboards/json/antiatropos-overview.json](../grafana/provisioning/dashboards/json/antiatropos-overview.json)
|
| 68 |
+
- [../grafana/provisioning/dashboards/json/antiatropos-live.json](../grafana/provisioning/dashboards/json/antiatropos-live.json)
|
| 69 |
+
|
| 70 |
+
## Notes
|
| 71 |
+
|
| 72 |
+
Grafana is observability-only. Agent control runs via FastAPI + Kubernetes executor.
|
deploy/aws/OPERATIONS.md
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos AWS Operations Guide
|
| 2 |
+
|
| 3 |
+
Everything you need to run the AWS infrastructure for AntiAtropos without blowing up your bill.
|
| 4 |
+
|
| 5 |
+
**Architecture: FastAPI on Hugging Face Spaces, EKS + AMP + AMG on AWS.**
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Table of Contents
|
| 10 |
+
|
| 11 |
+
1. [Replica Strategy & Caps](#1-replica-strategy--caps)
|
| 12 |
+
2. [Autoscaling Configuration](#2-autoscaling-configuration)
|
| 13 |
+
3. [Cost Guardrails](#3-cost-guardrails)
|
| 14 |
+
4. [Step-by-Step Deployment Walkthrough](#4-step-by-step-deployment-walkthrough)
|
| 15 |
+
5. [Configuring HF Spaces to Connect to AWS](#5-configuring-hf-spaces-to-connect-to-aws)
|
| 16 |
+
6. [Day-2 Operations](#6-day-2-operations)
|
| 17 |
+
7. [Teardown & Cost Recovery](#7-teardown--cost-recovery)
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 1. Replica Strategy & Caps
|
| 22 |
+
|
| 23 |
+
### What Runs Where
|
| 24 |
+
|
| 25 |
+
| Component | Where | Scaled By | Cost Impact |
|
| 26 |
+
|---|---|---|---|
|
| 27 |
+
| **AntiAtropos FastAPI server** | HF Spaces | HF auto-scales | $0-5/month (HF billing) |
|
| 28 |
+
| **Workload pods** (payments, checkout, etc.) | EKS | SRE agent via `KubernetesExecutor` | **HIGH** — this is where costs spiral |
|
| 29 |
+
| **Prometheus Agent** | EKS (monitoring ns) | Static (1 pod) | Low |
|
| 30 |
+
| **AMP** | AWS managed | Serverless | Pay per GB ingested |
|
| 31 |
+
| **AMG** | AWS managed | Serverless | Pay per editor |
|
| 32 |
+
|
| 33 |
+
### Workload Pod Replicas — Where Costs Spiral
|
| 34 |
+
|
| 35 |
+
The SRE agent's `SCALE_UP` action calls `KubernetesExecutor._scale_deployment()`, which patches `replicas` on real K8s Deployments. A bad agent can scale every deployment to the cap.
|
| 36 |
+
|
| 37 |
+
The `ANTIATROPOS_MAX_REPLICAS` env var (set on HF Spaces) is the **global** ceiling applied to all deployments. The default in `kubernetes_executor.py` is 20 — with 5 deployments, that's **100 pods** worst case. **Set it to 6.**
|
| 38 |
+
|
| 39 |
+
**Recommended caps by deployment:**
|
| 40 |
+
|
| 41 |
+
| Deployment | Min | Max Replicas | Reasoning |
|
| 42 |
+
|---|---|---|---|
|
| 43 |
+
| `payments` (node-0, VIP) | 2 | 6 | VIP node — needs redundancy, 6 is plenty for the traffic model |
|
| 44 |
+
| `checkout` (node-1) | 1 | 5 | Can burst but shouldn't stay high |
|
| 45 |
+
| `catalog` (node-2) | 1 | 5 | Same |
|
| 46 |
+
| `cart` (node-3) | 1 | 4 | Non-critical, sheddable |
|
| 47 |
+
| `auth` (node-4) | 1 | 4 | Non-critical, sheddable |
|
| 48 |
+
|
| 49 |
+
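**Total worst case with these per-deployment caps: 24 workload pods.** Note that `ANTIATROPOS_MAX_REPLICAS` is a single global ceiling, so with it set to 6 and no per-deployment limits the worst case is 5 x 6 = 30 pods.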
|
| 50 |
+
|
| 51 |
+
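At ~0.25 vCPU / 256MB per workload pod (nginx containers), that's ~6 vCPU and ~6GB RAM. A t3.medium has 2 vCPU / 4GB RAM, so this needs at least 3 nodes on CPU alone, and realistically the full 4-node `maxSize` once system pods and per-node reservations are counted.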
|
| 52 |
+
|
| 53 |
+
### How the Cap Works
|
| 54 |
+
|
| 55 |
+
The `KubernetesExecutor._scale_deployment()` method reads `ANTIATROPOS_MAX_REPLICAS` from the environment and refuses to scale above it:
|
| 56 |
+
|
| 57 |
+
```
|
| 58 |
+
Ack: SCALE_UP for node-0 - replicas unchanged at 6 (bounds 1-6)
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
This is enforced in code (`kubernetes_executor.py` line 115):
|
| 62 |
+
```python
|
| 63 |
+
desired = min(self.max_replicas, current + delta)
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
**Set `ANTIATROPOS_MAX_REPLICAS=6` on your HF Space.**
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## 2. Autoscaling Configuration
|
| 71 |
+
|
| 72 |
+
### EKS Node Autoscaling
|
| 73 |
+
|
| 74 |
+
The cluster needs to grow nodes when the agent scales workloads. Install the Cluster Autoscaler:
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
helm repo add autoscaler https://kubernetes.github.io/autoscaler
|
| 78 |
+
helm repo update
|
| 79 |
+
|
| 80 |
+
helm install cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 81 |
+
--namespace kube-system \
|
| 82 |
+
-f deploy/aws/cluster-autoscaler-values.yaml
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
**The node group `maxSize` in `eksctl-cluster.yaml` (4) is your ultimate cost ceiling.**
|
| 86 |
+
|
| 87 |
+
```
|
| 88 |
+
4 nodes x $0.0416/hr (t3.medium on-demand) = $0.1664/hr = ~$120/month max
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
With spot instances, this drops to ~$36/month max.
|
| 92 |
+
|
| 93 |
+
### What Happens When the Agent Scales Workloads
|
| 94 |
+
|
| 95 |
+
1. Agent on HF Spaces sends `SCALE_UP` action
|
| 96 |
+
2. `KubernetesExecutor._scale_deployment()` patches the Deployment's `spec.replicas` via EKS API server
|
| 97 |
+
3. Kubernetes scheduler tries to place the new pod
|
| 98 |
+
4. If no node has capacity -> pod is `Pending`
|
| 99 |
+
5. Cluster Autoscaler sees `Pending` pods -> adds a node (within `maxSize`)
|
| 100 |
+
6. If `maxSize` is hit -> pod stays `Pending` (agent action succeeded but pod won't schedule)
|
| 101 |
+
|
| 102 |
+
**This is why `maxSize` in the node group is your ultimate cost ceiling.**
|
| 103 |
+
|
| 104 |
+
---
|
| 105 |
+
|
| 106 |
+
## 3. Cost Guardrails
|
| 107 |
+
|
| 108 |
+
### Monthly Cost Caps by Tier
|
| 109 |
+
|
| 110 |
+
| Tier | Max Nodes | Max Workload Pods | Estimated Monthly Cost |
|
| 111 |
+
|---|---|---|---|
|
| 112 |
+
| **Dev/Testing** | 2 | 10 (2/deployment) | ~$80 |
|
| 113 |
+
| **Training** | 3 | 15 (3/deployment) | ~$130 |
|
| 114 |
+
| **Benchmark Suite** | 4 | 24 (~5/deployment) | ~$160 |
|
| 115 |
+
| **Unlimited (danger)** | inf | 100 (20/deployment) | $500+ |
|
| 116 |
+
|
| 117 |
+
### AWS Budgets — Get Alerts Before You Overspend
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
aws budgets create-budget \
|
| 121 |
+
--account-id $(aws sts get-caller-identity --query Account --output text) \
|
| 122 |
+
--budget '{
|
| 123 |
+
"BudgetName": "AntiAtropos-Monthly",
|
| 124 |
+
"BudgetLimit": {"Amount": "150", "Unit": "USD"},
|
| 125 |
+
"TimeUnit": "MONTHLY",
|
| 126 |
+
"CostFilters": {
|
| 127 |
+
"TagKeyValue": ["user:Project$AntiAtropos"]
|
| 128 |
+
},
|
| 129 |
+
"CostTypes": {
|
| 130 |
+
"IncludeTax": true,
|
| 131 |
+
"IncludeSubscription": true,
|
| 132 |
+
"UseBlended": false
|
| 133 |
+
}
|
| 134 |
+
}'
|
| 135 |
+
|
| 136 |
+
# Alert at 50%
|
| 137 |
+
aws budgets create-notification \
|
| 138 |
+
--account-id $(aws sts get-caller-identity --query Account --output text) \
|
| 139 |
+
--budget-name "AntiAtropos-Monthly" \
|
| 140 |
+
--notification '{"NotificationType":"ACTUAL","ComparisonOperator":"GREATER_THAN","Threshold":50}' \
|
| 141 |
+
--subscribers '[{"SubscriptionType":"EMAIL","Address":"your-email@example.com"}]'
|
| 142 |
+
|
| 143 |
+
# Alert at 80%
|
| 144 |
+
aws budgets create-notification \
|
| 145 |
+
--account-id $(aws sts get-caller-identity --query Account --output text) \
|
| 146 |
+
--budget-name "AntiAtropos-Monthly" \
|
| 147 |
+
--notification '{"NotificationType":"ACTUAL","ComparisonOperator":"GREATER_THAN","Threshold":80}' \
|
| 148 |
+
--subscribers '[{"SubscriptionType":"EMAIL","Address":"your-email@example.com"}]'
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
### Cost-Saving Checklist
|
| 152 |
+
|
| 153 |
+
- [ ] Use **spot instances** for node groups (60-70% cheaper, OK for training)
|
| 154 |
+
- [ ] Set `ANTIATROPOS_MAX_REPLICAS=6` on HF Spaces (not 20) to prevent agent runaway
|
| 155 |
+
- [ ] Cap node group `maxSize` at 4 (in `eksctl-cluster.yaml`)
|
| 156 |
+
- [ ] Set AWS Budget alert at $150/month
|
| 157 |
+
- [ ] Scale workloads to zero between runs: `kubectl scale deployment -n prod-sre --replicas=0 --all`
|
| 158 |
+
- [ ] Delete the cluster for multi-day breaks: `eksctl delete cluster --name antiatropos`
|
| 159 |
+
- [ ] AMP free tier covers first 10GB ingest/month
|
| 160 |
+
- [ ] AMG free tier is 1 editor for 30 days — cancel if not needed
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
## 4. Step-by-Step Deployment Walkthrough
|
| 165 |
+
|
| 166 |
+
### Before You Start
|
| 167 |
+
|
| 168 |
+
You need:
|
| 169 |
+
- AWS account with billing alerts enabled
|
| 170 |
+
- AWS CLI v2 installed and configured (`aws configure`)
|
| 171 |
+
- eksctl, kubectl, helm installed
|
| 172 |
+
- About 20-30 minutes
|
| 173 |
+
|
| 174 |
+
### Step 1: Create the EKS Cluster (15 min)
|
| 175 |
+
|
| 176 |
+
```bash
|
| 177 |
+
eksctl create cluster -f deploy/aws/eksctl-cluster.yaml
|
| 178 |
+
|
| 179 |
+
# Verify
|
| 180 |
+
aws eks update-kubeconfig --name antiatropos --region ap-south-1
|
| 181 |
+
kubectl get nodes
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
### Step 2: Deploy Sample Workloads (1 min)
|
| 185 |
+
|
| 186 |
+
```bash
|
| 187 |
+
kubectl apply -f deploy/aws/k8s-workloads.yaml
|
| 188 |
+
kubectl get pods -n prod-sre
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
### Step 3: Create AMP Workspace (1 min)
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
aws amp create-workspace --alias antiatropos-metrics --region ap-south-1
|
| 195 |
+
|
| 196 |
+
# Note the workspace ID
|
| 197 |
+
aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
### Step 4: Set Up IRSA (2 min)
|
| 201 |
+
|
| 202 |
+
```bash
|
| 203 |
+
# Prometheus agent needs to write to AMP
|
| 204 |
+
eksctl create iamserviceaccount \
|
| 205 |
+
--cluster antiatropos \
|
| 206 |
+
--namespace monitoring \
|
| 207 |
+
--name prometheus-sa \
|
| 208 |
+
--attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
|
| 209 |
+
--approve
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Step 5: Install Prometheus Agent (2 min)
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
| 216 |
+
helm repo update
|
| 217 |
+
|
| 218 |
+
# Replace WORKSPACE_ID
|
| 219 |
+
helm install prometheus-agent prometheus-community/prometheus \
|
| 220 |
+
--namespace monitoring --create-namespace \
|
| 221 |
+
-f deploy/aws/prometheus-agent-values.yaml \
|
| 222 |
+
--set "prometheus.prometheusSpec.remoteWrite[0].url=https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
### Step 6: Set Up AMG (5 min)
|
| 226 |
+
|
| 227 |
+
```bash
|
| 228 |
+
# Create IAM role for AMG
|
| 229 |
+
aws iam create-role \
|
| 230 |
+
--role-name AntiAtroposGrafanaRole \
|
| 231 |
+
--assume-role-policy-document file://deploy/aws/grafana-trust-policy.json
|
| 232 |
+
|
| 233 |
+
aws iam attach-role-policy \
|
| 234 |
+
--role-name AntiAtroposGrafanaRole \
|
| 235 |
+
--policy-arn arn:aws:iam::aws:policy/AmazonPrometheusQueryAccess
|
| 236 |
+
|
| 237 |
+
# Create workspace
|
| 238 |
+
aws grafana create-workspace \
|
| 239 |
+
--workspace-name antiatropos-dashboards \
|
| 240 |
+
--account-access-type CURRENT_ACCOUNT \
|
| 241 |
+
--authentication-method AWS_SSO \
|
| 242 |
+
--permission-type SERVICE_MANAGED \
|
| 243 |
+
--data-sources PROMETHEUS \
|
| 244 |
+
--region ap-south-1
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
Then in the AMG web UI:
|
| 248 |
+
1. Sign in with AWS SSO
|
| 249 |
+
2. Configuration -> Data Sources -> Add AMP workspace
|
| 250 |
+
3. Dashboards -> Import -> Upload JSON from `deploy/grafana/provisioning/dashboards/json/`
|
| 251 |
+
4. Select AMP data source when importing
|
| 252 |
+
|
| 253 |
+
### Step 7: Install Cluster Autoscaler (2 min)
|
| 254 |
+
|
| 255 |
+
```bash
|
| 256 |
+
helm repo add autoscaler https://kubernetes.github.io/autoscaler
|
| 257 |
+
helm repo update
|
| 258 |
+
|
| 259 |
+
helm install cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 260 |
+
--namespace kube-system \
|
| 261 |
+
-f deploy/aws/cluster-autoscaler-values.yaml
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
### Step 8: Generate Kubeconfig for HF Spaces (1 min)
|
| 265 |
+
|
| 266 |
+
```bash
|
| 267 |
+
./deploy/aws/generate-kubeconfig.sh
|
| 268 |
+
# Outputs: deploy/aws/kubeconfig-antiatropos.yaml
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
### Step 9: Configure HF Spaces
|
| 272 |
+
|
| 273 |
+
See [Section 5](#5-configuring-hf-spaces-to-connect-to-aws) below.
|
| 274 |
+
|
| 275 |
+
---
|
| 276 |
+
|
| 277 |
+
## 5. Configuring HF Spaces to Connect to AWS
|
| 278 |
+
|
| 279 |
+
### Secrets (HF Space Settings -> Repository secrets)
|
| 280 |
+
|
| 281 |
+
| Secret | Value |
|
| 282 |
+
|---|---|
|
| 283 |
+
| `OPENAI_API_KEY` | Your OpenAI API key |
|
| 284 |
+
| `KUBECONFIG_CONTENT` | Base64-encoded content of `kubeconfig-antiatropos.yaml` |
|
| 285 |
+
|
| 286 |
+
To encode the kubeconfig:
|
| 287 |
+
```bash
|
| 288 |
+
cat deploy/aws/kubeconfig-antiatropos.yaml | base64 -w 0
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
### Environment Variables (HF Space Settings -> Variables)
|
| 292 |
+
|
| 293 |
+
| Variable | Value |
|
| 294 |
+
|---|---|
|
| 295 |
+
| `ANTIATROPOS_ENV_MODE` | `live` |
|
| 296 |
+
| `ANTIATROPOS_STRICT_REAL` | `false` |
|
| 297 |
+
| `PROMETHEUS_URL` | `https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID` |
|
| 298 |
+
| `KUBECONFIG` | `/app/kubeconfig.yaml` |
|
| 299 |
+
| `ANTIATROPOS_K8S_NAMESPACE` | `prod-sre` |
|
| 300 |
+
| `ANTIATROPOS_DEPLOYMENT_PREFIX` | `` (empty) |
|
| 301 |
+
| `ANTIATROPOS_MIN_REPLICAS` | `1` |
|
| 302 |
+
| `ANTIATROPOS_MAX_REPLICAS` | `6` |
|
| 303 |
+
| `ANTIATROPOS_SCALE_STEP` | `3` |
|
| 304 |
+
| `ANTIATROPOS_PROM_TIMEOUT_S` | `5.0` |
|
| 305 |
+
| `ANTIATROPOS_METRIC_AGGREGATION` | `sum` |
|
| 306 |
+
| `ANTIATROPOS_WORKLOAD_MAP` | See below |
|
| 307 |
+
|
| 308 |
+
### Workload Map Value
|
| 309 |
+
|
| 310 |
+
```json
|
| 311 |
+
{
|
| 312 |
+
"node-0": {"deployment": "payments", "namespace": "prod-sre"},
|
| 313 |
+
"node-1": {"deployment": "checkout", "namespace": "prod-sre"},
|
| 314 |
+
"node-2": {"deployment": "catalog", "namespace": "prod-sre"},
|
| 315 |
+
"node-3": {"deployment": "cart", "namespace": "prod-sre"},
|
| 316 |
+
"node-4": {"deployment": "auth", "namespace": "prod-sre"}
|
| 317 |
+
}
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
### Entrypoint Modification
|
| 321 |
+
|
| 322 |
+
Add this to `deploy/entrypoint.sh` before the uvicorn line, so the kubeconfig is decoded from the HF secret:
|
| 323 |
+
|
| 324 |
+
```bash
|
| 325 |
+
# Decode kubeconfig from HF Spaces secret
|
| 326 |
+
if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
|
| 327 |
+
echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
|
| 328 |
+
export KUBECONFIG=/app/kubeconfig.yaml
|
| 329 |
+
fi
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
### Verifying the Connection
|
| 333 |
+
|
| 334 |
+
After deploying, check from HF Spaces that the server can reach AWS:
|
| 335 |
+
|
| 336 |
+
1. Check the HF Space logs for `antiatropos_step` events
|
| 337 |
+
2. Look for `Ack: SCALE_UP` messages (agent is reaching EKS)
|
| 338 |
+
3. Look for non-zero `request_rate` / `cpu_utilization` (PrometheusClient is reaching AMP)
|
| 339 |
+
4. If `ANTIATROPOS_STRICT_REAL=false` (recommended), failures fall back to mock silently
|
| 340 |
+
|
| 341 |
+
---
|
| 342 |
+
|
| 343 |
+
## 6. Day-2 Operations
|
| 344 |
+
|
| 345 |
+
### Scaling Workloads Manually
|
| 346 |
+
|
| 347 |
+
```bash
|
| 348 |
+
# Scale a specific deployment
|
| 349 |
+
kubectl scale deployment/payments -n prod-sre --replicas=4
|
| 350 |
+
|
| 351 |
+
# Scale all workloads down
|
| 352 |
+
kubectl scale deployment -n prod-sre --replicas=0 --all
|
| 353 |
+
|
| 354 |
+
# Scale all workloads back up
|
| 355 |
+
kubectl scale deployment payments -n prod-sre --replicas=2
|
| 356 |
+
kubectl scale deployment checkout -n prod-sre --replicas=1
|
| 357 |
+
kubectl scale deployment catalog -n prod-sre --replicas=1
|
| 358 |
+
kubectl scale deployment cart -n prod-sre --replicas=1
|
| 359 |
+
kubectl scale deployment auth -n prod-sre --replicas=1
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
### Pausing Everything (Without Deleting)
|
| 363 |
+
|
| 364 |
+
```bash
|
| 365 |
+
# Scale all workloads to 0
|
| 366 |
+
kubectl scale deployment -n prod-sre --replicas=0 --all
|
| 367 |
+
|
| 368 |
+
# Note: EKS nodes still run and cost money.
|
| 369 |
+
# For real savings, delete the cluster (Section 7).
|
| 370 |
+
```
|
| 371 |
+
|
| 372 |
+
### Monitoring Agent Behavior
|
| 373 |
+
|
| 374 |
+
Watch what the SRE agent is doing in real-time:
|
| 375 |
+
|
| 376 |
+
```bash
|
| 377 |
+
# Check how many workload pods the agent has created
|
| 378 |
+
kubectl get deployments -n prod-sre
|
| 379 |
+
|
| 380 |
+
# Check current replica counts
|
| 381 |
+
kubectl get hpa -A # if any HPAs are defined
|
| 382 |
+
|
| 383 |
+
# Check node pressure
|
| 384 |
+
kubectl top nodes
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
### Checking Current Spend
|
| 388 |
+
|
| 389 |
+
```bash
|
| 390 |
+
# Current month cost by service
|
| 391 |
+
aws ce get-cost-and-usage \
|
| 392 |
+
--time-period Start=$(date +%Y-%m-01),End=$(date +%Y-%m-%d) \
|
| 393 |
+
--granularity MONTHLY \
|
| 394 |
+
--metrics BlendedCost \
|
| 395 |
+
--group-by Type=DIMENSION,Key=SERVICE
|
| 396 |
+
```
|
| 397 |
+
|
| 398 |
+
### Regenerating Kubeconfig
|
| 399 |
+
|
| 400 |
+
If the EKS cluster is recreated or credentials expire:
|
| 401 |
+
|
| 402 |
+
```bash
|
| 403 |
+
./deploy/aws/generate-kubeconfig.sh
|
| 404 |
+
# Re-upload the base64-encoded content to HF Spaces secret KUBECONFIG_CONTENT
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
---
|
| 408 |
+
|
| 409 |
+
## 7. Teardown & Cost Recovery
|
| 410 |
+
|
| 411 |
+
### Partial Teardown (Keep Cluster, Stop Workloads)
|
| 412 |
+
|
| 413 |
+
```bash
|
| 414 |
+
kubectl scale deployment -n prod-sre --replicas=0 --all
|
| 415 |
+
# Still paying for EKS control plane ($73/month) and idle nodes
|
| 416 |
+
```
|
| 417 |
+
|
| 418 |
+
### Full Teardown (Stop All Charges)
|
| 419 |
+
|
| 420 |
+
```bash
|
| 421 |
+
# Delete workloads
|
| 422 |
+
kubectl delete -f deploy/aws/k8s-workloads.yaml
|
| 423 |
+
|
| 424 |
+
# Delete Prometheus agent
|
| 425 |
+
helm uninstall prometheus-agent -n monitoring
|
| 426 |
+
kubectl delete namespace monitoring
|
| 427 |
+
|
| 428 |
+
# Delete AMP workspace
|
| 429 |
+
AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text)
|
| 430 |
+
aws amp delete-workspace --workspace-id $AMP_WS_ID --region ap-south-1
|
| 431 |
+
|
| 432 |
+
# Delete AMG workspace
|
| 433 |
+
AMG_WS_ID=$(aws grafana list-workspaces --region ap-south-1 --query "workspaces[?name=='antiatropos-dashboards'].id | [0]" --output text)
|
| 434 |
+
aws grafana delete-workspace --workspace-id $AMG_WS_ID --region ap-south-1
|
| 435 |
+
|
| 436 |
+
# Delete IAM role for Grafana
|
| 437 |
+
aws iam detach-role-policy --role-name AntiAtroposGrafanaRole --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusQueryAccess
|
| 438 |
+
aws iam detach-role-policy --role-name AntiAtroposGrafanaRole --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess 2>/dev/null || true  # not attached in Step 6; ignore NoSuchEntity
|
| 439 |
+
aws iam delete-role --role-name AntiAtroposGrafanaRole
|
| 440 |
+
|
| 441 |
+
# Delete the EKS cluster (10-15 min)
|
| 442 |
+
eksctl delete cluster --name antiatropos --region ap-south-1
|
| 443 |
+
|
| 444 |
+
# Verify nothing is left
|
| 445 |
+
aws eks list-clusters --region ap-south-1
|
| 446 |
+
aws amp list-workspaces --region ap-south-1
|
| 447 |
+
```
|
| 448 |
+
|
| 449 |
+
Also remove the `KUBECONFIG_CONTENT` secret and reset `PROMETHEUS_URL` to `mock` in your HF Space.
|
| 450 |
+
|
| 451 |
+
---
|
| 452 |
+
|
| 453 |
+
## Quick Reference Card
|
| 454 |
+
|
| 455 |
+
| Task | Command |
|
| 456 |
+
|---|---|
|
| 457 |
+
| Deploy AWS infra | `./deploy/aws/deploy.sh` |
|
| 458 |
+
| Check workloads | `kubectl get pods -n prod-sre` |
|
| 459 |
+
| Check monitoring | `kubectl get pods -n monitoring` |
|
| 460 |
+
| Scale a workload | `kubectl scale deployment/payments -n prod-sre --replicas=N` |
|
| 461 |
+
| Pause all workloads | `kubectl scale deployment -n prod-sre --replicas=0 --all` |
|
| 462 |
+
| Check AMP data | `awscurl --service aps "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WS_ID/api/v1/query?query=up" --region ap-south-1` |
|
| 463 |
+
| Generate kubeconfig | `./deploy/aws/generate-kubeconfig.sh` |
|
| 464 |
+
| Nuke the EKS cluster (AMP, AMG, and IAM cleanup: see Section 7) | `eksctl delete cluster --name antiatropos --region ap-south-1` |
|
| 465 |
+
|
deploy/aws/README.md
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos AWS Deployment Guide
|
| 2 |
+
|
| 3 |
+
Deploy the AWS infrastructure (EKS + AMP) that AntiAtropos on Hugging Face Spaces connects to.
|
| 4 |
+
|
| 5 |
+
For FastAPI wiring with `aws` mode and laptop Grafana, see [deploy/aws/FASTAPI_AWS_MODE_GUIDE.md](FASTAPI_AWS_MODE_GUIDE.md).
|
| 6 |
+
|
| 7 |
+
## Architecture
|
| 8 |
+
|
| 9 |
+
```
|
| 10 |
+
Hugging Face Spaces AWS Region (ap-south-1)
|
| 11 |
+
===================== ======================
|
| 12 |
+
┌─────────────────────────┐
|
| 13 |
+
│ EKS Cluster │
|
| 14 |
+
┌─────────────────┐ │ ├── Workload pods │
|
| 15 |
+
│ AntiAtropos │ PROMETHEUS_URL │ │ (payments, checkout │
|
| 16 |
+
│ FastAPI Server │───────────────────>│ │ catalog, cart, auth)│
|
| 17 |
+
│ (port 7860) │ (HTTPS + SigV4) │ ├── Prometheus Agent │
|
| 18 |
+
│ │ │ │ (scrapes workloads, │
|
| 19 |
+
│ │ KUBECONFIG │ │ remote-writes AMP) │
|
| 20 |
+
│ │───────────────────>│ ├── Grafana │
|
| 21 |
+
│ │ (EKS API server) │ │ (self-hosted, │
|
| 22 |
+
│ │ │ │ dashboards) │
|
| 23 |
+
│ │ │ └── Monitoring ns │
|
| 24 |
+
│ │ └─────────────────────────┘
|
| 25 |
+
│ │ ┌─────────────────────────┐
|
| 26 |
+
│ │ │ Amazon Managed │
|
| 27 |
+
│ │ │ Prometheus (AMP) │
|
| 28 |
+
│ │ │ Workspace: antiatropos │
|
| 29 |
+
│ │ └─────────────────────────┘
|
| 30 |
+
└─────────────────┘
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**Key principle: FastAPI runs on HF Spaces. AWS runs K8s workloads + AMP + self-hosted Grafana.**
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Phase 0: Prerequisites
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
# AWS CLI v2 (the commands in this block target Windows with Chocolatey; use your OS package manager elsewhere)
|
| 41 |
+
curl "https://awscli.amazonaws.com/AWSCLIV2.msi" -o "AWSCLIV2.msi"
|
| 42 |
+
msiexec /i AWSCLIV2.msi
|
| 43 |
+
|
| 44 |
+
# eksctl
|
| 45 |
+
choco install eksctl
|
| 46 |
+
|
| 47 |
+
# kubectl
|
| 48 |
+
choco install kubernetes-cli
|
| 49 |
+
|
| 50 |
+
# Helm
|
| 51 |
+
choco install kubernetes-helm
|
| 52 |
+
|
| 53 |
+
# Authenticate
|
| 54 |
+
aws configure
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## Phase 1: Create the EKS Cluster (15 min)
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
eksctl create cluster -f deploy/aws/eksctl-cluster.yaml
|
| 63 |
+
|
| 64 |
+
# Verify
|
| 65 |
+
aws eks update-kubeconfig --name antiatropos --region ap-south-1
|
| 66 |
+
kubectl get nodes
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
## Phase 2: Deploy Sample Workloads on EKS
|
| 72 |
+
|
| 73 |
+
These are the microservice deployments the SRE agent will scale up/down:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
kubectl apply -f deploy/aws/k8s-workloads.yaml
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
This creates 5 deployments in the `prod-sre` namespace:
|
| 80 |
+
- `payments` (node-0, VIP) — 2 replicas
|
| 81 |
+
- `checkout` (node-1) — 1 replica
|
| 82 |
+
- `catalog` (node-2) — 1 replica
|
| 83 |
+
- `cart` (node-3) — 1 replica
|
| 84 |
+
- `auth` (node-4) — 1 replica
|
| 85 |
+
|
| 86 |
+
Verify:
|
| 87 |
+
```bash
|
| 88 |
+
kubectl get pods -n prod-sre
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## Phase 3: Set Up Amazon Managed Prometheus (AMP)
|
| 94 |
+
|
| 95 |
+
### Create AMP Workspace
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
aws amp create-workspace \
|
| 99 |
+
--alias antiatropos-metrics \
|
| 100 |
+
--region ap-south-1
|
| 101 |
+
|
| 102 |
+
# Note the workspace ID
|
| 103 |
+
aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Set Up IRSA for Prometheus Agent
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
eksctl create iamserviceaccount \
|
| 110 |
+
--cluster antiatropos \
|
| 111 |
+
--namespace monitoring \
|
| 112 |
+
--name prometheus-sa \
|
| 113 |
+
--attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
|
| 114 |
+
--approve \
|
| 115 |
+
--override-existing-serviceaccounts
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
### Install Prometheus Agent on EKS
|
| 119 |
+
|
| 120 |
+
The agent scrapes workload pods and remote-writes metrics to AMP:
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
| 124 |
+
helm repo update
|
| 125 |
+
|
| 126 |
+
# Replace WORKSPACE_ID with your AMP workspace ID
|
| 127 |
+
helm install prometheus-agent prometheus-community/prometheus \
|
| 128 |
+
--namespace monitoring --create-namespace \
|
| 129 |
+
-f deploy/aws/prometheus-agent-values.yaml \
|
| 130 |
+
--set "prometheus.prometheusSpec.remoteWrite[0].url=https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
### Verify AMP is Receiving Data
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
pip install awscurl
|
| 137 |
+
awscurl --service aps "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query?query=up" --region ap-south-1
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## Phase 4 (Optional): Set Up Self-Hosted Grafana on EKS
|
| 143 |
+
|
| 144 |
+
If you are on free-tier nodes, skip this section and run Grafana locally on your laptop.
|
| 145 |
+
|
| 146 |
+
### Install Grafana
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
helm repo add grafana https://grafana.github.io/helm-charts
|
| 150 |
+
helm repo update
|
| 151 |
+
|
| 152 |
+
helm install grafana grafana/grafana \
|
| 153 |
+
--namespace monitoring \
|
| 154 |
+
-f deploy/aws/grafana-values.yaml
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
### Create Dashboard Secret
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
kubectl create secret generic antiatropos-grafana-dashboards \
|
| 161 |
+
--from-file=antiatropos-overview.json=deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json \
|
| 162 |
+
--from-file=antiatropos-live.json=deploy/grafana/provisioning/dashboards/json/antiatropos-live.json \
|
| 163 |
+
--namespace monitoring \
|
| 164 |
+
--dry-run=client -o yaml | kubectl apply -f -
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
### Access Grafana
|
| 168 |
+
|
| 169 |
+
```bash
|
| 170 |
+
kubectl port-forward svc/grafana 3000 -n monitoring
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
Open `http://localhost:3000` in your browser:
|
| 174 |
+
- Username: `admin`
|
| 175 |
+
- Password: `antiatropos`
|
| 176 |
+
|
| 177 |
+
The data source `AMP-Local` is pre-configured to use the local Prometheus agent, and dashboards are auto-imported from the secret.
|
| 178 |
+
|
| 179 |
+
---
|
| 180 |
+
|
| 181 |
+
## Phase 5: Generate Kubeconfig for HF Spaces
|
| 182 |
+
|
| 183 |
+
The AntiAtropos server on HF Spaces needs a kubeconfig to talk to EKS:
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
./deploy/aws/generate-kubeconfig.sh
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
This outputs `deploy/aws/kubeconfig-antiatropos.yaml`. You'll set this as a secret on HF Spaces.
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
## Phase 6: Configure HF Spaces Environment Variables
|
| 194 |
+
|
| 195 |
+
Set these in your HF Space (Settings → Repository secrets and Variables):
|
| 196 |
+
|
| 197 |
+
### Secrets
|
| 198 |
+
|
| 199 |
+
| Secret | Value |
|
| 200 |
+
|---|---|
|
| 201 |
+
| `OPENAI_API_KEY` | Your OpenAI API key |
|
| 202 |
+
| `KUBECONFIG_CONTENT` | Full content of `kubeconfig-antiatropos.yaml`, base64-encoded |
|
| 203 |
+
|
| 204 |
+
### Environment Variables
|
| 205 |
+
|
| 206 |
+
| Variable | Value |
|
| 207 |
+
|---|---|
|
| 208 |
+
| `ANTIATROPOS_ENV_MODE` | `aws` |
|
| 209 |
+
| `ANTIATROPOS_STRICT_REAL` | `false` |
|
| 210 |
+
| `PROMETHEUS_URL` | `https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID` |
|
| 211 |
+
| `KUBECONFIG` | `/app/kubeconfig.yaml` |
|
| 212 |
+
| `ANTIATROPOS_K8S_NAMESPACE` | `prod-sre` |
|
| 213 |
+
| `ANTIATROPOS_MAX_REPLICAS` | `6` |
|
| 214 |
+
| `ANTIATROPOS_MIN_REPLICAS` | `1` |
|
| 215 |
+
| `ANTIATROPOS_SCALE_STEP` | `3` |
|
| 216 |
+
| `ANTIATROPOS_PROM_TIMEOUT_S` | `5.0` |
|
| 217 |
+
| `ANTIATROPOS_METRIC_AGGREGATION` | `sum` |
|
| 218 |
+
| `ANTIATROPOS_WORKLOAD_MAP` | See below |
|
| 219 |
+
|
| 220 |
+
### Workload Map
|
| 221 |
+
|
| 222 |
+
```json
|
| 223 |
+
{
|
| 224 |
+
"node-0": {"deployment": "payments", "namespace": "prod-sre"},
|
| 225 |
+
"node-1": {"deployment": "checkout", "namespace": "prod-sre"},
|
| 226 |
+
"node-2": {"deployment": "catalog", "namespace": "prod-sre"},
|
| 227 |
+
"node-3": {"deployment": "cart", "namespace": "prod-sre"},
|
| 228 |
+
"node-4": {"deployment": "auth", "namespace": "prod-sre"}
|
| 229 |
+
}
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
### Entrypoint Addition
|
| 233 |
+
|
| 234 |
+
Add this to `deploy/entrypoint.sh` before starting uvicorn, so the kubeconfig is decoded from the HF secret:
|
| 235 |
+
|
| 236 |
+
```bash
|
| 237 |
+
# Decode kubeconfig from HF Spaces secret
|
| 238 |
+
if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
|
| 239 |
+
echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
|
| 240 |
+
export KUBECONFIG=/app/kubeconfig.yaml
|
| 241 |
+
fi
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
### FastAPI Reset Mode
|
| 245 |
+
|
| 246 |
+
Use `mode="aws"` on environment reset for AWS-backed execution. If omitted, the server will use `ANTIATROPOS_ENV_MODE`.
|
| 247 |
+
|
| 248 |
+
---
|
| 249 |
+
|
| 250 |
+
## Local Grafana (Recommended on Free Tier)
|
| 251 |
+
|
| 252 |
+
Grafana is only for observability dashboards. Agent action execution stays in FastAPI + Kubernetes executor.
|
| 253 |
+
|
| 254 |
+
Start Grafana locally:
|
| 255 |
+
|
| 256 |
+
```bash
|
| 257 |
+
docker run -d --name antiatropos-grafana -p 3000:3000 -e GF_AUTH_SIGV4_AUTH_ENABLED=true grafana/grafana:latest
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
Then in Grafana:
|
| 261 |
+
|
| 262 |
+
1. Add Prometheus datasource using AMP workspace URL:
|
| 263 |
+
- `https://aps-workspaces.<region>.amazonaws.com/workspaces/<WORKSPACE_ID>`
|
| 264 |
+
2. Enable SigV4 auth and set the same AWS region.
|
| 265 |
+
3. Import dashboards:
|
| 266 |
+
- [deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json](../grafana/provisioning/dashboards/json/antiatropos-overview.json)
|
| 267 |
+
- [deploy/grafana/provisioning/dashboards/json/antiatropos-live.json](../grafana/provisioning/dashboards/json/antiatropos-live.json)
|
| 268 |
+
|
| 269 |
+
---
|
| 270 |
+
|
| 271 |
+
## Phase 7: Install Cluster Autoscaler
|
| 272 |
+
|
| 273 |
+
So EKS can add nodes when the agent scales workloads:
|
| 274 |
+
|
| 275 |
+
```bash
|
| 276 |
+
helm repo add autoscaler https://kubernetes.github.io/autoscaler
|
| 277 |
+
helm repo update
|
| 278 |
+
|
| 279 |
+
helm install cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 280 |
+
--namespace kube-system \
|
| 281 |
+
-f deploy/aws/cluster-autoscaler-values.yaml
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
The node group `maxSize: 4` in `eksctl-cluster.yaml` caps your compute cost.
|
| 285 |
+
|
| 286 |
+
---
|
| 287 |
+
|
| 288 |
+
## Cost Estimates
|
| 289 |
+
|
| 290 |
+
| Resource | Config | Monthly Cost (approx) |
|
| 291 |
+
|---|---|---|
|
| 292 |
+
| EKS Control Plane | 1 cluster | $73 |
|
| 293 |
+
| EKS Nodes | 2x t3.medium | $60 |
|
| 294 |
+
| AMP | <10GB ingest | ~$3-5 |
|
| 295 |
+
| EBS Volume (Grafana) | 5Gi | ~$0.50 |
|
| 296 |
+
| **Total** | | **~$135-145/month** |
|
| 297 |
+
| HF Spaces | Free tier or $5/mo | (separate billing) |
|
| 298 |
+
|
| 299 |
+
No ECR, no ALB, no server pods on AWS — cheaper than running everything on AWS.
|
| 300 |
+
|
| 301 |
+
### Cost-Saving Tips
|
| 302 |
+
|
| 303 |
+
- Use spot instances for node groups (60-70% cheaper)
|
| 304 |
+
- Scale workloads to zero between runs: `kubectl scale deployment -n prod-sre --replicas=0 --all`
|
| 305 |
+
- Delete the cluster between training runs: `eksctl delete cluster --name antiatropos`
|
| 306 |
+
- AMP free tier covers first 10GB ingest/month
|
| 307 |
+
- Grafana is self-hosted (free, runs on EKS)
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## Teardown
|
| 312 |
+
|
| 313 |
+
```bash
|
| 314 |
+
# Delete workloads
|
| 315 |
+
kubectl delete -f deploy/aws/k8s-workloads.yaml
|
| 316 |
+
|
| 317 |
+
# Delete Grafana
|
| 318 |
+
helm uninstall grafana -n monitoring
|
| 319 |
+
|
| 320 |
+
# Delete Prometheus agent
|
| 321 |
+
helm uninstall prometheus-agent -n monitoring
|
| 322 |
+
kubectl delete namespace monitoring
|
| 323 |
+
|
| 324 |
+
# Delete dashboard secret
|
| 325 |
+
kubectl delete secret antiatropos-grafana-dashboards -n monitoring 2>/dev/null || true
|
| 326 |
+
|
| 327 |
+
# Delete AMP workspace
|
| 328 |
+
AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text)
|
| 329 |
+
aws amp delete-workspace --workspace-id $AMP_WS_ID --region ap-south-1
|
| 330 |
+
|
| 331 |
+
# Delete the EKS cluster (10-15 min)
|
| 332 |
+
eksctl delete cluster --name antiatropos --region ap-south-1
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
---
|
| 336 |
+
|
| 337 |
+
## Troubleshooting
|
| 338 |
+
|
| 339 |
+
### HF Spaces can't reach AMP
|
| 340 |
+
- Verify `PROMETHEUS_URL` includes the full workspace path
|
| 341 |
+
- AMP requires SigV4 auth — ensure `requests-aws4auth` is in your dependencies
|
| 342 |
+
- Set `ANTIATROPOS_PROM_TIMEOUT_S=5.0` (cross-network latency)
|
| 343 |
+
|
| 344 |
+
### HF Spaces can't reach EKS
|
| 345 |
+
- Verify `KUBECONFIG` path and the file is decoded properly
|
| 346 |
+
- Check the EKS API server endpoint is public (default)
|
| 347 |
+
- Verify the IAM user in the kubeconfig has EKS access
|
| 348 |
+
- Test locally: `kubectl --kubeconfig=kubeconfig-antiatropos.yaml get nodes`
|
| 349 |
+
|
| 350 |
+
### AMP not receiving metrics
|
| 351 |
+
```bash
|
| 352 |
+
kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
### Grafana shows no data
|
| 356 |
+
1. Verify the `AMP-Local` data source is configured: `http://prometheus-agent-server.monitoring.svc.cluster.local:80`
|
| 357 |
+
2. Check time range (AMP default retention is 150 days)
|
| 358 |
+
3. Verify PromQL queries match your metric names
|
| 359 |
+
4. Check Grafana logs: `kubectl logs -n monitoring -l app.kubernetes.io/name=grafana`
|
| 360 |
+
5. Verify dashboards secret exists: `kubectl get secret antiatropos-grafana-dashboards -n monitoring`
|
| 361 |
+
|
deploy/aws/cluster-autoscaler-values.yaml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cluster Autoscaler Helm values
|
| 2 |
+
#
|
| 3 |
+
# This ensures EKS adds/removes nodes based on pod scheduling pressure.
|
| 4 |
+
# The node group maxSize in eksctl-cluster.yaml (4) is the ultimate cap.
|
| 5 |
+
#
|
| 6 |
+
# Install:
|
| 7 |
+
# helm repo add autoscaler https://kubernetes.github.io/autoscaler
|
| 8 |
+
# helm repo update
|
| 9 |
+
# helm install cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 10 |
+
# --namespace kube-system \
|
| 11 |
+
# -f cluster-autoscaler-values.yaml
|
| 12 |
+
|
| 13 |
+
autoDiscovery:
|
| 14 |
+
clusterName: antiatropos
|
| 15 |
+
enabled: true
|
| 16 |
+
|
| 17 |
+
awsRegion: ap-south-1
|
| 18 |
+
|
| 19 |
+
# Only scale nodes that have the specific tag
|
| 20 |
+
# This prevents autoscaling unrelated node groups if you add them later
|
| 21 |
+
nodeGroupAutoDiscovery:
|
| 22 |
+
- tags: cluster-autoscaler/cluster-name=antiatropos
|
| 23 |
+
|
| 24 |
+
# Conservative scaling — don't overreact
|
| 25 |
+
scaleDown:
|
| 26 |
+
enabled: true
|
| 27 |
+
# Wait 10 minutes before removing a node
|
| 28 |
+
# This prevents flapping when agents create/destroy pods frequently
|
| 29 |
+
delayAfterAdd: 600s
|
| 30 |
+
delayAfterDelete: 60s
|
| 31 |
+
delayAfterScaleDown: 600s
|
| 32 |
+
# Only remove nodes that are below 50% utilization
|
| 33 |
+
utilizationThreshold: "0.5"
|
| 34 |
+
# Don't remove nodes that run kube-system pods
|
| 35 |
+
# (NOTE: this flag does not by itself protect AntiAtropos workload pods or active training sessions)
|
| 36 |
+
skipNodesWithSystemPods: true
|
| 37 |
+
|
| 38 |
+
# Maximum time to wait for a newly requested node to be provisioned (a timeout, not a node-count cap)
|
| 39 |
+
# This is a safety net — the eksctl node group maxSize is the real limit
|
| 40 |
+
maxNodeProvisionTime: 15m
|
| 41 |
+
|
| 42 |
+
rbac:
|
| 43 |
+
create: true
|
| 44 |
+
serviceAccount:
|
| 45 |
+
create: true
|
| 46 |
+
name: cluster-autoscaler
|
| 47 |
+
|
| 48 |
+
replicaCount: 1
|
| 49 |
+
|
| 50 |
+
resources:
|
| 51 |
+
requests:
|
| 52 |
+
cpu: 100m
|
| 53 |
+
memory: 256Mi
|
| 54 |
+
limits:
|
| 55 |
+
cpu: 500m
|
| 56 |
+
memory: 512Mi
|
| 57 |
+
|
deploy/aws/deploy-all.ps1
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos - One-Run Deploy Script
|
| 2 |
+
# Deploys entire AWS infrastructure: EKS cluster, workloads, AMP, Prometheus, Grafana
|
| 3 |
+
|
| 4 |
+
$ErrorActionPreference = "Stop"
|
| 5 |
+
|
| 6 |
+
# In PowerShell 7.3+, keep non-zero native exit codes from becoming terminating errors; exit codes are checked explicitly via Invoke-CheckedCommand.
|
| 7 |
+
if (Get-Variable -Name PSNativeCommandUseErrorActionPreference -ErrorAction SilentlyContinue) {
|
| 8 |
+
$PSNativeCommandUseErrorActionPreference = $false
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
$Region = "ap-south-1"
|
| 12 |
+
$ClusterName = "antiatropos"
|
| 13 |
+
$AwsDir = Split-Path -Parent $MyInvocation.MyCommand.Path
|
| 14 |
+
$GrafanaMode = if ([string]::IsNullOrWhiteSpace($env:ANTIATROPOS_GRAFANA_MODE)) { "auto" } else { $env:ANTIATROPOS_GRAFANA_MODE.Trim().ToLowerInvariant() }
|
| 15 |
+
$GrafanaModeResolved = "cluster"
|
| 16 |
+
|
| 17 |
+
function Invoke-CheckedCommand {
    # Runs a script block that wraps a native command (aws, eksctl, kubectl, helm)
    # and throws $ErrorMessage when the native exit code is non-zero.
    #
    # Parameters:
    #   Command      - script block to invoke; its output is passed through.
    #   ErrorMessage - message thrown when $LASTEXITCODE is non-zero afterwards.
    param(
        [ScriptBlock]$Command,
        [string]$ErrorMessage
    )

    # Clear any exit code left behind by an earlier native command. Without this,
    # a script block that never launches a native command would be judged by a
    # stale $LASTEXITCODE and could fail spuriously.
    $global:LASTEXITCODE = 0

    # Relax the error preference while the command runs, then restore it.
    $previousErrorActionPreference = $ErrorActionPreference
    $ErrorActionPreference = "Continue"
    try {
        & $Command
    } finally {
        $ErrorActionPreference = $previousErrorActionPreference
    }

    if ($LASTEXITCODE -ne 0) {
        throw $ErrorMessage
    }
}
|
| 35 |
+
|
| 36 |
+
function Get-EksClusterStatus {
    # Queries 'cluster.status' for the named EKS cluster via the AWS CLI.
    # Returns the trimmed status text, or $null when the call throws, exits
    # non-zero, or yields blank / "None" output.
    param(
        [string]$Name,
        [string]$AwsRegion
    )

    $rawStatus = $null
    try {
        $rawStatus = aws eks describe-cluster --name $Name --region $AwsRegion --query 'cluster.status' --output text 2>$null
    } catch {
        return $null
    }

    $lookupFailed = ($LASTEXITCODE -ne 0)
    if ($lookupFailed -or [string]::IsNullOrWhiteSpace($rawStatus) -or $rawStatus -eq "None") {
        return $null
    }

    return $rawStatus.Trim()
}
|
| 53 |
+
|
| 54 |
+
function Test-EksNodegroupExists {
    # Reports whether 'aws eks describe-nodegroup' succeeds for the given
    # cluster / nodegroup / region. Returns $true on exit code 0, otherwise
    # $false (including when the call raises an error).
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    $found = $false
    try {
        # Output is discarded; only the exit code matters.
        $null = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.nodegroupName' --output text 2>$null
        $found = ($LASTEXITCODE -eq 0)
    } catch {
        $found = $false
    }

    return $found
}
|
| 68 |
+
|
| 69 |
+
function Get-EksNodegroupInstanceType {
    # Reads 'nodegroup.instanceTypes[0]' for the given nodegroup via the AWS CLI.
    # Returns the trimmed instance type, or $null when the call throws, exits
    # non-zero, or yields blank / "None" output.
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    $rawType = $null
    try {
        $rawType = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.instanceTypes[0]' --output text 2>$null
    } catch {
        return $null
    }

    $lookupFailed = ($LASTEXITCODE -ne 0)
    if ($lookupFailed -or [string]::IsNullOrWhiteSpace($rawType) -or $rawType -eq "None") {
        return $null
    }

    return $rawType.Trim()
}
|
| 88 |
+
|
| 89 |
+
function Get-NodegroupSubnetSelection {
    # Chooses the subnets to use for nodegroup creation.
    #
    # Reads the cluster's subnet IDs, asks EC2 which of them have
    # MapPublicIpOnLaunch == true, and returns an object with:
    #   SubnetCsv            - comma-joined subnet IDs
    #   UsePrivateNetworking - $false for public subnets, $true for private
    #   SubnetType           - "public" or "private"
    # Public subnets win when any exist; otherwise the remaining subnets are used.
    # Throws when the subnet list cannot be read or classified.
    param(
        [string]$Cluster,
        [string]$AwsRegion
    )

    $readFailure = "Failed to read cluster subnet IDs"
    $classifyFailure = "Failed to classify cluster subnets"

    try {
        $allSubnetIds = aws eks describe-cluster --name $Cluster --region $AwsRegion --query 'cluster.resourcesVpcConfig.subnetIds' --output text 2>$null
    } catch {
        throw $readFailure
    }
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($allSubnetIds)) {
        throw $readFailure
    }

    # --output text yields whitespace-separated IDs.
    $subnetArray = @($allSubnetIds -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    if ($subnetArray.Count -eq 0) {
        throw "No subnets found for cluster '$Cluster' in region '$AwsRegion'"
    }

    $describeSubnetArgs = @('ec2', 'describe-subnets', '--region', $AwsRegion, '--subnet-ids')
    $describeSubnetArgs += $subnetArray
    $describeSubnetArgs += @('--query', 'Subnets[?MapPublicIpOnLaunch==true].SubnetId', '--output', 'text')

    try {
        $publicSubnetIdsText = & aws @describeSubnetArgs 2>$null
    } catch {
        throw $classifyFailure
    }
    if ($LASTEXITCODE -ne 0) {
        throw $classifyFailure
    }

    $publicSubnetIds = @($publicSubnetIdsText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) -and $_ -ne "None" })
    $privateSubnetIds = @($subnetArray | Where-Object { $publicSubnetIds -notcontains $_ })

    # Candidates in priority order: public first, then private.
    $candidates = @(
        @{ Ids = $publicSubnetIds;  Private = $false; Kind = "public" },
        @{ Ids = $privateSubnetIds; Private = $true;  Kind = "private" }
    )
    foreach ($candidate in $candidates) {
        if ($candidate.Ids.Count -gt 0) {
            return [PSCustomObject]@{
                SubnetCsv = ($candidate.Ids -join ',')
                UsePrivateNetworking = $candidate.Private
                SubnetType = $candidate.Kind
            }
        }
    }

    throw "Could not determine valid subnets for nodegroup creation"
}
|
| 150 |
+
|
| 151 |
+
function Get-ReadyNodeCount {
    # Counts the lines of 'kubectl get nodes --no-headers' that contain the word
    # "Ready" surrounded by whitespace. Returns 0 when kubectl prints nothing.
    $nodeLines = kubectl get nodes --no-headers 2>$null
    if ($nodeLines) {
        # The leading \s keeps "NotReady" from matching.
        $readyLines = @($nodeLines | Where-Object { $_ -match '\sReady\s' })
        return $readyLines.Count
    }

    return 0
}
|
| 159 |
+
|
| 160 |
+
function Wait-ForReadyNodes {
    # Polls Get-ReadyNodeCount every 10 seconds until at least
    # $MinimumReadyNodes nodes are Ready.
    #
    # Parameters:
    #   MinimumReadyNodes - number of Ready nodes required before returning.
    #   TimeoutSeconds    - total wait budget (default 600); rounded up to a
    #                       whole number of 10-second polls.
    # Throws when the budget is exhausted.
    param(
        [int]$MinimumReadyNodes,
        [int]$TimeoutSeconds = 600
    )

    $pollSeconds = 10
    $pollsLeft = [Math]::Ceiling($TimeoutSeconds / $pollSeconds)

    while ($pollsLeft -gt 0) {
        $readyCount = Get-ReadyNodeCount
        Write-Host "Nodes ready: $readyCount (target: $MinimumReadyNodes)"
        if ($readyCount -ge $MinimumReadyNodes) {
            return
        }
        Start-Sleep -Seconds $pollSeconds
        $pollsLeft--
    }

    throw "Timed out waiting for $MinimumReadyNodes Ready nodes"
}
|
| 178 |
+
|
| 179 |
+
Write-Host ""
|
| 180 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 181 |
+
Write-Host " AntiAtropos AWS Infrastructure Deploy" -ForegroundColor Cyan
|
| 182 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 183 |
+
Write-Host "Region: $Region"
|
| 184 |
+
Write-Host "Cluster: $ClusterName"
|
| 185 |
+
Write-Host ""
|
| 186 |
+
|
| 187 |
+
# Check prerequisites
|
| 188 |
+
# Collect every required CLI tool that cannot be resolved on PATH, then abort
# with exit code 1 if any are missing.
$missing = @(
    @("aws", "eksctl", "kubectl", "helm") |
        Where-Object { -not (Get-Command $_ -ErrorAction SilentlyContinue) }
)
if ($missing.Count -gt 0) {
    Write-Host "ERROR: Missing: $($missing -join ', ')" -ForegroundColor Red
    exit 1
}
|
| 198 |
+
|
| 199 |
+
# Phase 1: Create EKS Cluster
|
| 200 |
+
Write-Host ">>> Phase 1: Creating EKS cluster..." -ForegroundColor Yellow
|
| 201 |
+
|
| 202 |
+
$clusterStatus = Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region
|
| 203 |
+
|
| 204 |
+
if ($clusterStatus -eq "DELETING") {
|
| 205 |
+
Write-Host "Cluster is currently deleting. Waiting for deletion to complete..." -ForegroundColor Yellow
|
| 206 |
+
Invoke-CheckedCommand -Command { aws eks wait cluster-deleted --name $ClusterName --region $Region } -ErrorMessage "Failed while waiting for cluster deletion"
|
| 207 |
+
$clusterStatus = $null
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
if (-not $clusterStatus) {
    # No cluster yet: create the control plane only. The temporary config is the
    # main eksctl config with everything from 'managedNodeGroups:' onward removed;
    # the nodegroup is created separately in Phase 2.
    $TempConfig = Join-Path $AwsDir "eksctl-cluster-only.yaml"
    $ClusterYaml = Get-Content (Join-Path $AwsDir "eksctl-cluster.yaml") -Raw
    $ClusterOnlyYaml = $ClusterYaml -replace '(?s)(managedNodeGroups:.*)', ''
    $ClusterOnlyYaml | Out-File -FilePath $TempConfig -Encoding utf8
    try {
        Invoke-CheckedCommand -Command { eksctl create cluster -f $TempConfig } -ErrorMessage "Failed to create EKS cluster"
    } finally {
        # Remove the temporary config even when cluster creation fails.
        Remove-Item $TempConfig -Force -ErrorAction SilentlyContinue
    }
    Write-Host "Cluster created" -ForegroundColor Green
} else {
    if ($clusterStatus -eq "CREATING") {
        Write-Host "Cluster creation in progress. Waiting until ACTIVE..." -ForegroundColor Yellow
        Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster did not become active"
        # The waiter succeeded, so report the current state rather than the stale "CREATING".
        $clusterStatus = "ACTIVE"
    }
    Write-Host "Cluster already exists (status: $clusterStatus)" -ForegroundColor Green
}
|
| 225 |
+
|
| 226 |
+
Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster is not active"
|
| 227 |
+
Invoke-CheckedCommand -Command { aws eks update-kubeconfig --name $ClusterName --region $Region | Out-Null } -ErrorMessage "Failed to update kubeconfig"
|
| 228 |
+
|
| 229 |
+
# Phase 2: Create Nodegroup
|
| 230 |
+
Write-Host ""
|
| 231 |
+
Write-Host ">>> Phase 2: Ensuring compute nodegroup..." -ForegroundColor Yellow
|
| 232 |
+
|
| 233 |
+
$NodegroupName = "linux-nodes"
|
| 234 |
+
$PreferredInstanceType = "t3.micro"
|
| 235 |
+
$ngExists = Test-EksNodegroupExists -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
|
| 236 |
+
|
| 237 |
+
if (-not $ngExists) {
    # Choose subnets for the new nodegroup (public when available, else private).
    $SubnetSelection = Get-NodegroupSubnetSelection -Cluster $ClusterName -AwsRegion $Region
    $SubnetCsv = $SubnetSelection.SubnetCsv
    $UsePrivateNetworking = [bool]$SubnetSelection.UsePrivateNetworking

    Write-Host "Using $($SubnetSelection.SubnetType) subnets: $SubnetCsv"

    Invoke-CheckedCommand -Command {
        # Named $nodegroupArgs rather than $args: $args is a PowerShell automatic
        # variable and should not be assigned to.
        $nodegroupArgs = @(
            'create', 'nodegroup',
            '--cluster', $ClusterName,
            '--region', $Region,
            '--name', $NodegroupName,
            '--node-type', $PreferredInstanceType,
            '--nodes', '4',
            '--nodes-min', '2',
            '--nodes-max', '8',
            '--node-volume-size', '20',
            '--subnet-ids', $SubnetCsv
        )

        if ($UsePrivateNetworking) {
            $nodegroupArgs += '--node-private-networking'
        }

        eksctl @nodegroupArgs
    } -ErrorMessage "Failed to create nodegroup '$NodegroupName'"

    Write-Host "Nodegroup created" -ForegroundColor Green
} else {
    # Nodegroup already present: report its instance type and carry on.
    $existingInstanceType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    Write-Host "Nodegroup already exists ($existingInstanceType)" -ForegroundColor Green
}
|
| 270 |
+
|
| 271 |
+
Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active"
|
| 272 |
+
|
| 273 |
+
# Resolve the Grafana deployment mode. The default is "cluster"; it becomes
# "external" when the mode is auto/empty and the nodegroup runs t3.micro, or
# when the mode is explicitly external/local/hf.
$GrafanaModeResolved = "cluster"
if ($GrafanaMode -in @("auto", "")) {
    $effectiveNodeType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    if ($effectiveNodeType -eq "t3.micro") {
        $GrafanaModeResolved = "external"
    }
} elseif ($GrafanaMode -in @("external", "local", "hf")) {
    $GrafanaModeResolved = "external"
}
|
| 285 |
+
|
| 286 |
+
Write-Host "Grafana mode: $GrafanaModeResolved" -ForegroundColor Cyan
|
| 287 |
+
|
| 288 |
+
Write-Host "Waiting for nodes..."
|
| 289 |
+
# Poll for up to 60 x 10s until at least one node reports Ready.
# The loop ends only on a Ready node: a node that is merely listed (for example
# NotReady) does not count. If none becomes Ready, a warning is printed and the
# script continues, so the wait stays best-effort.
$readyCount = 0
for ($i = 0; $i -lt 60; $i++) {
    $nodes = $null
    try {
        $nodes = kubectl get nodes --no-headers --request-timeout=10s 2>$null
    } catch {
        # Treat a kubectl error as "no nodes yet" and retry after the sleep below.
        $nodes = $null
    }

    # The leading \s keeps "NotReady" from matching.
    $readyCount = @($nodes | Where-Object { $_ -match '\sReady\s' }).Count
    if ($readyCount -gt 0) {
        Write-Host "Nodes ready: $readyCount" -ForegroundColor Green
        break
    }
    Start-Sleep -Seconds 10
}
if ($readyCount -eq 0) {
    Write-Host "WARNING: No nodes reported Ready after waiting. Check with: kubectl get nodes" -ForegroundColor Yellow
}
|
| 305 |
+
|
| 306 |
+
# Phase 3: Deploy Workloads
|
| 307 |
+
Write-Host ""
|
| 308 |
+
Write-Host ">>> Phase 3: Deploying workloads..." -ForegroundColor Yellow
|
| 309 |
+
# Ensure the namespace exists (idempotent via dry-run + apply), then apply the
# workload manifests. Both steps go through Invoke-CheckedCommand so that a
# kubectl failure stops the script instead of printing "Workloads deployed".
Invoke-CheckedCommand -Command { kubectl create namespace prod-sre --dry-run=client -o yaml | kubectl apply -f - | Out-Null } -ErrorMessage "Failed to ensure namespace 'prod-sre'"
Invoke-CheckedCommand -Command { kubectl apply -f (Join-Path $AwsDir "k8s-workloads.yaml") | Out-Null } -ErrorMessage "Failed to apply k8s-workloads.yaml"
Write-Host "Workloads deployed" -ForegroundColor Green
|
| 312 |
+
|
| 313 |
+
# Phase 4: Create AMP Workspace
|
| 314 |
+
Write-Host ""
|
| 315 |
+
Write-Host ">>> Phase 4: Creating AMP workspace..." -ForegroundColor Yellow
|
| 316 |
+
|
| 317 |
+
# Reuse the AMP workspace with alias 'antiatropos-metrics' if one is listed;
# otherwise create it.
$AmpWsId = $null
try {
    $AmpWsId = aws amp list-workspaces --alias antiatropos-metrics --region $Region --query 'workspaces[0].workspaceId' --output text 2>$null
    if ($AmpWsId -eq "None") { $AmpWsId = $null }
} catch {
    # Best-effort lookup: on error fall through and attempt creation below.
}

if ([string]::IsNullOrWhiteSpace($AmpWsId)) {
    $AmpWsId = aws amp create-workspace --alias antiatropos-metrics --region $Region --query 'workspaceId' --output text
    # Without a workspace ID the remote-write URL built below would be invalid,
    # so fail here rather than continue.
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($AmpWsId) -or $AmpWsId -eq "None") {
        throw "Failed to create AMP workspace 'antiatropos-metrics'"
    }
}
$AmpUrl = "https://aps-workspaces.$Region.amazonaws.com/workspaces/$AmpWsId"
Write-Host "AMP: $AmpWsId" -ForegroundColor Green
|
| 328 |
+
|
| 329 |
+
# Phase 5: Install Prometheus
|
| 330 |
+
Write-Host ""
|
| 331 |
+
Write-Host ">>> Phase 5: Installing Prometheus..." -ForegroundColor Yellow
|
| 332 |
+
|
| 333 |
+
# Ensure the monitoring namespace exists, register the prometheus-community
# chart repo, then install/upgrade the 'prometheus-agent' release with
# remote-write pointed at the AMP workspace.
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - | Out-Null
Invoke-CheckedCommand -ErrorMessage "Failed to add prometheus helm repo" -Command { helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>$null | Out-Null }
Invoke-CheckedCommand -ErrorMessage "Failed to update helm repos" -Command { helm repo update 2>$null | Out-Null }

$promValuesYaml = Join-Path $AwsDir "prometheus-agent-values.yaml"
$remoteWriteUrl = "$AmpUrl/api/v1/remote_write"

# Each entry becomes one '--set key=value' pair, in this order.
$promOverrides = @(
    "alertmanager.enabled=false",
    "kube-state-metrics.enabled=false",
    "prometheus-node-exporter.enabled=false",
    "pushgateway.enabled=false",
    "server.enabled=true",
    "server.persistentVolume.enabled=false",
    "server.resources.requests.cpu=50m",
    "server.resources.requests.memory=128Mi",
    "server.resources.limits.cpu=300m",
    "server.resources.limits.memory=384Mi",
    "server.global.scrape_interval=15s",
    "server.remoteWrite[0].url=$remoteWriteUrl"
)

$promHelmArgs = @('upgrade', '--install', 'prometheus-agent', 'prometheus-community/prometheus', '--namespace', 'monitoring', '--reset-values', '-f', $promValuesYaml)
foreach ($override in $promOverrides) {
    $promHelmArgs += @('--set', $override)
}

Invoke-CheckedCommand -Command { helm @promHelmArgs 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Prometheus"
Write-Host "Prometheus installed" -ForegroundColor Green
|
| 357 |
+
|
| 358 |
+
# Phase 6: Install Grafana
|
| 359 |
+
Write-Host ""
|
| 360 |
+
if ($GrafanaModeResolved -eq "cluster") {
|
| 361 |
+
Write-Host ">>> Phase 6: Installing Grafana in-cluster..." -ForegroundColor Yellow
|
| 362 |
+
|
| 363 |
+
Invoke-CheckedCommand -Command { helm repo add grafana https://grafana.github.io/helm-charts 2>$null | Out-Null } -ErrorMessage "Failed to add grafana helm repo"
|
| 364 |
+
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"
|
| 365 |
+
|
| 366 |
+
$GrafanaValuesYaml = Join-Path $AwsDir "grafana-values.yaml"
|
| 367 |
+
Invoke-CheckedCommand -Command { helm upgrade --install grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Grafana"
|
| 368 |
+
|
| 369 |
+
Write-Host "Waiting for Grafana..."
|
| 370 |
+
try {
|
| 371 |
+
Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out"
|
| 372 |
+
} catch {
|
| 373 |
+
$pendingGrafanaPod = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana --field-selector=status.phase=Pending --no-headers 2>$null | Select-Object -First 1
|
| 374 |
+
$pendingReason = ""
|
| 375 |
+
|
| 376 |
+
if ($pendingGrafanaPod) {
|
| 377 |
+
$pendingGrafanaPodName = ($pendingGrafanaPod -split '\s+')[0]
|
| 378 |
+
$pendingReason = kubectl describe pod $pendingGrafanaPodName -n monitoring 2>$null | Select-String -Pattern "FailedScheduling|Insufficient memory|Too many pods|unbound" -Context 0,2 | Out-String
|
| 379 |
+
if (-not [string]::IsNullOrWhiteSpace($pendingReason)) {
|
| 380 |
+
Write-Host "Grafana is pending due to scheduler constraints:" -ForegroundColor Yellow
|
| 381 |
+
Write-Host $pendingReason -ForegroundColor Yellow
|
| 382 |
+
}
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
$shouldScale = $pendingReason -match "Too many pods|Insufficient memory"
|
| 386 |
+
if ($shouldScale) {
|
| 387 |
+
Write-Host "Scaling nodegroup to 8 nodes and retrying Grafana rollout..." -ForegroundColor Yellow
|
| 388 |
+
Invoke-CheckedCommand -Command { eksctl scale nodegroup --cluster $ClusterName --region $Region --name $NodegroupName --nodes 8 } -ErrorMessage "Failed to scale nodegroup"
|
| 389 |
+
Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active after scaling"
|
| 390 |
+
Write-Host "Waiting for newly scaled nodes to become Ready..." -ForegroundColor Yellow
|
| 391 |
+
Wait-ForReadyNodes -MinimumReadyNodes 8 -TimeoutSeconds 900
|
| 392 |
+
|
| 393 |
+
$pendingGrafanaPodAfterScale = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana --field-selector=status.phase=Pending --no-headers 2>$null | Select-Object -First 1
|
| 394 |
+
if ($pendingGrafanaPodAfterScale) {
|
| 395 |
+
$pendingGrafanaPodNameAfterScale = ($pendingGrafanaPodAfterScale -split '\s+')[0]
|
| 396 |
+
kubectl delete pod $pendingGrafanaPodNameAfterScale -n monitoring 2>$null | Out-Null
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=600s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out after scaling"
|
| 400 |
+
} else {
|
| 401 |
+
throw "Grafana rollout failed. Check: kubectl -n monitoring get pods ; kubectl -n monitoring describe pod -l app.kubernetes.io/name=grafana"
|
| 402 |
+
}
|
| 403 |
+
}
|
| 404 |
+
Write-Host "Grafana installed (admin/antiatropos)" -ForegroundColor Green
|
| 405 |
+
} else {
|
| 406 |
+
Write-Host ">>> Phase 6: Skipping in-cluster Grafana (external mode)..." -ForegroundColor Yellow
|
| 407 |
+
$grafanaRelease = ""
|
| 408 |
+
try {
|
| 409 |
+
$grafanaRelease = helm list -n monitoring --filter '^grafana$' --short 2>$null
|
| 410 |
+
} catch {
|
| 411 |
+
$grafanaRelease = ""
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
if (-not [string]::IsNullOrWhiteSpace($grafanaRelease)) {
|
| 415 |
+
helm uninstall grafana -n monitoring 2>$null | Out-Null
|
| 416 |
+
kubectl delete pvc grafana -n monitoring 2>$null | Out-Null
|
| 417 |
+
Write-Host "Removed existing in-cluster Grafana release to save resources" -ForegroundColor Green
|
| 418 |
+
}
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
# Phase 7: Install Cluster Autoscaler
|
| 422 |
+
Write-Host ""
|
| 423 |
+
Write-Host ">>> Phase 7: Installing Cluster Autoscaler..." -ForegroundColor Yellow
|
| 424 |
+
|
| 425 |
+
Invoke-CheckedCommand -Command { helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>$null | Out-Null } -ErrorMessage "Failed to add autoscaler helm repo"
|
| 426 |
+
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"
|
| 427 |
+
|
| 428 |
+
$autoscalerValues = Join-Path $AwsDir "cluster-autoscaler-values.yaml"
|
| 429 |
+
Invoke-CheckedCommand -Command { helm upgrade --install cluster-autoscaler autoscaler/cluster-autoscaler --namespace kube-system -f $autoscalerValues 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Cluster Autoscaler"
|
| 430 |
+
Write-Host "Cluster Autoscaler installed" -ForegroundColor Green
|
| 431 |
+
|
| 432 |
+
# Phase 8: Generate Kubeconfig
|
| 433 |
+
Write-Host ""
|
| 434 |
+
Write-Host ">>> Phase 8: Generating kubeconfig..." -ForegroundColor Yellow
|
| 435 |
+
|
| 436 |
+
$ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
|
| 437 |
+
$ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
|
| 438 |
+
$Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")
|
| 439 |
+
$output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"
|
| 440 |
+
|
| 441 |
+
$kubeconfig = "apiVersion: v1`n" +
|
| 442 |
+
"kind: Config`n" +
|
| 443 |
+
"clusters:`n" +
|
| 444 |
+
" - cluster:`n" +
|
| 445 |
+
" certificate-authority-data: $ClusterCa`n" +
|
| 446 |
+
" server: $ClusterEndpoint`n" +
|
| 447 |
+
" name: $ClusterName`n" +
|
| 448 |
+
"contexts:`n" +
|
| 449 |
+
" - context:`n" +
|
| 450 |
+
" cluster: $ClusterName`n" +
|
| 451 |
+
" user: antiatropos-hf-user`n" +
|
| 452 |
+
" name: $ClusterName`n" +
|
| 453 |
+
"current-context: $ClusterName`n" +
|
| 454 |
+
"preferences: {}`n" +
|
| 455 |
+
"users:`n" +
|
| 456 |
+
" - name: antiatropos-hf-user`n" +
|
| 457 |
+
" user:`n" +
|
| 458 |
+
" exec:`n" +
|
| 459 |
+
" apiVersion: client.authentication.k8s.io/v1beta1`n" +
|
| 460 |
+
" command: aws`n" +
|
| 461 |
+
" args:`n" +
|
| 462 |
+
" - eks`n" +
|
| 463 |
+
" - get-token`n" +
|
| 464 |
+
" - --region`n" +
|
| 465 |
+
" - $Region`n" +
|
| 466 |
+
" - --cluster-name`n" +
|
| 467 |
+
" - $ClusterName`n" +
|
| 468 |
+
" env:`n" +
|
| 469 |
+
" - name: AWS_STS_REGIONAL_ENDPOINTS`n" +
|
| 470 |
+
" value: regional`n" +
|
| 471 |
+
" - name: AWS_DEFAULT_REGION`n" +
|
| 472 |
+
" value: $Region`n" +
|
| 473 |
+
" interactiveMode: IfAvailable`n"
|
| 474 |
+
|
| 475 |
+
$kubeconfig | Out-File -FilePath $output -Encoding utf8 -Force
|
| 476 |
+
Write-Host "Kubeconfig: $output" -ForegroundColor Green
|
| 477 |
+
|
| 478 |
+
# Done
|
| 479 |
+
Write-Host ""
|
| 480 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 481 |
+
Write-Host " Deployment Complete!" -ForegroundColor Cyan
|
| 482 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 483 |
+
Write-Host ""
|
| 484 |
+
Write-Host "AMP: $AmpWsId" -ForegroundColor Yellow
|
| 485 |
+
if ($GrafanaModeResolved -eq "cluster") {
|
| 486 |
+
Write-Host "Grafana: kubectl port-forward svc/grafana 3000 -n monitoring" -ForegroundColor Yellow
|
| 487 |
+
Write-Host "Login: admin / antiatropos" -ForegroundColor Yellow
|
| 488 |
+
} else {
|
| 489 |
+
Write-Host "Grafana: external/local mode enabled (recommended for free-tier nodes)" -ForegroundColor Yellow
|
| 490 |
+
Write-Host "Use AMP endpoint as Prometheus datasource with SigV4 auth" -ForegroundColor Yellow
|
| 491 |
+
}
|
| 492 |
+
Write-Host "Kubeconfig: $output" -ForegroundColor Yellow
|
| 493 |
+
Write-Host ""
|
deploy/aws/deploy.ps1
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos AWS Infrastructure Deploy Script (PowerShell)
|
| 2 |
+
#
|
| 3 |
+
# Deploys: EKS cluster, sample workloads, AMP workspace, Prometheus Agent,
|
| 4 |
+
# AMG workspace, Cluster Autoscaler, and generates kubeconfig for HF Spaces.
|
| 5 |
+
#
|
| 6 |
+
# The AntiAtropos FastAPI server runs on Hugging Face Spaces, NOT on AWS.
|
| 7 |
+
# This script only sets up the infrastructure that HF Spaces connects to.
|
| 8 |
+
#
|
| 9 |
+
# Prerequisites: aws cli, eksctl, kubectl, helm
|
| 10 |
+
#
|
| 11 |
+
# Usage:
|
| 12 |
+
# .\deploy\aws\deploy.ps1
|
| 13 |
+
#
|
| 14 |
+
# Environment variables:
|
| 15 |
+
# $env:AWS_REGION - AWS region (default: ap-south-1)
|
| 16 |
+
# $env:CLUSTER_NAME - EKS cluster name (default: antiatropos)
|
| 17 |
+
|
| 18 |
+
$ErrorActionPreference = "Stop"
|
| 19 |
+
|
| 20 |
+
$Region = if ($env:AWS_REGION) { $env:AWS_REGION } else { "ap-south-1" }
|
| 21 |
+
$ClusterName = if ($env:CLUSTER_NAME) { $env:CLUSTER_NAME } else { "antiatropos" }
|
| 22 |
+
$AwsDir = Split-Path -Parent $MyInvocation.MyCommand.Path
|
| 23 |
+
|
| 24 |
+
Write-Host ""
|
| 25 |
+
Write-Host "=== AntiAtropos AWS Infrastructure Deployment ===" -ForegroundColor Cyan
|
| 26 |
+
Write-Host "Region: $Region"
|
| 27 |
+
Write-Host "Cluster: $ClusterName"
|
| 28 |
+
Write-Host "FastAPI: Runs on HF Spaces (not deployed here)"
|
| 29 |
+
Write-Host ""
|
| 30 |
+
|
| 31 |
+
# --- Check prerequisites ---
|
| 32 |
+
$missing = @()
|
| 33 |
+
foreach ($cmd in @("aws", "eksctl", "kubectl", "helm")) {
|
| 34 |
+
if (-not (Get-Command $cmd -ErrorAction SilentlyContinue)) {
|
| 35 |
+
$missing += $cmd
|
| 36 |
+
}
|
| 37 |
+
}
|
| 38 |
+
if ($missing.Count -gt 0) {
|
| 39 |
+
Write-Host "ERROR: Missing prerequisites: $($missing -join ', ')" -ForegroundColor Red
|
| 40 |
+
Write-Host "Install them first:" -ForegroundColor Yellow
|
| 41 |
+
Write-Host " choco install awscli eksctl kubernetes-cli kubernetes-helm -y" -ForegroundColor Yellow
|
| 42 |
+
exit 1
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
# --- Phase 1: Create EKS Cluster ---
|
| 46 |
+
Write-Host ""
|
| 47 |
+
Write-Host ">>> Phase 1: Creating EKS cluster (without nodegroup)..." -ForegroundColor Yellow
|
| 48 |
+
|
| 49 |
+
# Detect an existing cluster from eksctl's exit code. Whether a failing native
# command raises a terminating error depends on the PowerShell version and
# settings, so the exit code is checked explicitly; reaching the next statement
# is not proof that the cluster exists.
$clusterExists = $false
try {
    eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
    $clusterExists = ($LASTEXITCODE -eq 0)
} catch {
    $clusterExists = $false
}
|
| 54 |
+
|
| 55 |
+
if ($clusterExists) {
    Write-Host "Cluster $ClusterName already exists, skipping creation."
} else {
    # Create cluster without nodegroup first (faster, avoids timeout)
    $TempClusterConfig = Join-Path $AwsDir "eksctl-cluster-only.yaml"
    $ClusterYaml = Get-Content (Join-Path $AwsDir "eksctl-cluster.yaml") -Raw
    # Remove nodegroups section for initial cluster creation
    $ClusterOnlyYaml = $ClusterYaml -replace '(?s)(managedNodeGroups:.*)', ''
    $ClusterOnlyYaml | Out-File -FilePath $TempClusterConfig -Encoding utf8
    try {
        eksctl create cluster -f $TempClusterConfig
        # Native commands report failure through the exit code; stop here rather
        # than continue against a cluster that was not created.
        if ($LASTEXITCODE -ne 0) {
            throw "eksctl create cluster failed with exit code $LASTEXITCODE"
        }
    } finally {
        # Remove the temporary config on failure as well as on success.
        Remove-Item $TempClusterConfig -Force -ErrorAction SilentlyContinue
    }
    Write-Host "Cluster created." -ForegroundColor Green
}
|
| 68 |
+
|
| 69 |
+
aws eks update-kubeconfig --name $ClusterName --region $Region
|
| 70 |
+
Write-Host "kubeconfig updated."
|
| 71 |
+
|
| 72 |
+
# --- Phase 1b: Create Nodegroup Separately ---
|
| 73 |
+
Write-Host ""
|
| 74 |
+
Write-Host ">>> Phase 1b: Creating nodegroup (separate step to avoid timeout)..." -ForegroundColor Yellow
|
| 75 |
+
|
| 76 |
+
# Decide whether the "linux-nodes" nodegroup is already present. The listing is
# captured and actually tested: the flag is set only when eksctl succeeded AND
# the nodegroup name appears in its output.
$nodegroupExists = $false
try {
    $nodegroupListing = eksctl get nodegroup --cluster $ClusterName --region $Region 2>$null
    if ($LASTEXITCODE -eq 0 -and ($nodegroupListing | Select-String -SimpleMatch "linux-nodes" -Quiet)) {
        $nodegroupExists = $true
    }
} catch {
    $nodegroupExists = $false
}

if ($nodegroupExists) {
    Write-Host "Nodegroup already exists, skipping creation."
} else {
    # Create nodegroup separately (better error handling, can retry)
    eksctl create nodegroup --config-file (Join-Path $AwsDir "eksctl-cluster.yaml")
    if ($LASTEXITCODE -ne 0) {
        Write-Host "ERROR: eksctl create nodegroup failed (exit code $LASTEXITCODE)." -ForegroundColor Red
        exit 1
    }
    Write-Host "Nodegroup created." -ForegroundColor Green
}
|
| 89 |
+
|
| 90 |
+
# Verify nodes are ready
|
| 91 |
+
# Poll up to 30 times, 10 seconds apart, until at least one node reports the
# Ready status. Listing a node is not enough: a freshly joined node shows
# "NotReady" first, so the STATUS column is matched explicitly.
Write-Host "Waiting for nodes to be ready..."
$nodesReady = $false
for ($i = 0; $i -lt 30; $i++) {
    $nodes = kubectl get nodes --no-headers 2>$null
    # "\sReady" excludes "NotReady"; the trailing group also accepts
    # combined statuses such as "Ready,SchedulingDisabled".
    $readyNodes = @($nodes | Where-Object { $_ -match '\sReady(,|\s)' })
    if ($readyNodes.Count -gt 0) {
        Write-Host "Nodes ready:" -ForegroundColor Green
        kubectl get nodes
        $nodesReady = $true
        break
    }
    Start-Sleep -Seconds 10
}
if (-not $nodesReady) {
    Write-Host "WARNING: Nodes not ready yet. Check with: kubectl get nodes" -ForegroundColor Yellow
}
|
| 106 |
+
|
| 107 |
+
Write-Host "Enabling Prefix Delegation on VPC CNI..."
|
| 108 |
+
kubectl set env daemonset aws-node -n kube-system ENABLE_PREFIX_DELEGATION=true
|
| 109 |
+
Write-Host "Prefix Delegation enabled."
|
| 110 |
+
|
| 111 |
+
# --- Phase 2: Deploy Sample Workloads ---
|
| 112 |
+
Write-Host ""
|
| 113 |
+
Write-Host ">>> Phase 2: Deploying sample workloads (payments, checkout, catalog, cart, auth)..." -ForegroundColor Yellow
|
| 114 |
+
kubectl apply -f (Join-Path $AwsDir "k8s-workloads.yaml")
|
| 115 |
+
Write-Host "Workloads deployed." -ForegroundColor Green
|
| 116 |
+
kubectl get pods -n prod-sre
|
| 117 |
+
|
| 118 |
+
# --- Phase 3: Create AMP Workspace ---
|
| 119 |
+
Write-Host ""
|
| 120 |
+
Write-Host ">>> Phase 3: Creating Amazon Managed Prometheus workspace..." -ForegroundColor Yellow
|
| 121 |
+
|
| 122 |
+
# Look up an existing AMP workspace by alias. With --output text the AWS CLI
# prints the literal "None" when the query selects nothing, so that value (and
# any failed lookup) is normalised to $null.
$AmpWsId = $null
try {
    $AmpWsId = aws amp list-workspaces --alias antiatropos-metrics --region $Region --query 'workspaces[0].workspaceId' --output text 2>$null
    if ($LASTEXITCODE -ne 0 -or $AmpWsId -eq "None") { $AmpWsId = $null }
} catch {
    $AmpWsId = $null
}

if ([string]::IsNullOrWhiteSpace($AmpWsId)) {
    $AmpWsId = aws amp create-workspace `
        --alias antiatropos-metrics `
        --region $Region `
        --query 'workspaceId' `
        --output text

    # Stop here on failure: every later phase embeds this ID in the AMP URL,
    # so continuing with an empty value would configure a broken endpoint.
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($AmpWsId)) {
        Write-Host "ERROR: Failed to create AMP workspace." -ForegroundColor Red
        exit 1
    }
    Write-Host "AMP workspace created: $AmpWsId" -ForegroundColor Green
} else {
    Write-Host "AMP workspace already exists: $AmpWsId"
}
|
| 138 |
+
|
| 139 |
+
$AmpUrl = "https://aps-workspaces.$Region.amazonaws.com/workspaces/$AmpWsId"
|
| 140 |
+
Write-Host "AMP URL: $AmpUrl"
|
| 141 |
+
|
| 142 |
+
# --- Phase 4: Set up IRSA for Prometheus Agent ---
|
| 143 |
+
Write-Host ""
|
| 144 |
+
Write-Host ">>> Phase 4: Setting up IRSA for Prometheus Agent..." -ForegroundColor Yellow
|
| 145 |
+
|
| 146 |
+
# Check whether the prometheus-sa service account already exists. The exit
# code of kubectl is inspected explicitly because a non-zero native exit does
# not raise a PowerShell exception by itself.
$saExists = $false
try {
    kubectl get serviceaccount prometheus-sa -n monitoring 2>$null | Out-Null
    $saExists = ($LASTEXITCODE -eq 0)
} catch {
    $saExists = $false
}

if ($saExists) {
    Write-Host "prometheus-sa already exists."
} else {
    # --region is passed explicitly so the call targets the same region as the
    # rest of this script instead of whatever the AWS profile defaults to.
    eksctl create iamserviceaccount `
        --cluster $ClusterName `
        --region $Region `
        --namespace monitoring `
        --name prometheus-sa `
        --attach-policy-arn "arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess" `
        --approve `
        --override-existing-serviceaccounts
    Write-Host "prometheus-sa created." -ForegroundColor Green
}
|
| 164 |
+
|
| 165 |
+
# --- Phase 5: Install Prometheus Agent ---
|
| 166 |
+
Write-Host ""
|
| 167 |
+
Write-Host ">>> Phase 5: Installing Prometheus Agent (remote-writes to AMP)..." -ForegroundColor Yellow
|
| 168 |
+
|
| 169 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>$null
|
| 170 |
+
helm repo update
|
| 171 |
+
|
| 172 |
+
# Detect an existing prometheus-agent Helm release via the exit code of
# `helm status` (a non-zero native exit does not throw on its own).
$agentInstalled = $false
try {
    helm status prometheus-agent -n monitoring 2>$null | Out-Null
    $agentInstalled = ($LASTEXITCODE -eq 0)
} catch {
    $agentInstalled = $false
}

$promValuesYaml = Join-Path $AwsDir "prometheus-agent-values.yaml"
$remoteWriteUrl = "$AmpUrl/api/v1/remote_write"

# NOTE(review): the --set key below uses the "prometheus.prometheusSpec" prefix;
# confirm it matches the value layout of the prometheus-community/prometheus
# chart and of prometheus-agent-values.yaml, otherwise the URL is not applied.
if ($agentInstalled) {
    Write-Host "prometheus-agent already installed, upgrading..."
    helm upgrade prometheus-agent prometheus-community/prometheus `
        --namespace monitoring `
        -f $promValuesYaml `
        --set "prometheus.prometheusSpec.remoteWrite[0].url=$remoteWriteUrl"
} else {
    helm install prometheus-agent prometheus-community/prometheus `
        --namespace monitoring --create-namespace `
        -f $promValuesYaml `
        --set "prometheus.prometheusSpec.remoteWrite[0].url=$remoteWriteUrl"
    Write-Host "prometheus-agent installed." -ForegroundColor Green
}
|
| 194 |
+
|
| 195 |
+
# --- Phase 6: Install Self-Hosted Grafana on EKS ---
|
| 196 |
+
Write-Host ""
|
| 197 |
+
Write-Host ">>> Phase 6: Installing self-hosted Grafana on EKS..." -ForegroundColor Yellow
|
| 198 |
+
|
| 199 |
+
# Add Grafana Helm repo
|
| 200 |
+
helm repo add grafana https://grafana.github.io/helm-charts 2>$null
|
| 201 |
+
helm repo update
|
| 202 |
+
|
| 203 |
+
# Create a secret with the dashboard JSON files for Grafana to import
|
| 204 |
+
# Locate the dashboard JSON directory. The dashboards are expected one level
# above this script's directory (..\grafana\...); the previously used
# two-levels-up location is kept as a fallback so existing layouts still work.
$DashboardsDir = Join-Path $AwsDir "..\grafana\provisioning\dashboards\json"
if (-not (Test-Path $DashboardsDir)) {
    $DashboardsDir = Join-Path $PSScriptRoot "..\..\grafana\provisioning\dashboards\json"
}

if (Test-Path $DashboardsDir) {
    Write-Host "Creating dashboard secret from $DashboardsDir..."
    $overviewJson = Join-Path $DashboardsDir "antiatropos-overview.json"
    $liveJson = Join-Path $DashboardsDir "antiatropos-live.json"

    # Each --from-file argument is built as one quoted string so the resolved
    # path is always passed to kubectl as part of a single argument.
    # The dry-run + apply pipeline makes the secret creation idempotent.
    kubectl create secret generic antiatropos-grafana-dashboards `
        "--from-file=antiatropos-overview.json=$overviewJson" `
        "--from-file=antiatropos-live.json=$liveJson" `
        --namespace monitoring `
        --dry-run=client -o yaml | kubectl apply -f -
    Write-Host "Dashboard secret created." -ForegroundColor Green
} else {
    Write-Host "Dashboard JSON directory not found at $DashboardsDir, skipping."
}
|
| 216 |
+
|
| 217 |
+
# Install Grafana
|
| 218 |
+
$GrafanaValuesYaml = Join-Path $AwsDir "grafana-values.yaml"

# Detect an existing grafana release from the exit code of `helm status`,
# using the same guarded probe as the other Helm phases of this script rather
# than relying on whether the command happened to print anything.
$grafanaInstalled = $false
try {
    helm status grafana -n monitoring 2>$null | Out-Null
    $grafanaInstalled = ($LASTEXITCODE -eq 0)
} catch {
    $grafanaInstalled = $false
}

if ($grafanaInstalled) {
    Write-Host "Grafana already installed, upgrading..."
    helm upgrade grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml
} else {
    helm install grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml
    Write-Host "Grafana installed." -ForegroundColor Green
}
|
| 227 |
+
|
| 228 |
+
# Wait for Grafana pod to be ready
|
| 229 |
+
Write-Host "Waiting for Grafana pod to be ready..."
|
| 230 |
+
kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>$null | Out-Null
|
| 231 |
+
|
| 232 |
+
$GrafanaPod = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>$null
|
| 233 |
+
Write-Host "Grafana pod: $GrafanaPod"
|
| 234 |
+
Write-Host "To access Grafana: kubectl port-forward svc/grafana 3000 -n monitoring" -ForegroundColor Yellow
|
| 235 |
+
Write-Host "Login: admin / antiatropos"
|
| 236 |
+
|
| 237 |
+
# --- Phase 7: Install Cluster Autoscaler ---
|
| 238 |
+
Write-Host ""
|
| 239 |
+
Write-Host ">>> Phase 7: Installing Cluster Autoscaler..." -ForegroundColor Yellow
|
| 240 |
+
|
| 241 |
+
helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>$null
|
| 242 |
+
helm repo update
|
| 243 |
+
|
| 244 |
+
# Detect an existing cluster-autoscaler release via the exit code of
# `helm status` (a non-zero native exit does not throw on its own).
$autoscalerInstalled = $false
try {
    helm status cluster-autoscaler -n kube-system 2>$null | Out-Null
    $autoscalerInstalled = ($LASTEXITCODE -eq 0)
} catch {
    $autoscalerInstalled = $false
}

$autoscalerValues = Join-Path $AwsDir "cluster-autoscaler-values.yaml"

if ($autoscalerInstalled) {
    Write-Host "cluster-autoscaler already installed, upgrading..."
    helm upgrade cluster-autoscaler autoscaler/cluster-autoscaler `
        --namespace kube-system `
        -f $autoscalerValues
} else {
    helm install cluster-autoscaler autoscaler/cluster-autoscaler `
        --namespace kube-system `
        -f $autoscalerValues
    Write-Host "cluster-autoscaler installed." -ForegroundColor Green
}
|
| 263 |
+
|
| 264 |
+
# --- Phase 8: Generate Kubeconfig for HF Spaces ---
|
| 265 |
+
Write-Host ""
|
| 266 |
+
Write-Host ">>> Phase 8: Generating kubeconfig for HF Spaces..." -ForegroundColor Yellow
|
| 267 |
+
|
| 268 |
+
$generateScript = Join-Path $AwsDir "generate-kubeconfig.ps1"
|
| 269 |
+
if (Test-Path $generateScript) {
|
| 270 |
+
& $generateScript
|
| 271 |
+
} else {
|
| 272 |
+
# Inline kubeconfig generation if the .ps1 version doesn't exist yet
|
| 273 |
+
$output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"
|
| 274 |
+
|
| 275 |
+
# Verify cluster exists
|
| 276 |
+
# Confirm the cluster exists before describing it. The eksctl exit code is
# checked explicitly; without that the flag would be set even when the
# lookup fails without throwing.
$clusterCheck = $false
try {
    eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
    $clusterCheck = ($LASTEXITCODE -eq 0)
} catch {
    $clusterCheck = $false
}
if (-not $clusterCheck) {
    Write-Host "ERROR: Cluster $ClusterName not found." -ForegroundColor Red
    exit 1
}
|
| 285 |
+
|
| 286 |
+
$ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
|
| 287 |
+
$ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
|
| 288 |
+
$Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")
|
| 289 |
+
|
| 290 |
+
$kubeconfig = @"
|
| 291 |
+
# Kubeconfig for AntiAtropos on Hugging Face Spaces
|
| 292 |
+
# Generated: $Timestamp
|
| 293 |
+
# Cluster: $ClusterName
|
| 294 |
+
# Region: $Region
|
| 295 |
+
#
|
| 296 |
+
# This kubeconfig uses AWS IAM authenticator.
|
| 297 |
+
# The HF Space container must have aws-cli available,
|
| 298 |
+
# OR the kubernetes Python client must be configured with AWS credentials.
|
| 299 |
+
|
| 300 |
+
apiVersion: v1
|
| 301 |
+
kind: Config
|
| 302 |
+
clusters:
|
| 303 |
+
- cluster:
|
| 304 |
+
certificate-authority-data: $ClusterCa
|
| 305 |
+
server: $ClusterEndpoint
|
| 306 |
+
name: $ClusterName
|
| 307 |
+
|
| 308 |
+
contexts:
|
| 309 |
+
- context:
|
| 310 |
+
cluster: $ClusterName
|
| 311 |
+
user: antiatropos-hf-user
|
| 312 |
+
name: $ClusterName
|
| 313 |
+
|
| 314 |
+
current-context: $ClusterName
|
| 315 |
+
|
| 316 |
+
preferences: {}
|
| 317 |
+
|
| 318 |
+
users:
|
| 319 |
+
- name: antiatropos-hf-user
|
| 320 |
+
user:
|
| 321 |
+
exec:
|
| 322 |
+
apiVersion: client.authentication.k8s.io/v1beta1
|
| 323 |
+
command: aws
|
| 324 |
+
args:
|
| 325 |
+
- eks
|
| 326 |
+
- get-token
|
| 327 |
+
- --region
|
| 328 |
+
- $Region
|
| 329 |
+
- --cluster-name
|
| 330 |
+
- $ClusterName
|
| 331 |
+
env:
|
| 332 |
+
- name: AWS_STS_REGIONAL_ENDPOINTS
|
| 333 |
+
value: regional
|
| 334 |
+
- name: AWS_DEFAULT_REGION
|
| 335 |
+
value: $Region
|
| 336 |
+
interactiveMode: IfAvailable
|
| 337 |
+
"@
|
| 338 |
+
|
| 339 |
+
$kubeconfig | Out-File -FilePath $output -Encoding utf8 -Force
|
| 340 |
+
Write-Host "Kubeconfig written to: $output" -ForegroundColor Green
|
| 341 |
+
Write-Host ""
|
| 342 |
+
Write-Host "To encode for HF Spaces secret:" -ForegroundColor Yellow
|
| 343 |
+
Write-Host " [Convert]::ToBase64String([System.IO.File]::ReadAllBytes('$output'))"
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
# --- Done ---
|
| 347 |
+
Write-Host ""
|
| 348 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 349 |
+
Write-Host " AntiAtropos AWS Infrastructure Ready!" -ForegroundColor Cyan
|
| 350 |
+
Write-Host "==========================================" -ForegroundColor Cyan
|
| 351 |
+
Write-Host ""
|
| 352 |
+
Write-Host "AMP Workspace ID: $AmpWsId"
|
| 353 |
+
Write-Host "AMP URL: $AmpUrl"
|
| 354 |
+
Write-Host ""
|
| 355 |
+
Write-Host "Grafana: Self-hosted on EKS (monitoring namespace)"
|
| 356 |
+
Write-Host " Access: kubectl port-forward svc/grafana 3000 -n monitoring"
|
| 357 |
+
Write-Host " Login: admin / antiatropos"
|
| 358 |
+
Write-Host " URL: http://localhost:3000"
|
| 359 |
+
Write-Host ""
|
| 360 |
+
Write-Host "Kubeconfig saved: $(Join-Path $AwsDir 'kubeconfig-antiatropos.yaml')"
|
| 361 |
+
Write-Host ""
|
| 362 |
+
Write-Host "Next steps - configure your HF Space:" -ForegroundColor Yellow
|
| 363 |
+
Write-Host " 1. Set secret KUBECONFIG_CONTENT = base64 of kubeconfig-antiatropos.yaml"
|
| 364 |
+
Write-Host " 2. Set env var PROMETHEUS_URL = $AmpUrl"
|
| 365 |
+
Write-Host " 3. Set env var KUBECONFIG = /app/kubeconfig.yaml"
|
| 366 |
+
Write-Host " 4. Set env var ANTIATROPOS_ENV_MODE = live"
|
| 367 |
+
Write-Host " 5. Set env var ANTIATROPOS_MAX_REPLICAS = 6"
|
| 368 |
+
Write-Host " 6. Set env var ANTIATROPOS_WORKLOAD_MAP = (see OPERATIONS.md)"
|
| 369 |
+
Write-Host " 7. Add kubeconfig decode to deploy/entrypoint.sh (see OPERATIONS.md)"
|
deploy/aws/deploy.sh
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# AntiAtropos AWS Infrastructure Deploy Script
|
| 3 |
+
#
|
| 4 |
+
# Deploys: EKS cluster, sample workloads, AMP workspace, Prometheus Agent,
|
| 5 |
+
# self-hosted Grafana (Helm), Cluster Autoscaler, and generates kubeconfig for HF Spaces.
|
| 6 |
+
#
|
| 7 |
+
# The AntiAtropos FastAPI server runs on Hugging Face Spaces, NOT on AWS.
|
| 8 |
+
# This script only sets up the infrastructure that HF Spaces connects to.
|
| 9 |
+
#
|
| 10 |
+
# Prerequisites: aws cli, eksctl, kubectl, helm
|
| 11 |
+
#
|
| 12 |
+
# Usage:
|
| 13 |
+
# chmod +x deploy/aws/deploy.sh
|
| 14 |
+
# ./deploy/aws/deploy.sh
|
| 15 |
+
#
|
| 16 |
+
# Environment variables:
|
| 17 |
+
# AWS_REGION - AWS region (default: ap-south-1)
|
| 18 |
+
# CLUSTER_NAME - EKS cluster name (default: antiatropos)
|
| 19 |
+
|
| 20 |
+
set -euo pipefail
|
| 21 |
+
|
| 22 |
+
REGION="${AWS_REGION:-ap-south-1}"
|
| 23 |
+
CLUSTER_NAME="${CLUSTER_NAME:-antiatropos}"
|
| 24 |
+
AWS_DIR="$(cd "$(dirname "$0")" && pwd)"
|
| 25 |
+
|
| 26 |
+
echo "=== AntiAtropos AWS Infrastructure Deployment ==="
|
| 27 |
+
echo "Region: $REGION"
|
| 28 |
+
echo "Cluster: $CLUSTER_NAME"
|
| 29 |
+
echo "FastAPI: Runs on HF Spaces (not deployed here)"
|
| 30 |
+
echo ""
|
| 31 |
+
|
| 32 |
+
# --- Check prerequisites ---
|
| 33 |
+
for cmd in aws eksctl kubectl helm; do
|
| 34 |
+
if ! command -v "$cmd" &>/dev/null; then
|
| 35 |
+
echo "ERROR: $cmd is not installed. Please install it first."
|
| 36 |
+
exit 1
|
| 37 |
+
fi
|
| 38 |
+
done
|
| 39 |
+
|
| 40 |
+
# --- Phase 1: Create EKS Cluster ---
|
| 41 |
+
echo ""
|
| 42 |
+
echo ">>> Phase 1: Creating EKS cluster..."
|
| 43 |
+
if eksctl get cluster --name "$CLUSTER_NAME" --region "$REGION" &>/dev/null; then
|
| 44 |
+
echo "Cluster $CLUSTER_NAME already exists, skipping creation."
|
| 45 |
+
else
|
| 46 |
+
eksctl create cluster -f "$AWS_DIR/eksctl-cluster.yaml"
|
| 47 |
+
echo "Cluster created."
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$REGION"
|
| 51 |
+
echo "kubeconfig updated."
|
| 52 |
+
|
| 53 |
+
# --- Phase 2: Deploy Sample Workloads ---
|
| 54 |
+
echo ""
|
| 55 |
+
echo ">>> Phase 2: Deploying sample workloads (payments, checkout, catalog, cart, auth)..."
|
| 56 |
+
kubectl apply -f "$AWS_DIR/k8s-workloads.yaml"
|
| 57 |
+
echo "Workloads deployed."
|
| 58 |
+
kubectl get pods -n prod-sre
|
| 59 |
+
|
| 60 |
+
# --- Phase 3: Create AMP Workspace ---
|
| 61 |
+
echo ""
|
| 62 |
+
echo ">>> Phase 3: Creating Amazon Managed Prometheus workspace..."
|
| 63 |
+
AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region "$REGION" --query 'workspaces[0].workspaceId' --output text 2>/dev/null || echo "")
|
| 64 |
+
|
| 65 |
+
if [ -z "$AMP_WS_ID" ] || [ "$AMP_WS_ID" = "None" ]; then
|
| 66 |
+
AMP_WS_ID=$(aws amp create-workspace \
|
| 67 |
+
--alias antiatropos-metrics \
|
| 68 |
+
--region "$REGION" \
|
| 69 |
+
--query 'workspaceId' \
|
| 70 |
+
--output text)
|
| 71 |
+
echo "AMP workspace created: $AMP_WS_ID"
|
| 72 |
+
else
|
| 73 |
+
echo "AMP workspace already exists: $AMP_WS_ID"
|
| 74 |
+
fi
|
| 75 |
+
|
| 76 |
+
AMP_URL="https://aps-workspaces.$REGION.amazonaws.com/workspaces/$AMP_WS_ID"
|
| 77 |
+
echo "AMP URL: $AMP_URL"
|
| 78 |
+
|
| 79 |
+
# --- Phase 4: Set up IRSA for Prometheus Agent ---
|
| 80 |
+
echo ""
|
| 81 |
+
echo ">>> Phase 4: Setting up IRSA for Prometheus Agent..."
|
| 82 |
+
# Create the IRSA-backed service account for the Prometheus agent unless it
# already exists. --region is passed explicitly so eksctl targets the same
# region as the rest of this script rather than the AWS profile default.
if kubectl get serviceaccount prometheus-sa -n monitoring &>/dev/null; then
  echo "prometheus-sa already exists."
else
  eksctl create iamserviceaccount \
    --cluster "$CLUSTER_NAME" \
    --region "$REGION" \
    --namespace monitoring \
    --name prometheus-sa \
    --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
    --approve \
    --override-existing-serviceaccounts
  echo "prometheus-sa created."
fi
|
| 94 |
+
|
| 95 |
+
# --- Phase 5: Install Prometheus Agent ---
|
| 96 |
+
echo ""
|
| 97 |
+
echo ">>> Phase 5: Installing Prometheus Agent (remote-writes to AMP)..."
|
| 98 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
|
| 99 |
+
helm repo update
|
| 100 |
+
|
| 101 |
+
if helm status prometheus-agent -n monitoring &>/dev/null; then
|
| 102 |
+
echo "prometheus-agent already installed, upgrading..."
|
| 103 |
+
helm upgrade prometheus-agent prometheus-community/prometheus \
|
| 104 |
+
--namespace monitoring \
|
| 105 |
+
-f "$AWS_DIR/prometheus-agent-values.yaml" \
|
| 106 |
+
--set "prometheus.prometheusSpec.remoteWrite[0].url=$AMP_URL/api/v1/remote_write"
|
| 107 |
+
else
|
| 108 |
+
helm install prometheus-agent prometheus-community/prometheus \
|
| 109 |
+
--namespace monitoring --create-namespace \
|
| 110 |
+
-f "$AWS_DIR/prometheus-agent-values.yaml" \
|
| 111 |
+
--set "prometheus.prometheusSpec.remoteWrite[0].url=$AMP_URL/api/v1/remote_write"
|
| 112 |
+
echo "prometheus-agent installed."
|
| 113 |
+
fi
|
| 114 |
+
|
| 115 |
+
# --- Phase 6: Install Self-Hosted Grafana on EKS ---
|
| 116 |
+
echo ""
|
| 117 |
+
echo ">>> Phase 6: Installing self-hosted Grafana on EKS..."
|
| 118 |
+
|
| 119 |
+
# Add Grafana Helm repo
|
| 120 |
+
helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
|
| 121 |
+
helm repo update
|
| 122 |
+
|
| 123 |
+
# Create a secret with the dashboard JSON files for Grafana to import
|
| 124 |
+
# Locate the dashboard JSON directory. The dashboards are expected one level
# above this script's directory (../grafana/...); the previously used
# two-levels-up location is kept as a fallback so existing layouts still work.
DASHBOARDS_DIR="$AWS_DIR/../grafana/provisioning/dashboards/json"
if [ ! -d "$DASHBOARDS_DIR" ]; then
  DASHBOARDS_DIR="$AWS_DIR/../../grafana/provisioning/dashboards/json"
fi

if [ -d "$DASHBOARDS_DIR" ]; then
  echo "Creating dashboard secret from $DASHBOARDS_DIR..."
  # dry-run + apply keeps the secret creation idempotent across re-runs.
  kubectl create secret generic antiatropos-grafana-dashboards \
    --from-file=antiatropos-overview.json="$DASHBOARDS_DIR/antiatropos-overview.json" \
    --from-file=antiatropos-live.json="$DASHBOARDS_DIR/antiatropos-live.json" \
    --namespace monitoring \
    --dry-run=client -o yaml | kubectl apply -f -
  echo "Dashboard secret created."
else
  echo "Dashboard JSON directory not found at $DASHBOARDS_DIR, skipping."
fi
|
| 136 |
+
|
| 137 |
+
# Install Grafana
|
| 138 |
+
GRAFANA_VALUES="$AWS_DIR/grafana-values.yaml"
|
| 139 |
+
|
| 140 |
+
if helm status grafana -n monitoring &>/dev/null; then
|
| 141 |
+
echo "Grafana already installed, upgrading..."
|
| 142 |
+
helm upgrade grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
|
| 143 |
+
else
|
| 144 |
+
helm install grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
|
| 145 |
+
echo "Grafana installed."
|
| 146 |
+
fi
|
| 147 |
+
|
| 148 |
+
# Wait for Grafana pod to be ready
|
| 149 |
+
# Wait (best effort) for the Grafana deployment, then report the pod name.
echo "Waiting for Grafana pod to be ready..."
kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>/dev/null || true

# The lookup is informational only. Under `set -e` a failing command
# substitution in an assignment aborts the script, so a missing pod (the
# items[0] index cannot be resolved) is tolerated here just like the rollout
# timeout above.
GRAFANA_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || true
echo "Grafana pod: $GRAFANA_POD"
|
| 154 |
+
echo "To access Grafana: kubectl port-forward svc/grafana 3000 -n monitoring"
|
| 155 |
+
echo "Login: admin / antiatropos"
|
| 156 |
+
|
| 157 |
+
# --- Phase 7: Install Cluster Autoscaler ---
|
| 158 |
+
echo ""
|
| 159 |
+
echo ">>> Phase 7: Installing Cluster Autoscaler..."
|
| 160 |
+
helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>/dev/null || true
|
| 161 |
+
helm repo update
|
| 162 |
+
|
| 163 |
+
if helm status cluster-autoscaler -n kube-system &>/dev/null; then
|
| 164 |
+
echo "cluster-autoscaler already installed, upgrading..."
|
| 165 |
+
helm upgrade cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 166 |
+
--namespace kube-system \
|
| 167 |
+
-f "$AWS_DIR/cluster-autoscaler-values.yaml"
|
| 168 |
+
else
|
| 169 |
+
helm install cluster-autoscaler autoscaler/cluster-autoscaler \
|
| 170 |
+
--namespace kube-system \
|
| 171 |
+
-f "$AWS_DIR/cluster-autoscaler-values.yaml"
|
| 172 |
+
echo "cluster-autoscaler installed."
|
| 173 |
+
fi
|
| 174 |
+
|
| 175 |
+
# --- Phase 8: Generate Kubeconfig for HF Spaces ---
|
| 176 |
+
echo ""
|
| 177 |
+
echo ">>> Phase 8: Generating kubeconfig for HF Spaces..."
|
| 178 |
+
"$AWS_DIR/generate-kubeconfig.sh"
|
| 179 |
+
|
| 180 |
+
# --- Done ---
|
| 181 |
+
echo ""
|
| 182 |
+
echo "=========================================="
|
| 183 |
+
echo " AntiAtropos AWS Infrastructure Ready!"
|
| 184 |
+
echo "=========================================="
|
| 185 |
+
echo ""
|
| 186 |
+
echo "AMP Workspace ID: $AMP_WS_ID"
|
| 187 |
+
echo "AMP URL: $AMP_URL"
|
| 188 |
+
echo ""
|
| 189 |
+
echo "Grafana: Self-hosted on EKS (monitoring namespace)"
|
| 190 |
+
echo " Access: kubectl port-forward svc/grafana 3000 -n monitoring"
|
| 191 |
+
echo " Login: admin / antiatropos"
|
| 192 |
+
echo " URL: http://localhost:3000"
|
| 193 |
+
echo ""
|
| 194 |
+
echo "Kubeconfig saved: $AWS_DIR/kubeconfig-antiatropos.yaml"
|
| 195 |
+
echo ""
|
| 196 |
+
echo "Next steps — configure your HF Space:"
|
| 197 |
+
echo " 1. Set secret KUBECONFIG_CONTENT = base64 of kubeconfig-antiatropos.yaml"
|
| 198 |
+
echo " 2. Set env var PROMETHEUS_URL = $AMP_URL"
|
| 199 |
+
echo " 3. Set env var KUBECONFIG = /app/kubeconfig.yaml"
|
| 200 |
+
echo " 4. Set env var ANTIATROPOS_ENV_MODE = live"
|
| 201 |
+
echo " 5. Set env var ANTIATROPOS_MAX_REPLICAS = 6"
|
| 202 |
+
echo " 6. Set env var ANTIATROPOS_WORKLOAD_MAP = (see OPERATIONS.md)"
|
| 203 |
+
echo " 7. Add kubeconfig decode to deploy/entrypoint.sh (see OPERATIONS.md)"
|
| 204 |
+
|
deploy/aws/eksctl-cluster.yaml
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: eksctl.io/v1alpha5
|
| 2 |
+
kind: ClusterConfig
|
| 3 |
+
|
| 4 |
+
metadata:
|
| 5 |
+
name: antiatropos
|
| 6 |
+
region: ap-south-1
|
| 7 |
+
version: "1.30"
|
| 8 |
+
tags:
|
| 9 |
+
Project: AntiAtropos
|
| 10 |
+
Environment: production
|
| 11 |
+
|
| 12 |
+
autoModeConfig:
|
| 13 |
+
enabled: false
|
| 14 |
+
|
| 15 |
+
iam:
|
| 16 |
+
withOIDC: true
|
| 17 |
+
|
| 18 |
+
addons:
|
| 19 |
+
- name: vpc-cni
|
| 20 |
+
version: latest
|
| 21 |
+
- name: coredns
|
| 22 |
+
version: latest
|
| 23 |
+
- name: kube-proxy
|
| 24 |
+
version: latest
|
| 25 |
+
- name: aws-ebs-csi-driver
|
| 26 |
+
version: latest
|
| 27 |
+
wellKnownPolicies:
|
| 28 |
+
ebsCSIController: true
|
| 29 |
+
|
| 30 |
+
managedNodeGroups:
|
| 31 |
+
- name: linux-nodes
|
| 32 |
+
instanceType: t3.micro
|
| 33 |
+
maxPodsPerNode: 110
|
| 34 |
+
desiredCapacity: 2
|
| 35 |
+
minSize: 1
|
| 36 |
+
maxSize: 4
|
| 37 |
+
volumeSize: 50
|
| 38 |
+
volumeType: gp3
|
| 39 |
+
availabilityZones:
|
| 40 |
+
- ap-south-1a
|
| 41 |
+
- ap-south-1b
|
| 42 |
+
labels:
|
| 43 |
+
role: worker
|
| 44 |
+
tags:
|
| 45 |
+
Project: AntiAtropos
|
| 46 |
+
NodeGroup: linux-nodes
|
| 47 |
+
iam:
|
| 48 |
+
withAddonPolicies:
|
| 49 |
+
ebs: true
|
| 50 |
+
cloudWatch: true
|
| 51 |
+
autoScaler: true
|
| 52 |
+
|
| 53 |
+
cloudWatch:
|
| 54 |
+
clusterLogging:
|
| 55 |
+
enableTypes:
|
| 56 |
+
- api
|
| 57 |
+
- audit
|
| 58 |
+
- authenticator
|
deploy/aws/generate-kubeconfig.ps1
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generate a kubeconfig for HF Spaces to connect to the EKS cluster.
|
| 2 |
+
#
|
| 3 |
+
# This creates a kubeconfig that uses AWS IAM authenticator,
|
| 4 |
+
# which works from outside the cluster (like from HF Spaces).
|
| 5 |
+
#
|
| 6 |
+
# Prerequisites: aws cli, kubectl, eksctl
|
| 7 |
+
#
|
| 8 |
+
# Usage:
|
| 9 |
+
# .\deploy\aws\generate-kubeconfig.ps1
|
| 10 |
+
#
|
| 11 |
+
# Output:
|
| 12 |
+
# deploy/aws/kubeconfig-antiatropos.yaml
|
| 13 |
+
#
|
| 14 |
+
# Then on HF Spaces:
|
| 15 |
+
# 1. base64 encode: $b64 = [Convert]::ToBase64String([IO.File]::ReadAllBytes('deploy\aws\kubeconfig-antiatropos.yaml'))
|
| 16 |
+
# 2. Set as HF Space secret: KUBECONFIG_CONTENT = <base64 output>
|
| 17 |
+
# 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
|
| 18 |
+
# 4. Add to deploy/entrypoint.sh:
|
| 19 |
+
# if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
|
| 20 |
+
# echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
|
| 21 |
+
# export KUBECONFIG=/app/kubeconfig.yaml
|
| 22 |
+
# fi
|
| 23 |
+
|
| 24 |
+
$ErrorActionPreference = "Stop"
|
| 25 |
+
|
| 26 |
+
$Region = if ($env:AWS_REGION) { $env:AWS_REGION } else { "ap-south-1" }
|
| 27 |
+
$ClusterName = if ($env:CLUSTER_NAME) { $env:CLUSTER_NAME } else { "antiatropos" }
|
| 28 |
+
$AwsDir = Split-Path -Parent $MyInvocation.MyCommand.Path
|
| 29 |
+
$Output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"
|
| 30 |
+
|
| 31 |
+
Write-Host ""
|
| 32 |
+
Write-Host "=== Generating kubeconfig for HF Spaces ===" -ForegroundColor Cyan
|
| 33 |
+
Write-Host "Cluster: $ClusterName"
|
| 34 |
+
Write-Host "Region: $Region"
|
| 35 |
+
Write-Host ""
|
| 36 |
+
|
| 37 |
+
# Verify cluster exists
|
| 38 |
+
# Confirm the cluster exists before querying its endpoint and CA data. A
# native command that exits non-zero does not raise a PowerShell exception by
# itself, so the exit code is inspected explicitly; the try/catch is kept for
# hosts where redirected stderr surfaces as a terminating error.
$clusterExists = $false
try {
    eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
    $clusterExists = ($LASTEXITCODE -eq 0)
} catch {
    $clusterExists = $false
}

if (-not $clusterExists) {
    Write-Host "ERROR: Cluster $ClusterName not found. Create it first with eksctl." -ForegroundColor Red
    exit 1
}
|
| 48 |
+
|
| 49 |
+
# Get cluster details
|
| 50 |
+
$ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
|
| 51 |
+
$ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
|
| 52 |
+
$AwsArn = aws sts get-caller-identity --query Arn --output text
|
| 53 |
+
$Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")
|
| 54 |
+
|
| 55 |
+
Write-Host "Cluster endpoint: $ClusterEndpoint"
|
| 56 |
+
Write-Host "AWS identity: $AwsArn"
|
| 57 |
+
Write-Host ""
|
| 58 |
+
|
| 59 |
+
# Generate the kubeconfig
|
| 60 |
+
$kubeconfig = @"
|
| 61 |
+
# Kubeconfig for AntiAtropos on Hugging Face Spaces
|
| 62 |
+
# Generated: $Timestamp
|
| 63 |
+
# Cluster: $ClusterName
|
| 64 |
+
# Region: $Region
|
| 65 |
+
#
|
| 66 |
+
# This kubeconfig uses AWS IAM authenticator.
|
| 67 |
+
# The HF Space container must have aws-cli and aws-iam-authenticator available,
|
| 68 |
+
# OR the kubernetes Python client must be configured with AWS credentials.
|
| 69 |
+
#
|
| 70 |
+
# To use this on HF Spaces:
|
| 71 |
+
# 1. base64 encode this file
|
| 72 |
+
# 2. Set as HF secret: KUBECONFIG_CONTENT = <base64>
|
| 73 |
+
# 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
|
| 74 |
+
# 4. Decode in entrypoint.sh before uvicorn starts
|
| 75 |
+
|
| 76 |
+
apiVersion: v1
|
| 77 |
+
kind: Config
|
| 78 |
+
clusters:
|
| 79 |
+
- cluster:
|
| 80 |
+
certificate-authority-data: $ClusterCa
|
| 81 |
+
server: $ClusterEndpoint
|
| 82 |
+
name: $ClusterName
|
| 83 |
+
|
| 84 |
+
contexts:
|
| 85 |
+
- context:
|
| 86 |
+
cluster: $ClusterName
|
| 87 |
+
user: antiatropos-hf-user
|
| 88 |
+
name: $ClusterName
|
| 89 |
+
|
| 90 |
+
current-context: $ClusterName
|
| 91 |
+
|
| 92 |
+
preferences: {}
|
| 93 |
+
|
| 94 |
+
users:
|
| 95 |
+
- name: antiatropos-hf-user
|
| 96 |
+
user:
|
| 97 |
+
exec:
|
| 98 |
+
apiVersion: client.authentication.k8s.io/v1beta1
|
| 99 |
+
command: aws
|
| 100 |
+
args:
|
| 101 |
+
- eks
|
| 102 |
+
- get-token
|
| 103 |
+
- --region
|
| 104 |
+
- $Region
|
| 105 |
+
- --cluster-name
|
| 106 |
+
- $ClusterName
|
| 107 |
+
env:
|
| 108 |
+
- name: AWS_STS_REGIONAL_ENDPOINTS
|
| 109 |
+
value: regional
|
| 110 |
+
- name: AWS_DEFAULT_REGION
|
| 111 |
+
value: $Region
|
| 112 |
+
interactiveMode: IfAvailable
|
| 113 |
+
"@
|
| 114 |
+
|
| 115 |
+
$kubeconfig | Out-File -FilePath $Output -Encoding utf8 -Force
|
| 116 |
+
|
| 117 |
+
Write-Host "Kubeconfig written to: $Output" -ForegroundColor Green
|
| 118 |
+
Write-Host ""
|
| 119 |
+
Write-Host "IMPORTANT: The HF Space container needs the AWS CLI and credentials" -ForegroundColor Yellow
|
| 120 |
+
Write-Host "to authenticate with EKS. You have two options:"
|
| 121 |
+
Write-Host ""
|
| 122 |
+
Write-Host "Option A: Include aws-cli in your Docker image and set AWS_ACCESS_KEY_ID /"
|
| 123 |
+
Write-Host " AWS_SECRET_ACCESS_KEY as HF Space secrets."
|
| 124 |
+
Write-Host ""
|
| 125 |
+
Write-Host "Option B: Use the kubernetes Python client with AWS SDK (boto3)."
|
| 126 |
+
Write-Host " The kubernetes_executor.py already supports this via"
|
| 127 |
+
Write-Host " load_kube_config() which uses the Python client's auth plugins."
|
| 128 |
+
Write-Host ""
|
| 129 |
+
Write-Host "To encode for HF Spaces secret:" -ForegroundColor Yellow
|
| 130 |
+
Write-Host " [Convert]::ToBase64String([IO.File]::ReadAllBytes('$Output'))"
|
| 131 |
+
|
deploy/aws/generate-kubeconfig.sh
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Generate a kubeconfig for HF Spaces to connect to the EKS cluster.
|
| 3 |
+
#
|
| 4 |
+
# This creates a kubeconfig that authenticates with AWS IAM through the aws CLI exec plugin,
|
| 5 |
+
# which works from outside the cluster (like from HF Spaces).
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - aws cli
|
| 9 |
+
# - kubectl
|
| 10 |
+
# - eksctl
|
| 11 |
+
# - The EKS cluster must already exist
|
| 12 |
+
#
|
| 13 |
+
# Usage:
|
| 14 |
+
# ./generate-kubeconfig.sh
|
| 15 |
+
#
|
| 16 |
+
# Output:
|
| 17 |
+
# deploy/aws/kubeconfig-antiatropos.yaml
|
| 18 |
+
#
|
| 19 |
+
# Then on HF Spaces:
|
| 20 |
+
# 1. base64 encode: cat kubeconfig-antiatropos.yaml | base64 -w 0
|
| 21 |
+
# 2. Set as HF Space secret: KUBECONFIG_CONTENT = <base64 output>
|
| 22 |
+
# 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
|
| 23 |
+
# 4. Add to deploy/entrypoint.sh:
|
| 24 |
+
# if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
|
| 25 |
+
# echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
|
| 26 |
+
# export KUBECONFIG=/app/kubeconfig.yaml
|
| 27 |
+
# fi
|
| 28 |
+
|
| 29 |
+
set -euo pipefail
|
| 30 |
+
|
| 31 |
+
REGION="${AWS_REGION:-ap-south-1}"
|
| 32 |
+
CLUSTER_NAME="${CLUSTER_NAME:-antiatropos}"
|
| 33 |
+
AWS_DIR="$(cd "$(dirname "$0")" && pwd)"
|
| 34 |
+
OUTPUT="$AWS_DIR/kubeconfig-antiatropos.yaml"
|
| 35 |
+
|
| 36 |
+
echo "=== Generating kubeconfig for HF Spaces ==="
|
| 37 |
+
echo "Cluster: $CLUSTER_NAME"
|
| 38 |
+
echo "Region: $REGION"
|
| 39 |
+
echo ""
|
| 40 |
+
|
| 41 |
+
# Verify cluster exists
|
| 42 |
+
if ! eksctl get cluster --name "$CLUSTER_NAME" --region "$REGION" &>/dev/null; then
|
| 43 |
+
echo "ERROR: Cluster $CLUSTER_NAME not found. Create it first with eksctl."
|
| 44 |
+
exit 1
|
| 45 |
+
fi
|
| 46 |
+
|
| 47 |
+
# Get cluster details
|
| 48 |
+
CLUSTER_ENDPOINT=$(aws eks describe-cluster \
|
| 49 |
+
--name "$CLUSTER_NAME" \
|
| 50 |
+
--region "$REGION" \
|
| 51 |
+
--query 'cluster.endpoint' \
|
| 52 |
+
--output text)
|
| 53 |
+
|
| 54 |
+
CLUSTER_CA=$(aws eks describe-cluster \
|
| 55 |
+
--name "$CLUSTER_NAME" \
|
| 56 |
+
--region "$REGION" \
|
| 57 |
+
--query 'cluster.certificateAuthority.data' \
|
| 58 |
+
--output text)
|
| 59 |
+
|
| 60 |
+
# Get the current AWS identity for the kubeconfig
|
| 61 |
+
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
|
| 62 |
+
AWS_ARN=$(aws sts get-caller-identity --query Arn --output text)
|
| 63 |
+
|
| 64 |
+
echo "Cluster endpoint: $CLUSTER_ENDPOINT"
|
| 65 |
+
echo "AWS identity: $AWS_ARN"
|
| 66 |
+
echo ""
|
| 67 |
+
|
| 68 |
+
# Generate the kubeconfig
|
| 69 |
+
cat > "$OUTPUT" <<EOF
|
| 70 |
+
# Kubeconfig for AntiAtropos on Hugging Face Spaces
|
| 71 |
+
# Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
| 72 |
+
# Cluster: $CLUSTER_NAME
|
| 73 |
+
# Region: $REGION
|
| 74 |
+
#
|
| 75 |
+
# This kubeconfig uses AWS IAM authenticator.
|
| 76 |
+
# The HF Space container must have aws-cli and aws-iam-authenticator available,
|
| 77 |
+
# OR the kubernetes Python client must be configured with AWS credentials.
|
| 78 |
+
#
|
| 79 |
+
# To use this on HF Spaces:
|
| 80 |
+
# 1. base64 encode this file: cat kubeconfig-antiatropos.yaml | base64 -w 0
|
| 81 |
+
# 2. Set as HF secret: KUBECONFIG_CONTENT = <base64>
|
| 82 |
+
# 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
|
| 83 |
+
# 4. Decode in entrypoint.sh before uvicorn starts
|
| 84 |
+
|
| 85 |
+
apiVersion: v1
|
| 86 |
+
kind: Config
|
| 87 |
+
clusters:
|
| 88 |
+
- cluster:
|
| 89 |
+
certificate-authority-data: $CLUSTER_CA
|
| 90 |
+
server: $CLUSTER_ENDPOINT
|
| 91 |
+
name: $CLUSTER_NAME
|
| 92 |
+
|
| 93 |
+
contexts:
|
| 94 |
+
- context:
|
| 95 |
+
cluster: $CLUSTER_NAME
|
| 96 |
+
user: antiatropos-hf-user
|
| 97 |
+
name: $CLUSTER_NAME
|
| 98 |
+
|
| 99 |
+
current-context: $CLUSTER_NAME
|
| 100 |
+
|
| 101 |
+
preferences: {}
|
| 102 |
+
|
| 103 |
+
users:
|
| 104 |
+
- name: antiatropos-hf-user
|
| 105 |
+
user:
|
| 106 |
+
exec:
|
| 107 |
+
apiVersion: client.authentication.k8s.io/v1beta1
|
| 108 |
+
command: aws
|
| 109 |
+
args:
|
| 110 |
+
- eks
|
| 111 |
+
- get-token
|
| 112 |
+
- --region
|
| 113 |
+
- $REGION
|
| 114 |
+
- --cluster-name
|
| 115 |
+
- $CLUSTER_NAME
|
| 116 |
+
env:
|
| 117 |
+
- name: AWS_STS_REGIONAL_ENDPOINTS
|
| 118 |
+
value: regional
|
| 119 |
+
- name: AWS_DEFAULT_REGION
|
| 120 |
+
value: $REGION
|
| 121 |
+
interactiveMode: IfAvailable
|
| 122 |
+
EOF
|
| 123 |
+
|
| 124 |
+
echo "Kubeconfig written to: $OUTPUT"
|
| 125 |
+
echo ""
|
| 126 |
+
echo "IMPORTANT: The HF Space container needs the AWS CLI and credentials"
|
| 127 |
+
echo "to authenticate with EKS. You have two options:"
|
| 128 |
+
echo ""
|
| 129 |
+
echo "Option A: Include aws-cli in your Docker image and set AWS_ACCESS_KEY_ID /"
|
| 130 |
+
echo " AWS_SECRET_ACCESS_KEY as HF Space secrets."
|
| 131 |
+
echo ""
|
| 132 |
+
echo "Option B: Use the kubernetes Python client with AWS SDK (boto3)."
|
| 133 |
+
echo " The kubernetes_executor.py already supports this via"
|
| 134 |
+
echo " load_kube_config() which uses the Python client's auth plugins."
|
| 135 |
+
echo ""
|
| 136 |
+
echo "To encode for HF Spaces secret:"
|
| 137 |
+
echo " cat $OUTPUT | base64 -w 0"
|
| 138 |
+
|
deploy/aws/grafana-trust-policy.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Version": "2012-10-17",
|
| 3 |
+
"Statement": [
|
| 4 |
+
{
|
| 5 |
+
"Effect": "Allow",
|
| 6 |
+
"Principal": {
|
| 7 |
+
"Service": "grafana.amazonaws.com"
|
| 8 |
+
},
|
| 9 |
+
"Action": "sts:AssumeRole"
|
| 10 |
+
}
|
| 11 |
+
]
|
| 12 |
+
}
|
deploy/aws/grafana-values.yaml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana self-hosted on EKS
|
| 2 |
+
# Connects to the local Prometheus agent and imports AntiAtropos dashboards
|
| 3 |
+
|
| 4 |
+
replicaCount: 1
|
| 5 |
+
|
| 6 |
+
adminUser: admin
|
| 7 |
+
adminPassword: antiatropos
|
| 8 |
+
|
| 9 |
+
service:
|
| 10 |
+
type: ClusterIP
|
| 11 |
+
port: 80
|
| 12 |
+
|
| 13 |
+
persistence:
|
| 14 |
+
enabled: true
|
| 15 |
+
size: 5Gi
|
| 16 |
+
storageClassName: gp2
|
| 17 |
+
|
| 18 |
+
# Use the local Prometheus agent as data source
|
| 19 |
+
additionalDataSources:
|
| 20 |
+
- name: AMP-Local
|
| 21 |
+
type: prometheus
|
| 22 |
+
access: proxy
|
| 23 |
+
url: http://prometheus-agent-server.monitoring.svc.cluster.local:80
|
| 24 |
+
isDefault: true
|
| 25 |
+
editable: true
|
| 26 |
+
|
| 27 |
+
# Import AntiAtropos dashboards
|
| 28 |
+
dashboardProviders:
|
| 29 |
+
dashboardproviders.yaml:
|
| 30 |
+
apiVersion: 1
|
| 31 |
+
providers:
|
| 32 |
+
- name: 'default'
|
| 33 |
+
orgId: 1
|
| 34 |
+
folder: 'AntiAtropos'
|
| 35 |
+
type: file
|
| 36 |
+
disableDeletion: false
|
| 37 |
+
editable: true
|
| 38 |
+
options:
|
| 39 |
+
path: /var/lib/grafana/dashboards
|
| 40 |
+
|
| 41 |
+
dashboards:
|
| 42 |
+
default:
|
| 43 |
+
antiatropos-overview:
|
| 44 |
+
gnetId: null
|
| 45 |
+
datasource: AMP-Local
|
| 46 |
+
antiatropos-live:
|
| 47 |
+
gnetId: null
|
| 48 |
+
datasource: AMP-Local
|
| 49 |
+
|
| 50 |
+
# Allow dashboard JSON files to be mounted
|
| 51 |
+
extraSecretMounts:
|
| 52 |
+
- name: dashboards
|
| 53 |
+
mountPath: /var/lib/grafana/dashboards
|
| 54 |
+
subPath: ""
|
| 55 |
+
secretName: antiatropos-grafana-dashboards
|
| 56 |
+
readOnly: true
|
| 57 |
+
|
| 58 |
+
resources:
|
| 59 |
+
limits:
|
| 60 |
+
memory: 512Mi
|
| 61 |
+
cpu: 250m
|
| 62 |
+
requests:
|
| 63 |
+
memory: 256Mi
|
| 64 |
+
cpu: 100m
|
| 65 |
+
|
| 66 |
+
nodeSelector: {}
|
| 67 |
+
tolerations: []
|
| 68 |
+
affinity: {}
|
deploy/aws/k8s-workloads.yaml
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sample microservice deployments for AntiAtropos SRE training.
|
| 2 |
+
#
|
| 3 |
+
# These are the workloads the SRE agent will SCALE_UP / SCALE_DOWN / REROUTE_TRAFFIC / SHED_LOAD.
|
| 4 |
+
# Each maps to a simulator node via ANTIATROPOS_WORKLOAD_MAP on HF Spaces.
|
| 5 |
+
#
|
| 6 |
+
# Apply: kubectl apply -f k8s-workloads.yaml
|
| 7 |
+
#
|
| 8 |
+
# The Prometheus Agent (in monitoring namespace) scrapes these pods
|
| 9 |
+
# because they have the prometheus.io/scrape annotation.
|
| 10 |
+
# Metrics are remote-written to AMP where the AntiAtropos server on HF Spaces queries them.
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
apiVersion: v1
|
| 14 |
+
kind: Namespace
|
| 15 |
+
metadata:
|
| 16 |
+
name: prod-sre
|
| 17 |
+
labels:
|
| 18 |
+
app.kubernetes.io/part-of: antiatropos
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
# ResourceQuota: Hard cap on pods in prod-sre namespace.
|
| 22 |
+
# This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
|
| 23 |
+
# Kubernetes will refuse to create pods beyond this limit.
|
| 24 |
+
#
|
| 25 |
+
# Max 30 pods = 6 replicas x 5 deployments (our worst-case budget)
|
| 26 |
+
# Requests capped at 8 CPU / 8Gi RAM (limits at 15 CPU / 15Gi) = enough for 30 small nginx pods
|
| 27 |
+
apiVersion: v1
|
| 28 |
+
kind: ResourceQuota
|
| 29 |
+
metadata:
|
| 30 |
+
name: prod-sre-quota
|
| 31 |
+
namespace: prod-sre
|
| 32 |
+
spec:
|
| 33 |
+
hard:
|
| 34 |
+
pods: "30"
|
| 35 |
+
requests.cpu: "8"
|
| 36 |
+
requests.memory: 8Gi
|
| 37 |
+
limits.cpu: "15"
|
| 38 |
+
limits.memory: 15Gi
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
# payments — node-0 (VIP)
|
| 42 |
+
# Business-critical payment service. Always has 2 replicas for redundancy.
|
| 43 |
+
# The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
|
| 44 |
+
apiVersion: apps/v1
|
| 45 |
+
kind: Deployment
|
| 46 |
+
metadata:
|
| 47 |
+
name: payments
|
| 48 |
+
namespace: prod-sre
|
| 49 |
+
labels:
|
| 50 |
+
app: payments
|
| 51 |
+
node-id: node-0
|
| 52 |
+
critical: "true"
|
| 53 |
+
spec:
|
| 54 |
+
replicas: 2
|
| 55 |
+
selector:
|
| 56 |
+
matchLabels:
|
| 57 |
+
app: payments
|
| 58 |
+
template:
|
| 59 |
+
metadata:
|
| 60 |
+
labels:
|
| 61 |
+
app: payments
|
| 62 |
+
node-id: node-0
|
| 63 |
+
annotations:
|
| 64 |
+
prometheus.io/scrape: "true"
|
| 65 |
+
prometheus.io/port: "8080"
|
| 66 |
+
prometheus.io/path: "/metrics"
|
| 67 |
+
spec:
|
| 68 |
+
containers:
|
| 69 |
+
- name: payments
|
| 70 |
+
image: nginx:alpine
|
| 71 |
+
ports:
|
| 72 |
+
- containerPort: 80
|
| 73 |
+
resources:
|
| 74 |
+
requests:
|
| 75 |
+
cpu: 100m
|
| 76 |
+
memory: 64Mi
|
| 77 |
+
limits:
|
| 78 |
+
cpu: 250m
|
| 79 |
+
memory: 128Mi
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
apiVersion: v1
|
| 83 |
+
kind: Service
|
| 84 |
+
metadata:
|
| 85 |
+
name: payments
|
| 86 |
+
namespace: prod-sre
|
| 87 |
+
spec:
|
| 88 |
+
selector:
|
| 89 |
+
app: payments
|
| 90 |
+
ports:
|
| 91 |
+
- port: 80
|
| 92 |
+
targetPort: 80
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
# checkout — node-1
|
| 96 |
+
apiVersion: apps/v1
|
| 97 |
+
kind: Deployment
|
| 98 |
+
metadata:
|
| 99 |
+
name: checkout
|
| 100 |
+
namespace: prod-sre
|
| 101 |
+
labels:
|
| 102 |
+
app: checkout
|
| 103 |
+
node-id: node-1
|
| 104 |
+
spec:
|
| 105 |
+
replicas: 1
|
| 106 |
+
selector:
|
| 107 |
+
matchLabels:
|
| 108 |
+
app: checkout
|
| 109 |
+
template:
|
| 110 |
+
metadata:
|
| 111 |
+
labels:
|
| 112 |
+
app: checkout
|
| 113 |
+
node-id: node-1
|
| 114 |
+
annotations:
|
| 115 |
+
prometheus.io/scrape: "true"
|
| 116 |
+
prometheus.io/port: "8080"
|
| 117 |
+
prometheus.io/path: "/metrics"
|
| 118 |
+
spec:
|
| 119 |
+
containers:
|
| 120 |
+
- name: checkout
|
| 121 |
+
image: nginx:alpine
|
| 122 |
+
ports:
|
| 123 |
+
- containerPort: 80
|
| 124 |
+
resources:
|
| 125 |
+
requests:
|
| 126 |
+
cpu: 100m
|
| 127 |
+
memory: 64Mi
|
| 128 |
+
limits:
|
| 129 |
+
cpu: 250m
|
| 130 |
+
memory: 128Mi
|
| 131 |
+
|
| 132 |
+
---
|
| 133 |
+
apiVersion: v1
|
| 134 |
+
kind: Service
|
| 135 |
+
metadata:
|
| 136 |
+
name: checkout
|
| 137 |
+
namespace: prod-sre
|
| 138 |
+
spec:
|
| 139 |
+
selector:
|
| 140 |
+
app: checkout
|
| 141 |
+
ports:
|
| 142 |
+
- port: 80
|
| 143 |
+
targetPort: 80
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
# catalog — node-2
|
| 147 |
+
apiVersion: apps/v1
|
| 148 |
+
kind: Deployment
|
| 149 |
+
metadata:
|
| 150 |
+
name: catalog
|
| 151 |
+
namespace: prod-sre
|
| 152 |
+
labels:
|
| 153 |
+
app: catalog
|
| 154 |
+
node-id: node-2
|
| 155 |
+
spec:
|
| 156 |
+
replicas: 1
|
| 157 |
+
selector:
|
| 158 |
+
matchLabels:
|
| 159 |
+
app: catalog
|
| 160 |
+
template:
|
| 161 |
+
metadata:
|
| 162 |
+
labels:
|
| 163 |
+
app: catalog
|
| 164 |
+
node-id: node-2
|
| 165 |
+
annotations:
|
| 166 |
+
prometheus.io/scrape: "true"
|
| 167 |
+
prometheus.io/port: "8080"
|
| 168 |
+
prometheus.io/path: "/metrics"
|
| 169 |
+
spec:
|
| 170 |
+
containers:
|
| 171 |
+
- name: catalog
|
| 172 |
+
image: nginx:alpine
|
| 173 |
+
ports:
|
| 174 |
+
- containerPort: 80
|
| 175 |
+
resources:
|
| 176 |
+
requests:
|
| 177 |
+
cpu: 100m
|
| 178 |
+
memory: 64Mi
|
| 179 |
+
limits:
|
| 180 |
+
cpu: 250m
|
| 181 |
+
memory: 128Mi
|
| 182 |
+
|
| 183 |
+
---
|
| 184 |
+
apiVersion: v1
|
| 185 |
+
kind: Service
|
| 186 |
+
metadata:
|
| 187 |
+
name: catalog
|
| 188 |
+
namespace: prod-sre
|
| 189 |
+
spec:
|
| 190 |
+
selector:
|
| 191 |
+
app: catalog
|
| 192 |
+
ports:
|
| 193 |
+
- port: 80
|
| 194 |
+
targetPort: 80
|
| 195 |
+
|
| 196 |
+
---
|
| 197 |
+
# cart — node-3
|
| 198 |
+
apiVersion: apps/v1
|
| 199 |
+
kind: Deployment
|
| 200 |
+
metadata:
|
| 201 |
+
name: cart
|
| 202 |
+
namespace: prod-sre
|
| 203 |
+
labels:
|
| 204 |
+
app: cart
|
| 205 |
+
node-id: node-3
|
| 206 |
+
spec:
|
| 207 |
+
replicas: 1
|
| 208 |
+
selector:
|
| 209 |
+
matchLabels:
|
| 210 |
+
app: cart
|
| 211 |
+
template:
|
| 212 |
+
metadata:
|
| 213 |
+
labels:
|
| 214 |
+
app: cart
|
| 215 |
+
node-id: node-3
|
| 216 |
+
annotations:
|
| 217 |
+
prometheus.io/scrape: "true"
|
| 218 |
+
prometheus.io/port: "8080"
|
| 219 |
+
prometheus.io/path: "/metrics"
|
| 220 |
+
spec:
|
| 221 |
+
containers:
|
| 222 |
+
- name: cart
|
| 223 |
+
image: nginx:alpine
|
| 224 |
+
ports:
|
| 225 |
+
- containerPort: 80
|
| 226 |
+
resources:
|
| 227 |
+
requests:
|
| 228 |
+
cpu: 100m
|
| 229 |
+
memory: 64Mi
|
| 230 |
+
limits:
|
| 231 |
+
cpu: 250m
|
| 232 |
+
memory: 128Mi
|
| 233 |
+
|
| 234 |
+
---
|
| 235 |
+
apiVersion: v1
|
| 236 |
+
kind: Service
|
| 237 |
+
metadata:
|
| 238 |
+
name: cart
|
| 239 |
+
namespace: prod-sre
|
| 240 |
+
spec:
|
| 241 |
+
selector:
|
| 242 |
+
app: cart
|
| 243 |
+
ports:
|
| 244 |
+
- port: 80
|
| 245 |
+
targetPort: 80
|
| 246 |
+
|
| 247 |
+
---
|
| 248 |
+
# auth — node-4
|
| 249 |
+
apiVersion: apps/v1
|
| 250 |
+
kind: Deployment
|
| 251 |
+
metadata:
|
| 252 |
+
name: auth
|
| 253 |
+
namespace: prod-sre
|
| 254 |
+
labels:
|
| 255 |
+
app: auth
|
| 256 |
+
node-id: node-4
|
| 257 |
+
spec:
|
| 258 |
+
replicas: 1
|
| 259 |
+
selector:
|
| 260 |
+
matchLabels:
|
| 261 |
+
app: auth
|
| 262 |
+
template:
|
| 263 |
+
metadata:
|
| 264 |
+
labels:
|
| 265 |
+
app: auth
|
| 266 |
+
node-id: node-4
|
| 267 |
+
annotations:
|
| 268 |
+
prometheus.io/scrape: "true"
|
| 269 |
+
prometheus.io/port: "8080"
|
| 270 |
+
prometheus.io/path: "/metrics"
|
| 271 |
+
spec:
|
| 272 |
+
containers:
|
| 273 |
+
- name: auth
|
| 274 |
+
image: nginx:alpine
|
| 275 |
+
ports:
|
| 276 |
+
- containerPort: 80
|
| 277 |
+
resources:
|
| 278 |
+
requests:
|
| 279 |
+
cpu: 100m
|
| 280 |
+
memory: 64Mi
|
| 281 |
+
limits:
|
| 282 |
+
cpu: 250m
|
| 283 |
+
memory: 128Mi
|
| 284 |
+
|
| 285 |
+
---
|
| 286 |
+
apiVersion: v1
|
| 287 |
+
kind: Service
|
| 288 |
+
metadata:
|
| 289 |
+
name: auth
|
| 290 |
+
namespace: prod-sre
|
| 291 |
+
spec:
|
| 292 |
+
selector:
|
| 293 |
+
app: auth
|
| 294 |
+
ports:
|
| 295 |
+
- port: 80
|
| 296 |
+
targetPort: 80
|
deploy/aws/kubeconfig-antiatropos.yaml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: v1
|
| 2 |
+
kind: Config
|
| 3 |
+
clusters:
|
| 4 |
+
- cluster:
|
| 5 |
+
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJQk5ZY1JYcVZ2dm93RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TmpBME1qTXhPREEwTURGYUZ3MHpOakEwTWpBeE9EQTVNREZhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUUN2cHYwRVRIREIxeVRjVVFxa21Xd2Z2YnE0Z3d1bm9HK0w0MkIvaUV0N3h1NVhTMjZQWVlwNURGckYKUTJoUTRndDlENDUwNXlHNkN0eCtWVXBncExpeUxEU3pMdEM2VHUrUm5uSEY0NHRHZ1NJQm9GaG9TaXhzWFV3SQoxU3E1NVBIeHhPQmo3OGJxRFVxL2R3eE1xOVk1TzBINmkwV1ZaZHMvTmhaMk9rd1dJeUJnYy9Rckhpb2ZJZm1qCkVhZ0psRm9Sb1c2L2RjajBiOThOMi9zaWt1blRhQldJSGpPay9ESkNiWldzU0JtOTBBY0V3dEdnN1Bhc1hOcUsKaWwydWxlMG9PYk9zTyszbDhpeU9nYktROHFDbFgwSU03UVN2Y1J4YnYwK2FCYXpxVS9BRkhMY1VmTW1VMXVKRwpLdGVuTUxzNnBLdlpyRU9EOFlacklkYmkyZDBaQWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJTVm43TWdjYkhCNE9wNFc0WEhLYlNPeWdBdDREQVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQSsxc0Rjc1RJcQp0T3V4Nk5OMkUrTFlYOFEvTk9qWlhSQVhSeDlOdXhoL0RCRmJwTjUrTzg2VWROL3BJamI0WGUyTVRGaytCTXZnCmUyWk9NNGJFQTlLR3JPc1RhK3VBL3pKZFhjUXZ0MG00Kzd5T3VqcklHOGhuOTlZSjRlTmxYYk9nV3NOTmVDMnEKT01DVFFPdGtJNVlMNFNET2ZDRUlsOEpBU0QvZTNRd0p6Mk15bnNIR2F4azZYZ3VnVkgzekVQcVNRL3FZa2pQTgpDY0ZMNXF1WWVUODUzM3g0SENKb1dmblZReHlaOVJ2V1Y0eThpT3JqbTV3Z2xvN2U3NkRmaTBwTnczRS80MysxCisrdXdWYmhZZTE0OUhyK3FzWU1YbGFiTFJmeHhXT2RxdzMxbXdJeitSSHF5V2U4V3prZnhUVGlmQjZNVVJyQXgKRWVKQWkwdWMxSkRMCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
|
| 6 |
+
server: https://D3CBAF956940D075AE61BB6193A93256.gr7.ap-south-1.eks.amazonaws.com
|
| 7 |
+
name: antiatropos
|
| 8 |
+
contexts:
|
| 9 |
+
- context:
|
| 10 |
+
cluster: antiatropos
|
| 11 |
+
user: antiatropos-hf-user
|
| 12 |
+
name: antiatropos
|
| 13 |
+
current-context: antiatropos
|
| 14 |
+
preferences: {}
|
| 15 |
+
users:
|
| 16 |
+
- name: antiatropos-hf-user
|
| 17 |
+
user:
|
| 18 |
+
exec:
|
| 19 |
+
apiVersion: client.authentication.k8s.io/v1beta1
|
| 20 |
+
command: aws
|
| 21 |
+
args:
|
| 22 |
+
- eks
|
| 23 |
+
- get-token
|
| 24 |
+
- --region
|
| 25 |
+
- ap-south-1
|
| 26 |
+
- --cluster-name
|
| 27 |
+
- antiatropos
|
| 28 |
+
env:
|
| 29 |
+
- name: AWS_STS_REGIONAL_ENDPOINTS
|
| 30 |
+
value: regional
|
| 31 |
+
- name: AWS_DEFAULT_REGION
|
| 32 |
+
value: ap-south-1
|
| 33 |
+
interactiveMode: IfAvailable
|
| 34 |
+
|
deploy/aws/prometheus-agent-values.yaml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Helm values for Prometheus Agent that remote-writes to Amazon Managed Prometheus
|
| 2 |
+
#
|
| 3 |
+
# Usage:
|
| 4 |
+
# helm install prometheus-agent prometheus-community/prometheus \
|
| 5 |
+
# --namespace monitoring --create-namespace \
|
| 6 |
+
# -f prometheus-agent-values.yaml \
|
| 7 |
+
# --set prometheus.prometheusSpec.remoteWrite[0].url="https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
|
| 8 |
+
#
|
| 9 |
+
# Prerequisite: Create an IAM service account for the prometheus pod
|
| 10 |
+
# eksctl create iamserviceaccount \
|
| 11 |
+
# --cluster antiatropos \
|
| 12 |
+
# --namespace monitoring \
|
| 13 |
+
# --name prometheus-sa \
|
| 14 |
+
# --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
|
| 15 |
+
# --approve
|
| 16 |
+
|
| 17 |
+
prometheus:
|
| 18 |
+
prometheusSpec:
|
| 19 |
+
# Run as agent mode (remote-write only, no local query API)
|
| 20 |
+
agentMode: true
|
| 21 |
+
|
| 22 |
+
# Remote write — override via --set on the command line
|
| 23 |
+
remoteWrite:
|
| 24 |
+
- url: "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/REPLACE_WORKSPACE_ID/api/v1/remote_write"
|
| 25 |
+
sigv4:
|
| 26 |
+
region: ap-south-1
|
| 27 |
+
|
| 28 |
+
# Scrape the workload pods in prod-sre namespace (the microservices
|
| 29 |
+
# the SRE agent manages: payments, checkout, catalog, cart, auth)
|
| 30 |
+
additionalScrapeConfigs:
|
| 31 |
+
- job_name: antiatropos-workloads
|
| 32 |
+
metrics_path: /metrics
|
| 33 |
+
scrape_interval: 15s
|
| 34 |
+
kubernetes_sd_configs:
|
| 35 |
+
- role: pod
|
| 36 |
+
namespaces:
|
| 37 |
+
names:
|
| 38 |
+
- prod-sre
|
| 39 |
+
relabel_configs:
|
| 40 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
| 41 |
+
action: keep
|
| 42 |
+
regex: true
|
| 43 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
| 44 |
+
action: replace
|
| 45 |
+
target_label: __metrics_path__
|
| 46 |
+
regex: (.+)
|
| 47 |
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
| 48 |
+
action: replace
|
| 49 |
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 50 |
+
replacement: $1:$2
|
| 51 |
+
target_label: __address__
|
| 52 |
+
- action: labelmap
|
| 53 |
+
regex: __meta_kubernetes_pod_label_(.+)
|
| 54 |
+
- source_labels: [__meta_kubernetes_namespace]
|
| 55 |
+
action: replace
|
| 56 |
+
target_label: namespace
|
| 57 |
+
- source_labels: [__meta_kubernetes_pod_name]
|
| 58 |
+
action: replace
|
| 59 |
+
target_label: pod
|
| 60 |
+
|
| 61 |
+
# Also scrape the Prometheus Agent's own metrics for monitoring
|
| 62 |
+
- job_name: prometheus-agent-self
|
| 63 |
+
scrape_interval: 15s
|
| 64 |
+
static_configs:
|
| 65 |
+
- targets:
|
| 66 |
+
- localhost:9090
|
| 67 |
+
|
| 68 |
+
resources:
|
| 69 |
+
requests:
|
| 70 |
+
cpu: 100m
|
| 71 |
+
memory: 256Mi
|
| 72 |
+
limits:
|
| 73 |
+
cpu: 500m
|
| 74 |
+
memory: 512Mi
|
| 75 |
+
|
| 76 |
+
# Short retention since we're remote-writing everything to AMP
|
| 77 |
+
retention: 2h
|
| 78 |
+
|
| 79 |
+
# Use the IAM service account for AMP authentication
|
| 80 |
+
serviceAccount:
|
| 81 |
+
name: prometheus-sa
|
| 82 |
+
create: false
|
| 83 |
+
|
| 84 |
+
# Disable alertmanager (AMP handles alerting if needed)
|
| 85 |
+
alertmanager:
|
| 86 |
+
enabled: false
|
| 87 |
+
|
| 88 |
+
# Disable pushgateway
|
| 89 |
+
pushgateway:
|
| 90 |
+
enabled: false
|
| 91 |
+
|
| 92 |
+
# Disable server (we only need the agent)
|
| 93 |
+
server:
|
| 94 |
+
enabled: false
|
| 95 |
+
|
deploy/aws/teardown-all.ps1
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AntiAtropos - One-Run Teardown Script
|
| 2 |
+
# Deletes entire AWS infrastructure: EKS cluster, AMP workspace
|
| 3 |
+
#
|
| 4 |
+
# Usage: .\deploy\aws\teardown-all.ps1
|
| 5 |
+
|
| 6 |
+
$ErrorActionPreference = "Stop"
|
| 7 |
+
|
| 8 |
+
# In PowerShell 7+, prevent native stderr output from becoming terminating errors.
|
| 9 |
+
if (Get-Variable -Name PSNativeCommandUseErrorActionPreference -ErrorAction SilentlyContinue) {
|
| 10 |
+
$PSNativeCommandUseErrorActionPreference = $false
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
$Region = "ap-south-1"
|
| 14 |
+
$ClusterName = "antiatropos"
|
| 15 |
+
$AmpAlias = "antiatropos-metrics"
|
| 16 |
+
$GeneratedKubeconfig = Join-Path $PSScriptRoot "kubeconfig-antiatropos.yaml"
|
| 17 |
+
|
| 18 |
+
function Invoke-CheckedCommand {
|
| 19 |
+
param(
|
| 20 |
+
[ScriptBlock]$Command,
|
| 21 |
+
[string]$ErrorMessage
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
$previousErrorActionPreference = $ErrorActionPreference
|
| 25 |
+
$ErrorActionPreference = "Continue"
|
| 26 |
+
try {
|
| 27 |
+
& $Command
|
| 28 |
+
} finally {
|
| 29 |
+
$ErrorActionPreference = $previousErrorActionPreference
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
if ($LASTEXITCODE -ne 0) {
|
| 33 |
+
throw $ErrorMessage
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
function Get-EksClusterStatus {
|
| 38 |
+
param(
|
| 39 |
+
[string]$Name,
|
| 40 |
+
[string]$AwsRegion
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
try {
|
| 44 |
+
$status = aws eks describe-cluster --name $Name --region $AwsRegion --query 'cluster.status' --output text 2>$null
|
| 45 |
+
} catch {
|
| 46 |
+
return $null
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($status) -or $status -eq "None") {
|
| 50 |
+
return $null
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
return $status.Trim()
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
function Get-EksNodegroups {
|
| 57 |
+
param(
|
| 58 |
+
[string]$Name,
|
| 59 |
+
[string]$AwsRegion
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
try {
|
| 63 |
+
$raw = aws eks list-nodegroups --cluster-name $Name --region $AwsRegion --query 'nodegroups' --output text 2>$null
|
| 64 |
+
} catch {
|
| 65 |
+
return @()
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($raw) -or $raw -eq "None") {
|
| 69 |
+
return @()
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
return @($raw -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
function Remove-ResidualEksStacks {
|
| 76 |
+
param(
|
| 77 |
+
[string]$Cluster,
|
| 78 |
+
[string]$AwsRegion
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
$stackPrefix = "eksctl-$Cluster"
|
| 82 |
+
$stackQuery = "StackSummaries[?starts_with(StackName, '$stackPrefix') && (StackStatus!='DELETE_COMPLETE' && StackStatus!='DELETE_IN_PROGRESS')].StackName"
|
| 83 |
+
|
| 84 |
+
$stacksText = aws cloudformation list-stacks --region $AwsRegion --query $stackQuery --output text 2>$null
|
| 85 |
+
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($stacksText) -or $stacksText -eq "None") {
|
| 86 |
+
return
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
$stacks = @($stacksText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
|
| 90 |
+
foreach ($stack in $stacks) {
|
| 91 |
+
Write-Host "Deleting residual stack: $stack" -ForegroundColor Yellow
|
| 92 |
+
Invoke-CheckedCommand -Command { aws cloudformation delete-stack --stack-name $stack --region $AwsRegion 2>$null | Out-Null } -ErrorMessage "Failed to delete stack '$stack'"
|
| 93 |
+
Invoke-CheckedCommand -Command { aws cloudformation wait stack-delete-complete --stack-name $stack --region $AwsRegion } -ErrorMessage "Timed out deleting stack '$stack'"
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
function Get-AmpWorkspaceIdByAlias {
|
| 98 |
+
param(
|
| 99 |
+
[string]$Alias,
|
| 100 |
+
[string]$AwsRegion
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
try {
|
| 104 |
+
$id = aws amp list-workspaces --alias $Alias --region $AwsRegion --query 'workspaces[0].workspaceId' --output text 2>$null
|
| 105 |
+
} catch {
|
| 106 |
+
return $null
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($id) -or $id -eq "None") {
|
| 110 |
+
return $null
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
return $id.Trim()
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
function Wait-AmpWorkspaceDeleted {
|
| 117 |
+
param(
|
| 118 |
+
[string]$WorkspaceId,
|
| 119 |
+
[string]$AwsRegion
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
for ($i = 0; $i -lt 30; $i++) {
|
| 123 |
+
try {
|
| 124 |
+
$status = aws amp describe-workspace --workspace-id $WorkspaceId --region $AwsRegion --query 'workspace.status.statusCode' --output text 2>$null
|
| 125 |
+
} catch {
|
| 126 |
+
return
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($status) -or $status -eq "None") {
|
| 130 |
+
return
|
| 131 |
+
}
|
| 132 |
+
Start-Sleep -Seconds 10
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
throw "AMP workspace '$WorkspaceId' deletion timed out"
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
Write-Host ""
|
| 139 |
+
Write-Host "==========================================" -ForegroundColor Red
|
| 140 |
+
Write-Host " AntiAtropos AWS Infrastructure Teardown" -ForegroundColor Red
|
| 141 |
+
Write-Host "==========================================" -ForegroundColor Red
|
| 142 |
+
Write-Host "Region: $Region"
|
| 143 |
+
Write-Host "Cluster: $ClusterName"
|
| 144 |
+
Write-Host ""
|
| 145 |
+
|
| 146 |
+
# --- Step 1: Delete EKS Cluster ---
|
| 147 |
+
Write-Host ">>> Step 1: Deleting EKS cluster..." -ForegroundColor Yellow
|
| 148 |
+
|
| 149 |
+
$clusterStatus = Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region
|
| 150 |
+
if ($clusterStatus) {
|
| 151 |
+
Write-Host "Cluster status: $clusterStatus" -ForegroundColor Yellow
|
| 152 |
+
|
| 153 |
+
if ($clusterStatus -ne "DELETING") {
|
| 154 |
+
$nodegroups = Get-EksNodegroups -Name $ClusterName -AwsRegion $Region
|
| 155 |
+
foreach ($ng in $nodegroups) {
|
| 156 |
+
Write-Host "Deleting nodegroup: $ng" -ForegroundColor Yellow
|
| 157 |
+
$ngStatus = aws eks describe-nodegroup --cluster-name $ClusterName --nodegroup-name $ng --region $Region --query 'nodegroup.status' --output text 2>$null
|
| 158 |
+
if ($LASTEXITCODE -eq 0 -and $ngStatus -ne "DELETING") {
|
| 159 |
+
Invoke-CheckedCommand -Command { aws eks delete-nodegroup --cluster-name $ClusterName --nodegroup-name $ng --region $Region --output text 2>$null | Out-Null } -ErrorMessage "Failed to start deletion for nodegroup '$ng'"
|
| 160 |
+
} else {
|
| 161 |
+
Write-Host "Nodegroup '$ng' already deleting" -ForegroundColor Yellow
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
Write-Host "Waiting for nodegroup deletion: $ng" -ForegroundColor Yellow
|
| 165 |
+
Invoke-CheckedCommand -Command { aws eks wait nodegroup-deleted --cluster-name $ClusterName --nodegroup-name $ng --region $Region } -ErrorMessage "Timed out waiting for nodegroup '$ng' deletion"
|
| 166 |
+
Write-Host "OK: Nodegroup deleted: $ng" -ForegroundColor Green
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
Write-Host "Deleting cluster control plane..." -ForegroundColor Yellow
|
| 170 |
+
Invoke-CheckedCommand -Command { eksctl delete cluster --name $ClusterName --region $Region 2>$null | Out-Null } -ErrorMessage "Failed to delete EKS cluster"
|
| 171 |
+
} else {
|
| 172 |
+
Write-Host "Cluster is already deleting" -ForegroundColor Yellow
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
Write-Host "Waiting for cluster deletion..." -ForegroundColor Yellow
|
| 176 |
+
Invoke-CheckedCommand -Command { aws eks wait cluster-deleted --name $ClusterName --region $Region } -ErrorMessage "Timed out waiting for EKS cluster deletion"
|
| 177 |
+
Write-Host "OK: Cluster deleted" -ForegroundColor Green
|
| 178 |
+
} else {
|
| 179 |
+
Write-Host "OK: Cluster not found, skipping" -ForegroundColor Green
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
Write-Host "Checking for residual eksctl stacks..." -ForegroundColor Yellow
|
| 183 |
+
Remove-ResidualEksStacks -Cluster $ClusterName -AwsRegion $Region
|
| 184 |
+
Write-Host "OK: Residual EKS stacks cleaned" -ForegroundColor Green
|
| 185 |
+
|
| 186 |
+
# --- Step 2: Delete AMP Workspace ---
|
| 187 |
+
Write-Host ""
|
| 188 |
+
Write-Host ">>> Step 2: Deleting AMP workspace..." -ForegroundColor Yellow
|
| 189 |
+
|
| 190 |
+
$AmpWsId = Get-AmpWorkspaceIdByAlias -Alias $AmpAlias -AwsRegion $Region
|
| 191 |
+
|
| 192 |
+
if (-not [string]::IsNullOrWhiteSpace($AmpWsId)) {
|
| 193 |
+
Invoke-CheckedCommand -Command { aws amp delete-workspace --workspace-id $AmpWsId --region $Region | Out-Null } -ErrorMessage "Failed to delete AMP workspace '$AmpWsId'"
|
| 194 |
+
Wait-AmpWorkspaceDeleted -WorkspaceId $AmpWsId -AwsRegion $Region
|
| 195 |
+
Write-Host "OK: AMP workspace deleted: $AmpWsId" -ForegroundColor Green
|
| 196 |
+
} else {
|
| 197 |
+
Write-Host "OK: AMP workspace not found, skipping" -ForegroundColor Green
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
# --- Step 3: Local kubeconfig cleanup ---
|
| 201 |
+
Write-Host ""
|
| 202 |
+
Write-Host ">>> Step 3: Cleaning local kubeconfig entries..." -ForegroundColor Yellow
|
| 203 |
+
|
| 204 |
+
try { kubectl config delete-context $ClusterName 2>$null | Out-Null } catch {}
|
| 205 |
+
try { kubectl config delete-cluster $ClusterName 2>$null | Out-Null } catch {}
|
| 206 |
+
try { kubectl config delete-user antiatropos-hf-user 2>$null | Out-Null } catch {}
|
| 207 |
+
|
| 208 |
+
if (Test-Path $GeneratedKubeconfig) {
|
| 209 |
+
Remove-Item $GeneratedKubeconfig -Force
|
| 210 |
+
Write-Host "OK: Removed generated kubeconfig file" -ForegroundColor Green
|
| 211 |
+
} else {
|
| 212 |
+
Write-Host "OK: Generated kubeconfig file not found, skipping" -ForegroundColor Green
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
# --- Step 4: Verify Cleanup ---
|
| 216 |
+
Write-Host ""
|
| 217 |
+
Write-Host ">>> Step 4: Verifying cleanup..." -ForegroundColor Yellow
|
| 218 |
+
|
| 219 |
+
$clusterStillExists = [bool](Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region)
|
| 220 |
+
|
| 221 |
+
if ($clusterStillExists) {
|
| 222 |
+
Write-Host "WARN: Cluster still exists (deletion in progress)" -ForegroundColor Yellow
|
| 223 |
+
} else {
|
| 224 |
+
Write-Host "OK: Cluster deleted" -ForegroundColor Green
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
$ampStillExists = -not [string]::IsNullOrWhiteSpace((Get-AmpWorkspaceIdByAlias -Alias $AmpAlias -AwsRegion $Region))
|
| 228 |
+
|
| 229 |
+
if ($ampStillExists) {
|
| 230 |
+
Write-Host "WARN: AMP workspace alias '$AmpAlias' still exists" -ForegroundColor Yellow
|
| 231 |
+
} else {
|
| 232 |
+
Write-Host "OK: AMP workspace deleted" -ForegroundColor Green
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
# --- Done ---
|
| 236 |
+
Write-Host ""
|
| 237 |
+
Write-Host "==========================================" -ForegroundColor Green
|
| 238 |
+
Write-Host " Teardown Complete!" -ForegroundColor Green
|
| 239 |
+
Write-Host "==========================================" -ForegroundColor Green
|
| 240 |
+
Write-Host ""
|
| 241 |
+
Write-Host "All AWS infrastructure has been removed." -ForegroundColor Yellow
|
| 242 |
+
Write-Host ""
|
deploy/do/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DigitalOcean Droplet one-shot deploy
|
| 2 |
+
|
| 3 |
+
This deploy flow is for a single Ubuntu Droplet running:
|
| 4 |
+
- k3s (single-node Kubernetes)
|
| 5 |
+
- AntiAtropos sample workloads (`prod-sre`)
|
| 6 |
+
- Prometheus + Grafana (`monitoring`)
|
| 7 |
+
- lightweight control-plane API (`antiatropos-control` on port `8010`)
|
| 8 |
+
|
| 9 |
+
The OpenEnv runtime (`server.app`) is intentionally **not** run on the droplet.
|
| 10 |
+
The only supported split is:
|
| 11 |
+
- local machine: OpenEnv server + inference loop
|
| 12 |
+
- droplet: Kubernetes executor API + observability stack
|
| 13 |
+
|
| 14 |
+
## Run
|
| 15 |
+
|
| 16 |
+
From repository root on the Droplet:
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
sudo bash deploy/do/deploy-droplet-one-shot.sh
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Optional overrides:
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
sudo REPO_DIR=/opt/AntiAtropos CONTROL_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## What the script configures
|
| 29 |
+
|
| 30 |
+
- k3s kubelet with `max-pods=250`
|
| 31 |
+
- Prometheus service exposed on NodePort `30090`
|
| 32 |
+
- Prometheus scrape job for annotated pods in namespace `prod-sre`
|
| 33 |
+
- Env file at `.env.droplet` with:
|
| 34 |
+
- `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
|
| 35 |
+
- `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
|
| 36 |
+
- Systemd service:
|
| 37 |
+
- Name: `antiatropos-control`
|
| 38 |
+
- Exec: `uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010`
|
| 39 |
+
- Legacy cleanup:
|
| 40 |
+
- `antiatropos-fastapi` (VM OpenEnv service) is disabled and removed by the default deploy path
|
| 41 |
+
|
| 42 |
+
## Verify
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
systemctl status antiatropos-control --no-pager
|
| 46 |
+
curl http://127.0.0.1:8010/health
|
| 47 |
+
kubectl get deploy -n prod-sre
|
| 48 |
+
kubectl get pods -n monitoring
|
| 49 |
+
curl http://127.0.0.1:30090/api/v1/targets
|
| 50 |
+
kubectl -n monitoring port-forward svc/grafana 3000:80
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
Set local `.env` to use this consolidated path:
|
| 54 |
+
|
| 55 |
+
```env
|
| 56 |
+
ENV_URL=http://localhost:8000
|
| 57 |
+
ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
|
| 58 |
+
PROMETHEUS_URL=http://<droplet-ip>:30090
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Deterministic remote-scaling proof
|
| 62 |
+
|
| 63 |
+
On the droplet, watch the desired replica counts:
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas'
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
From local machine, send one control action:
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
curl -X POST http://<droplet-ip>:8010/step \
|
| 73 |
+
-H "Content-Type: application/json" \
|
| 74 |
+
-d '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
If the desired replica count for `payments` increases, scaling is happening on the droplet.
|
| 78 |
+
|
| 79 |
+
## Troubleshooting
|
| 80 |
+
|
| 81 |
+
- **Pods do not move during inference**
|
| 82 |
+
- Verify local env points to droplet control API:
|
| 83 |
+
- `ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010`
|
| 84 |
+
- Check droplet control health:
|
| 85 |
+
- `curl http://127.0.0.1:8010/health`
|
| 86 |
+
- Check service status:
|
| 87 |
+
- `systemctl status antiatropos-control --no-pager`
|
| 88 |
+
- **Connection refused from local to droplet:8010**
|
| 89 |
+
- Service not running or firewall closed.
|
| 90 |
+
- Start service and open firewall if needed.
|
| 91 |
+
- **Need to remove legacy VM OpenEnv service**
|
| 92 |
+
- `sudo bash deploy/do/uninstall-legacy-openenv.sh`
|
deploy/do/antiatropos-control.service
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Unit]
|
| 2 |
+
Description=AntiAtropos Droplet Control API
|
| 3 |
+
After=network-online.target k3s.service
|
| 4 |
+
Wants=network-online.target
|
| 5 |
+
|
| 6 |
+
[Service]
|
| 7 |
+
Type=simple
|
| 8 |
+
User=root
|
| 9 |
+
WorkingDirectory=/root/Anti-Atropos
|
| 10 |
+
EnvironmentFile=/root/Anti-Atropos/.env.droplet
|
| 11 |
+
ExecStart=/root/Anti-Atropos/.venv-droplet/bin/uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
|
| 12 |
+
Restart=always
|
| 13 |
+
RestartSec=3
|
| 14 |
+
|
| 15 |
+
[Install]
|
| 16 |
+
WantedBy=multi-user.target
|
deploy/do/deploy-droplet-one-shot.sh
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# One-shot deploy for a single DigitalOcean Droplet:
|
| 5 |
+
# - Installs k3s with kubelet max-pods=250
|
| 6 |
+
# - Deploys workloads + Prometheus + Grafana
|
| 7 |
+
# - Creates env file for live Kubernetes scaling
|
| 8 |
+
# - Starts lightweight control-plane API via systemd (antiatropos-control)
|
| 9 |
+
|
| 10 |
+
if [[ "${EUID}" -ne 0 ]]; then
|
| 11 |
+
echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
|
| 12 |
+
exit 1
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
|
| 16 |
+
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
|
| 17 |
+
CONTROL_PORT="${CONTROL_PORT:-8010}"
|
| 18 |
+
CONTROL_HOST="${CONTROL_HOST:-0.0.0.0}"
|
| 19 |
+
K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
|
| 20 |
+
MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
|
| 21 |
+
PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
|
| 22 |
+
ENV_FILE="${ENV_FILE:-${REPO_DIR}/.env.droplet}"
|
| 23 |
+
MIN_REPLICAS="${MIN_REPLICAS:-1}"
|
| 24 |
+
MAX_REPLICAS="${MAX_REPLICAS:-250}"
|
| 25 |
+
SCALE_STEP="${SCALE_STEP:-3}"
|
| 26 |
+
# Default node-id -> deployment mapping handed to the control API as JSON.
# The default lives in its own single-quoted variable on purpose: bash terminates a
# ${VAR:-word} expansion at the FIRST '}', so inlining JSON braces in the default
# truncates it (and appends stray JSON to any caller-supplied WORKLOAD_MAP).
DEFAULT_WORKLOAD_MAP='{"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}'
WORKLOAD_MAP="${WORKLOAD_MAP:-${DEFAULT_WORKLOAD_MAP}}"
|
| 27 |
+
|
| 28 |
+
echo "=== AntiAtropos Droplet One-Shot Deploy ==="
|
| 29 |
+
echo "Repo: ${REPO_DIR}"
|
| 30 |
+
echo "Kubeconfig: ${KUBECONFIG_PATH}"
|
| 31 |
+
echo "Control API: ${CONTROL_HOST}:${CONTROL_PORT}"
|
| 32 |
+
echo ""
|
| 33 |
+
|
| 34 |
+
if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
|
| 35 |
+
echo "ERROR: deploy/local-laptop.yaml not found. Run from AntiAtropos checkout."
|
| 36 |
+
exit 1
|
| 37 |
+
fi
|
| 38 |
+
|
| 39 |
+
export DEBIAN_FRONTEND=noninteractive
|
| 40 |
+
apt-get update
|
| 41 |
+
apt-get install -y curl ca-certificates gnupg lsb-release python3 python3-venv python3-pip
|
| 42 |
+
|
| 43 |
+
if ! command -v kubectl >/dev/null 2>&1; then
|
| 44 |
+
echo "Installing k3s (includes kubectl)..."
|
| 45 |
+
curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg=max-pods=250
|
| 46 |
+
else
|
| 47 |
+
echo "k3s/kubectl already present; skipping k3s install."
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
if ! command -v helm >/dev/null 2>&1; then
|
| 51 |
+
echo "Installing Helm..."
|
| 52 |
+
curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
+
export KUBECONFIG="${KUBECONFIG_PATH}"
|
| 56 |
+
|
| 57 |
+
echo "Waiting for Kubernetes node to be Ready..."
|
| 58 |
+
kubectl wait --for=condition=Ready node --all --timeout=180s
|
| 59 |
+
|
| 60 |
+
kubectl create ns "${K8S_NAMESPACE}" >/dev/null 2>&1 || true
|
| 61 |
+
kubectl create ns "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
|
| 62 |
+
|
| 63 |
+
echo "Deploying AntiAtropos workloads..."
|
| 64 |
+
kubectl apply -f "${REPO_DIR}/deploy/local-laptop.yaml"
|
| 65 |
+
|
| 66 |
+
echo "Installing/upgrading Prometheus + Grafana..."
|
| 67 |
+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
|
| 68 |
+
helm repo add grafana https://grafana.github.io/helm-charts >/dev/null 2>&1 || true
|
| 69 |
+
helm repo update
|
| 70 |
+
|
| 71 |
+
helm upgrade --install prometheus prometheus-community/prometheus \
|
| 72 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 73 |
+
-f "${REPO_DIR}/deploy/prometheus-helm-values.yaml"
|
| 74 |
+
|
| 75 |
+
if [[ -d "${REPO_DIR}/deploy/grafana/provisioning/dashboards/json" ]]; then
|
| 76 |
+
kubectl delete configmap grafana-dashboards -n "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
|
| 77 |
+
kubectl create configmap grafana-dashboards \
|
| 78 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 79 |
+
--from-file="${REPO_DIR}/deploy/grafana/provisioning/dashboards/json/"
|
| 80 |
+
fi
|
| 81 |
+
|
| 82 |
+
helm upgrade --install grafana grafana/grafana \
|
| 83 |
+
-n "${MONITORING_NAMESPACE}" \
|
| 84 |
+
-f "${REPO_DIR}/deploy/grafana-helm-values.yaml"
|
| 85 |
+
|
| 86 |
+
echo "Exposing Grafana on NodePort 30000..."
|
| 87 |
+
kubectl patch svc grafana -n "${MONITORING_NAMESPACE}" --type='merge' -p '{
|
| 88 |
+
"spec": {
|
| 89 |
+
"type": "NodePort",
|
| 90 |
+
"ports": [
|
| 91 |
+
{"port": 80, "nodePort": 30000, "targetPort": 3000, "name": "service"}
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
}' || true
|
| 95 |
+
|
| 96 |
+
echo "Waiting for Grafana pods to be ready..."
|
| 97 |
+
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=grafana -n "${MONITORING_NAMESPACE}" --timeout=180s || true
|
| 98 |
+
|
| 99 |
+
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 100 |
+
cat > "${ENV_FILE}" <<EOF
|
| 101 |
+
KUBECONFIG=${KUBECONFIG_PATH}
|
| 102 |
+
ANTIATROPOS_K8S_NAMESPACE=prod-sre
|
| 103 |
+
ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
|
| 104 |
+
ANTIATROPOS_MAX_REPLICAS=${MAX_REPLICAS}
|
| 105 |
+
ANTIATROPOS_SCALE_STEP=${SCALE_STEP}
|
| 106 |
+
ANTIATROPOS_WORKLOAD_MAP=${WORKLOAD_MAP}
|
| 107 |
+
EOF
|
| 108 |
+
echo "Created ${ENV_FILE}"
|
| 109 |
+
else
|
| 110 |
+
echo "Using existing ${ENV_FILE}"
|
| 111 |
+
fi
|
| 112 |
+
|
| 113 |
+
echo "Preparing Python environment..."
|
| 114 |
+
python3 -m venv "${PY_VENV_DIR}"
|
| 115 |
+
"${PY_VENV_DIR}/bin/python" -m pip install --upgrade pip
|
| 116 |
+
if [[ -f "${REPO_DIR}/pyproject.toml" ]]; then
|
| 117 |
+
# Prefer project metadata (uses openenv-core, not legacy openenv package name).
|
| 118 |
+
"${PY_VENV_DIR}/bin/pip" install -e "${REPO_DIR}"
|
| 119 |
+
else
|
| 120 |
+
"${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
# Hard cleanup: remove legacy VM OpenEnv service if it exists.
|
| 124 |
+
if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
|
| 125 |
+
echo "Disabling legacy service antiatropos-fastapi..."
|
| 126 |
+
systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
|
| 127 |
+
rm -f /etc/systemd/system/antiatropos-fastapi.service
|
| 128 |
+
fi
|
| 129 |
+
|
| 130 |
+
cat > /etc/systemd/system/antiatropos-control.service <<EOF
|
| 131 |
+
[Unit]
|
| 132 |
+
Description=AntiAtropos Droplet Control API
|
| 133 |
+
After=network-online.target k3s.service
|
| 134 |
+
Wants=network-online.target
|
| 135 |
+
|
| 136 |
+
[Service]
|
| 137 |
+
Type=simple
|
| 138 |
+
User=root
|
| 139 |
+
WorkingDirectory=${REPO_DIR}
|
| 140 |
+
EnvironmentFile=${ENV_FILE}
|
| 141 |
+
ExecStart=${PY_VENV_DIR}/bin/uvicorn server.local_laptop_control:app --host ${CONTROL_HOST} --port ${CONTROL_PORT}
|
| 142 |
+
Restart=always
|
| 143 |
+
RestartSec=3
|
| 144 |
+
|
| 145 |
+
[Install]
|
| 146 |
+
WantedBy=multi-user.target
|
| 147 |
+
EOF
|
| 148 |
+
|
| 149 |
+
systemctl daemon-reload
|
| 150 |
+
systemctl enable --now antiatropos-control
|
| 151 |
+
|
| 152 |
+
echo ""
|
| 153 |
+
echo "Waiting for control API readiness..."
|
| 154 |
+
for _ in {1..30}; do
|
| 155 |
+
if curl -fsS "http://127.0.0.1:${CONTROL_PORT}/health" >/dev/null 2>&1; then
|
| 156 |
+
break
|
| 157 |
+
fi
|
| 158 |
+
sleep 2
|
| 159 |
+
done
|
| 160 |
+
|
| 161 |
+
PUBLIC_IP="$(curl -fsS https://api.ipify.org 2>/dev/null || true)"
|
| 162 |
+
if [[ -z "${PUBLIC_IP}" ]]; then
|
| 163 |
+
PUBLIC_IP="$(hostname -I 2>/dev/null | awk '{print $1}')"
|
| 164 |
+
fi
|
| 165 |
+
PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
|
| 166 |
+
|
| 167 |
+
echo ""
|
| 168 |
+
echo "=== Deploy Complete ==="
|
| 169 |
+
echo "Control health: http://127.0.0.1:${CONTROL_PORT}/health"
|
| 170 |
+
echo "Control step: http://127.0.0.1:${CONTROL_PORT}/step"
|
| 171 |
+
echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
|
| 172 |
+
echo "Prometheus URL: ${PROM_URL_DISPLAY}"
|
| 173 |
+
echo "Grafana URL: http://${PUBLIC_IP:-<droplet-ip>}:30000 (admin / antiatropos)"
|
| 174 |
+
echo ""
|
| 175 |
+
echo "Service status command:"
|
| 176 |
+
echo " systemctl status antiatropos-control --no-pager"
|
| 177 |
+
echo ""
|
| 178 |
+
echo "If needed, edit env and restart control service:"
|
| 179 |
+
echo " ${ENV_FILE}"
|
| 180 |
+
echo " systemctl restart antiatropos-control"
|
| 181 |
+
echo ""
|
| 182 |
+
echo "Verify remote scaling path:"
|
| 183 |
+
echo " watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas'"
|
deploy/do/uninstall-legacy-openenv.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# Removes legacy VM OpenEnv service path.
|
| 5 |
+
# This keeps droplet runtime focused on control API + observability only.
|
| 6 |
+
|
| 7 |
+
if [[ "${EUID}" -ne 0 ]]; then
|
| 8 |
+
echo "Run as root: sudo bash deploy/do/uninstall-legacy-openenv.sh"
|
| 9 |
+
exit 1
|
| 10 |
+
fi
|
| 11 |
+
|
| 12 |
+
if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
|
| 13 |
+
echo "Stopping and disabling antiatropos-fastapi..."
|
| 14 |
+
systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
|
| 15 |
+
else
|
| 16 |
+
echo "antiatropos-fastapi service not registered."
|
| 17 |
+
fi
|
| 18 |
+
|
| 19 |
+
if [[ -f /etc/systemd/system/antiatropos-fastapi.service ]]; then
|
| 20 |
+
rm -f /etc/systemd/system/antiatropos-fastapi.service
|
| 21 |
+
echo "Removed /etc/systemd/system/antiatropos-fastapi.service"
|
| 22 |
+
fi
|
| 23 |
+
|
| 24 |
+
systemctl daemon-reload
|
| 25 |
+
echo "Legacy VM OpenEnv service cleanup complete."
|
deploy/entrypoint.sh
CHANGED
|
@@ -1,62 +1,71 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
set -euo pipefail
|
| 3 |
-
|
| 4 |
-
FASTAPI_PID=""
|
| 5 |
-
PROMETHEUS_PID=""
|
| 6 |
-
GRAFANA_PID=""
|
| 7 |
-
NGINX_PID=""
|
| 8 |
-
MONITOR_PID=""
|
| 9 |
-
|
| 10 |
-
cleanup() {
|
| 11 |
-
for pid in "${MONITOR_PID}" "${NGINX_PID}" "${GRAFANA_PID}" "${PROMETHEUS_PID}" "${FASTAPI_PID}"; do
|
| 12 |
-
if [[ -n "${pid}" ]]; then
|
| 13 |
-
kill "${pid}" 2>/dev/null || true
|
| 14 |
-
fi
|
| 15 |
-
done
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
trap cleanup INT TERM EXIT
|
| 19 |
-
|
| 20 |
-
cd /app
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
--config
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
FASTAPI_PID=""
|
| 5 |
+
PROMETHEUS_PID=""
|
| 6 |
+
GRAFANA_PID=""
|
| 7 |
+
NGINX_PID=""
|
| 8 |
+
MONITOR_PID=""
|
| 9 |
+
|
| 10 |
+
cleanup() {
|
| 11 |
+
for pid in "${MONITOR_PID}" "${NGINX_PID}" "${GRAFANA_PID}" "${PROMETHEUS_PID}" "${FASTAPI_PID}"; do
|
| 12 |
+
if [[ -n "${pid}" ]]; then
|
| 13 |
+
kill "${pid}" 2>/dev/null || true
|
| 14 |
+
fi
|
| 15 |
+
done
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
trap cleanup INT TERM EXIT
|
| 19 |
+
|
| 20 |
+
cd /app
|
| 21 |
+
|
| 22 |
+
# Source HF Spaces live-mode config if present (overrides Dockerfile defaults)
|
| 23 |
+
if [[ -f /app/.env.hf ]]; then
|
| 24 |
+
echo "Loading .env.hf..."
|
| 25 |
+
set -a
|
| 26 |
+
# shellcheck source=/dev/null
|
| 27 |
+
source /app/.env.hf
|
| 28 |
+
set +a
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
uvicorn server.app:app --host 127.0.0.1 --port 8000 &
|
| 32 |
+
FASTAPI_PID=$!
|
| 33 |
+
|
| 34 |
+
/opt/prometheus/prometheus \
|
| 35 |
+
--config.file=/etc/prometheus/prometheus.yml \
|
| 36 |
+
--storage.tsdb.path=/tmp/prometheus-data \
|
| 37 |
+
--web.listen-address=127.0.0.1:9090 \
|
| 38 |
+
--web.route-prefix=/prometheus \
|
| 39 |
+
&
|
| 40 |
+
PROMETHEUS_PID=$!
|
| 41 |
+
|
| 42 |
+
/opt/grafana/bin/grafana-server \
|
| 43 |
+
--homepath /opt/grafana \
|
| 44 |
+
--config /etc/grafana/grafana.ini \
|
| 45 |
+
cfg:default.paths.data=/var/lib/grafana \
|
| 46 |
+
cfg:default.paths.logs=/var/log/grafana \
|
| 47 |
+
cfg:default.paths.plugins=/var/lib/grafana/plugins \
|
| 48 |
+
cfg:default.paths.provisioning=/etc/grafana/provisioning \
|
| 49 |
+
&
|
| 50 |
+
GRAFANA_PID=$!
|
| 51 |
+
|
| 52 |
+
nginx -g "daemon off;" &
|
| 53 |
+
NGINX_PID=$!
|
| 54 |
+
|
| 55 |
+
monitor_children() {
|
| 56 |
+
while true; do
|
| 57 |
+
for pid in "${FASTAPI_PID}" "${PROMETHEUS_PID}" "${GRAFANA_PID}"; do
|
| 58 |
+
if ! kill -0 "${pid}" 2>/dev/null; then
|
| 59 |
+
echo "A backend service exited unexpectedly." >&2
|
| 60 |
+
kill "${NGINX_PID}" 2>/dev/null || true
|
| 61 |
+
exit 1
|
| 62 |
+
fi
|
| 63 |
+
done
|
| 64 |
+
sleep 2
|
| 65 |
+
done
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
monitor_children &
|
| 69 |
+
MONITOR_PID=$!
|
| 70 |
+
|
| 71 |
+
wait "${NGINX_PID}"
|
deploy/grafana-datasource-local.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana datasource provisioning - points to in-cluster Prometheus
|
| 2 |
+
apiVersion: 1
|
| 3 |
+
|
| 4 |
+
datasources:
|
| 5 |
+
- name: Prometheus
|
| 6 |
+
uid: PBFA97CFB590B2093
|
| 7 |
+
type: prometheus
|
| 8 |
+
access: proxy
|
| 9 |
+
url: http://prometheus-server.monitoring.svc.cluster.local
|
| 10 |
+
isDefault: true
|
| 11 |
+
editable: true
|
deploy/grafana-helm-values.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana self-hosted on Kind - Simplified dashboard + datasource setup
|
| 2 |
+
|
| 3 |
+
adminUser: admin
|
| 4 |
+
adminPassword: antiatropos
|
| 5 |
+
|
| 6 |
+
service:
|
| 7 |
+
type: ClusterIP
|
| 8 |
+
port: 80
|
| 9 |
+
|
| 10 |
+
persistence:
|
| 11 |
+
enabled: false
|
| 12 |
+
|
| 13 |
+
# Datasource provisioning - mount as separate file
|
| 14 |
+
datasources:
|
| 15 |
+
datasources.yaml:
|
| 16 |
+
apiVersion: 1
|
| 17 |
+
datasources:
|
| 18 |
+
- name: Prometheus
|
| 19 |
+
uid: PBFA97CFB590B2093
|
| 20 |
+
type: prometheus
|
| 21 |
+
access: proxy
|
| 22 |
+
url: http://prometheus-server.monitoring.svc.cluster.local
|
| 23 |
+
isDefault: true
|
| 24 |
+
editable: true
|
| 25 |
+
|
| 26 |
+
# Dashboard provider config
|
| 27 |
+
dashboardProviders:
|
| 28 |
+
dashboardproviders.yaml:
|
| 29 |
+
apiVersion: 1
|
| 30 |
+
providers:
|
| 31 |
+
- name: AntiAtropos
|
| 32 |
+
orgId: 1
|
| 33 |
+
folder: AntiAtropos
|
| 34 |
+
type: file
|
| 35 |
+
disableDeletion: false
|
| 36 |
+
editable: true
|
| 37 |
+
updateIntervalSeconds: 30
|
| 38 |
+
options:
|
| 39 |
+
path: /var/lib/grafana/dashboards/antiatropos
|
| 40 |
+
|
| 41 |
+
# Mount dashboard JSONs from ConfigMap
|
| 42 |
+
extraConfigmapMounts:
|
| 43 |
+
- name: grafana-dashboards
|
| 44 |
+
configMap: grafana-dashboards
|
| 45 |
+
mountPath: /var/lib/grafana/dashboards/antiatropos
|
| 46 |
+
readOnly: true
|
deploy/grafana/grafana.ini
CHANGED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
-
[server]
|
| 2 |
-
http_addr = 127.0.0.1
|
| 3 |
-
http_port = 3000
|
| 4 |
-
domain = localhost
|
| 5 |
-
root_url = /grafana/
|
| 6 |
-
serve_from_sub_path = true
|
| 7 |
-
router_logging = false
|
| 8 |
-
enable_gzip = true
|
| 9 |
-
|
| 10 |
-
[auth]
|
| 11 |
-
disable_login_form = false
|
| 12 |
-
|
| 13 |
-
[auth.anonymous]
|
| 14 |
-
enabled = true
|
| 15 |
-
org_role = Viewer
|
| 16 |
-
|
| 17 |
-
[dashboards]
|
| 18 |
-
default_home_dashboard_path = /etc/grafana/provisioning/dashboards/json/antiatropos-overview.json
|
| 19 |
-
|
| 20 |
-
[security]
|
| 21 |
-
allow_embedding = true
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
http_addr = 127.0.0.1
|
| 3 |
+
http_port = 3000
|
| 4 |
+
domain = localhost
|
| 5 |
+
root_url = /grafana/
|
| 6 |
+
serve_from_sub_path = true
|
| 7 |
+
router_logging = false
|
| 8 |
+
enable_gzip = true
|
| 9 |
+
|
| 10 |
+
[auth]
|
| 11 |
+
disable_login_form = false
|
| 12 |
+
|
| 13 |
+
[auth.anonymous]
|
| 14 |
+
enabled = true
|
| 15 |
+
org_role = Viewer
|
| 16 |
+
|
| 17 |
+
[dashboards]
|
| 18 |
+
default_home_dashboard_path = /etc/grafana/provisioning/dashboards/json/antiatropos-overview.json
|
| 19 |
+
|
| 20 |
+
[security]
|
| 21 |
+
allow_embedding = true
|
deploy/grafana/provisioning/dashboards/dashboard.yaml
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
apiVersion: 1
|
| 2 |
-
|
| 3 |
-
providers:
|
| 4 |
-
- name: AntiAtropos Dashboards
|
| 5 |
-
orgId: 1
|
| 6 |
-
folder: AntiAtropos
|
| 7 |
-
type: file
|
| 8 |
-
disableDeletion: false
|
| 9 |
-
editable: true
|
| 10 |
-
updateIntervalSeconds: 30
|
| 11 |
-
options:
|
| 12 |
-
path: /etc/grafana/provisioning/dashboards/json
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
providers:
|
| 4 |
+
- name: AntiAtropos Dashboards
|
| 5 |
+
orgId: 1
|
| 6 |
+
folder: AntiAtropos
|
| 7 |
+
type: file
|
| 8 |
+
disableDeletion: false
|
| 9 |
+
editable: true
|
| 10 |
+
updateIntervalSeconds: 30
|
| 11 |
+
options:
|
| 12 |
+
path: /etc/grafana/provisioning/dashboards/json
|
deploy/grafana/provisioning/dashboards/json/antiatropos-live.json
CHANGED
|
@@ -1,334 +1,334 @@
|
|
| 1 |
-
{
|
| 2 |
-
"annotations": {
|
| 3 |
-
"list": [
|
| 4 |
-
{
|
| 5 |
-
"builtIn": 1,
|
| 6 |
-
"datasource": {
|
| 7 |
-
"type": "grafana",
|
| 8 |
-
"uid": "-- Grafana --"
|
| 9 |
-
},
|
| 10 |
-
"enable": true,
|
| 11 |
-
"hide": true,
|
| 12 |
-
"iconColor": "rgba(0, 211, 255, 1)",
|
| 13 |
-
"name": "Annotations & Alerts",
|
| 14 |
-
"type": "dashboard"
|
| 15 |
-
}
|
| 16 |
-
]
|
| 17 |
-
},
|
| 18 |
-
"editable": true,
|
| 19 |
-
"fiscalYearStartMonth": 0,
|
| 20 |
-
"graphTooltip": 0,
|
| 21 |
-
"id": null,
|
| 22 |
-
"links": [],
|
| 23 |
-
"liveNow": false,
|
| 24 |
-
"panels": [
|
| 25 |
-
{
|
| 26 |
-
"datasource": {
|
| 27 |
-
"type": "prometheus",
|
| 28 |
-
"uid": "PBFA97CFB590B2093"
|
| 29 |
-
},
|
| 30 |
-
"fieldConfig": {
|
| 31 |
-
"defaults": {
|
| 32 |
-
"color": {
|
| 33 |
-
"mode": "palette-classic"
|
| 34 |
-
}
|
| 35 |
-
},
|
| 36 |
-
"overrides": []
|
| 37 |
-
},
|
| 38 |
-
"gridPos": {
|
| 39 |
-
"h": 7,
|
| 40 |
-
"w": 12,
|
| 41 |
-
"x": 0,
|
| 42 |
-
"y": 0
|
| 43 |
-
},
|
| 44 |
-
"id": 1,
|
| 45 |
-
"options": {
|
| 46 |
-
"legend": {
|
| 47 |
-
"calcs": [],
|
| 48 |
-
"displayMode": "list",
|
| 49 |
-
"placement": "bottom"
|
| 50 |
-
},
|
| 51 |
-
"tooltip": {
|
| 52 |
-
"mode": "single"
|
| 53 |
-
}
|
| 54 |
-
},
|
| 55 |
-
"targets": [
|
| 56 |
-
{
|
| 57 |
-
"expr": "sum by (action_type, ack_class) (rate(antiatropos_actions_total{task_id=~\"$task\",mode=~\"$mode\"}[
|
| 58 |
-
"legendFormat": "{{action_type}} {{ack_class}}",
|
| 59 |
-
"refId": "A"
|
| 60 |
-
}
|
| 61 |
-
],
|
| 62 |
-
"title": "Actions Per Second",
|
| 63 |
-
"type": "timeseries"
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"datasource": {
|
| 67 |
-
"type": "prometheus",
|
| 68 |
-
"uid": "PBFA97CFB590B2093"
|
| 69 |
-
},
|
| 70 |
-
"fieldConfig": {
|
| 71 |
-
"defaults": {
|
| 72 |
-
"color": {
|
| 73 |
-
"mode": "palette-classic"
|
| 74 |
-
},
|
| 75 |
-
"min": 0,
|
| 76 |
-
"max": 1
|
| 77 |
-
},
|
| 78 |
-
"overrides": []
|
| 79 |
-
},
|
| 80 |
-
"gridPos": {
|
| 81 |
-
"h": 7,
|
| 82 |
-
"w": 12,
|
| 83 |
-
"x": 12,
|
| 84 |
-
"y": 0
|
| 85 |
-
},
|
| 86 |
-
"id": 2,
|
| 87 |
-
"options": {
|
| 88 |
-
"legend": {
|
| 89 |
-
"calcs": [],
|
| 90 |
-
"displayMode": "table",
|
| 91 |
-
"placement": "bottom"
|
| 92 |
-
},
|
| 93 |
-
"tooltip": {
|
| 94 |
-
"mode": "single"
|
| 95 |
-
}
|
| 96 |
-
},
|
| 97 |
-
"targets": [
|
| 98 |
-
{
|
| 99 |
-
"expr": "antiatropos_reward_normalized{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 100 |
-
"legendFormat": "{{task_id}}/{{mode}} normalized",
|
| 101 |
-
"refId": "A"
|
| 102 |
-
}
|
| 103 |
-
],
|
| 104 |
-
"title": "Normalized Reward [0,1]",
|
| 105 |
-
"type": "timeseries"
|
| 106 |
-
},
|
| 107 |
-
{
|
| 108 |
-
"datasource": {
|
| 109 |
-
"type": "prometheus",
|
| 110 |
-
"uid": "PBFA97CFB590B2093"
|
| 111 |
-
},
|
| 112 |
-
"fieldConfig": {
|
| 113 |
-
"defaults": {
|
| 114 |
-
"color": {
|
| 115 |
-
"mode": "palette-classic"
|
| 116 |
-
}
|
| 117 |
-
},
|
| 118 |
-
"overrides": []
|
| 119 |
-
},
|
| 120 |
-
"gridPos": {
|
| 121 |
-
"h": 7,
|
| 122 |
-
"w": 12,
|
| 123 |
-
"x": 0,
|
| 124 |
-
"y": 7
|
| 125 |
-
},
|
| 126 |
-
"id": 3,
|
| 127 |
-
"options": {
|
| 128 |
-
"legend": {
|
| 129 |
-
"calcs": [],
|
| 130 |
-
"displayMode": "table",
|
| 131 |
-
"placement": "bottom"
|
| 132 |
-
},
|
| 133 |
-
"tooltip": {
|
| 134 |
-
"mode": "single"
|
| 135 |
-
}
|
| 136 |
-
},
|
| 137 |
-
"targets": [
|
| 138 |
-
{
|
| 139 |
-
"expr": "antiatropos_reward_raw{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 140 |
-
"legendFormat": "{{task_id}}/{{mode}} raw",
|
| 141 |
-
"refId": "A"
|
| 142 |
-
}
|
| 143 |
-
],
|
| 144 |
-
"title": "Raw Reward",
|
| 145 |
-
"type": "timeseries"
|
| 146 |
-
},
|
| 147 |
-
{
|
| 148 |
-
"datasource": {
|
| 149 |
-
"type": "prometheus",
|
| 150 |
-
"uid": "PBFA97CFB590B2093"
|
| 151 |
-
},
|
| 152 |
-
"fieldConfig": {
|
| 153 |
-
"defaults": {
|
| 154 |
-
"color": {
|
| 155 |
-
"mode": "palette-classic"
|
| 156 |
-
}
|
| 157 |
-
},
|
| 158 |
-
"overrides": []
|
| 159 |
-
},
|
| 160 |
-
"gridPos": {
|
| 161 |
-
"h": 7,
|
| 162 |
-
"w": 12,
|
| 163 |
-
"x": 12,
|
| 164 |
-
"y": 7
|
| 165 |
-
},
|
| 166 |
-
"id": 4,
|
| 167 |
-
"options": {
|
| 168 |
-
"legend": {
|
| 169 |
-
"calcs": [],
|
| 170 |
-
"displayMode": "table",
|
| 171 |
-
"placement": "bottom"
|
| 172 |
-
},
|
| 173 |
-
"tooltip": {
|
| 174 |
-
"mode": "single"
|
| 175 |
-
}
|
| 176 |
-
},
|
| 177 |
-
"targets": [
|
| 178 |
-
{
|
| 179 |
-
"expr": "antiatropos_total_queue_backlog{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 180 |
-
"legendFormat": "{{task_id}}/{{mode}} queue",
|
| 181 |
-
"refId": "A"
|
| 182 |
-
},
|
| 183 |
-
{
|
| 184 |
-
"expr": "antiatropos_average_latency_norm{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 185 |
-
"legendFormat": "{{task_id}}/{{mode}} latency",
|
| 186 |
-
"refId": "B"
|
| 187 |
-
}
|
| 188 |
-
],
|
| 189 |
-
"title": "Queue Backlog and Latency (Norm)",
|
| 190 |
-
"type": "timeseries"
|
| 191 |
-
},
|
| 192 |
-
{
|
| 193 |
-
"datasource": {
|
| 194 |
-
"type": "prometheus",
|
| 195 |
-
"uid": "PBFA97CFB590B2093"
|
| 196 |
-
},
|
| 197 |
-
"fieldConfig": {
|
| 198 |
-
"defaults": {
|
| 199 |
-
"color": {
|
| 200 |
-
"mode": "palette-classic"
|
| 201 |
-
}
|
| 202 |
-
},
|
| 203 |
-
"overrides": []
|
| 204 |
-
},
|
| 205 |
-
"gridPos": {
|
| 206 |
-
"h": 7,
|
| 207 |
-
"w": 12,
|
| 208 |
-
"x": 0,
|
| 209 |
-
"y": 14
|
| 210 |
-
},
|
| 211 |
-
"id": 5,
|
| 212 |
-
"options": {
|
| 213 |
-
"legend": {
|
| 214 |
-
"calcs": [],
|
| 215 |
-
"displayMode": "table",
|
| 216 |
-
"placement": "bottom"
|
| 217 |
-
},
|
| 218 |
-
"tooltip": {
|
| 219 |
-
"mode": "single"
|
| 220 |
-
}
|
| 221 |
-
},
|
| 222 |
-
"targets": [
|
| 223 |
-
{
|
| 224 |
-
"expr": "antiatropos_lyapunov_energy{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 225 |
-
"legendFormat": "{{task_id}}/{{mode}}",
|
| 226 |
-
"refId": "A"
|
| 227 |
-
}
|
| 228 |
-
],
|
| 229 |
-
"title": "Lyapunov Energy",
|
| 230 |
-
"type": "timeseries"
|
| 231 |
-
},
|
| 232 |
-
{
|
| 233 |
-
"datasource": {
|
| 234 |
-
"type": "prometheus",
|
| 235 |
-
"uid": "PBFA97CFB590B2093"
|
| 236 |
-
},
|
| 237 |
-
"fieldConfig": {
|
| 238 |
-
"defaults": {
|
| 239 |
-
"color": {
|
| 240 |
-
"mode": "palette-classic"
|
| 241 |
-
}
|
| 242 |
-
},
|
| 243 |
-
"overrides": []
|
| 244 |
-
},
|
| 245 |
-
"gridPos": {
|
| 246 |
-
"h": 7,
|
| 247 |
-
"w": 12,
|
| 248 |
-
"x": 12,
|
| 249 |
-
"y": 14
|
| 250 |
-
},
|
| 251 |
-
"id": 6,
|
| 252 |
-
"options": {
|
| 253 |
-
"legend": {
|
| 254 |
-
"calcs": [],
|
| 255 |
-
"displayMode": "table",
|
| 256 |
-
"placement": "bottom"
|
| 257 |
-
},
|
| 258 |
-
"tooltip": {
|
| 259 |
-
"mode": "single"
|
| 260 |
-
}
|
| 261 |
-
},
|
| 262 |
-
"targets": [
|
| 263 |
-
{
|
| 264 |
-
"expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"$mode\"}[
|
| 265 |
-
"legendFormat": "p95 {{mode}}",
|
| 266 |
-
"refId": "A"
|
| 267 |
-
},
|
| 268 |
-
{
|
| 269 |
-
"expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"$mode\"}[5m]))",
|
| 270 |
-
"legendFormat": "{{mode}} {{error_code}}",
|
| 271 |
-
"refId": "B"
|
| 272 |
-
}
|
| 273 |
-
],
|
| 274 |
-
"title": "Executor Latency p95 and Errors/s",
|
| 275 |
-
"type": "timeseries"
|
| 276 |
-
}
|
| 277 |
-
],
|
| 278 |
-
"refresh": "5s",
|
| 279 |
-
"schemaVersion": 39,
|
| 280 |
-
"style": "dark",
|
| 281 |
-
"tags": [
|
| 282 |
-
"antiatropos",
|
| 283 |
-
"sre",
|
| 284 |
-
"rl"
|
| 285 |
-
],
|
| 286 |
-
"templating": {
|
| 287 |
-
"list": [
|
| 288 |
-
{
|
| 289 |
-
"datasource": {
|
| 290 |
-
"type": "prometheus",
|
| 291 |
-
"uid": "PBFA97CFB590B2093"
|
| 292 |
-
},
|
| 293 |
-
"definition": "label_values(antiatropos_steps_total, task_id)",
|
| 294 |
-
"includeAll": true,
|
| 295 |
-
"multi": true,
|
| 296 |
-
"name": "task",
|
| 297 |
-
"query": {
|
| 298 |
-
"qryType": 1,
|
| 299 |
-
"query": "label_values(antiatropos_steps_total, task_id)",
|
| 300 |
-
"refId": "TaskVar"
|
| 301 |
-
},
|
| 302 |
-
"refresh": 2,
|
| 303 |
-
"type": "query"
|
| 304 |
-
},
|
| 305 |
-
{
|
| 306 |
-
"datasource": {
|
| 307 |
-
"type": "prometheus",
|
| 308 |
-
"uid": "PBFA97CFB590B2093"
|
| 309 |
-
},
|
| 310 |
-
"definition": "label_values(antiatropos_steps_total, mode)",
|
| 311 |
-
"includeAll": true,
|
| 312 |
-
"multi": true,
|
| 313 |
-
"name": "mode",
|
| 314 |
-
"query": {
|
| 315 |
-
"qryType": 1,
|
| 316 |
-
"query": "label_values(antiatropos_steps_total, mode)",
|
| 317 |
-
"refId": "ModeVar"
|
| 318 |
-
},
|
| 319 |
-
"refresh": 2,
|
| 320 |
-
"type": "query"
|
| 321 |
-
}
|
| 322 |
-
]
|
| 323 |
-
},
|
| 324 |
-
"time": {
|
| 325 |
-
"from": "now-15m",
|
| 326 |
-
"to": "now"
|
| 327 |
-
},
|
| 328 |
-
"timepicker": {},
|
| 329 |
-
"timezone": "",
|
| 330 |
-
"title": "AntiAtropos Live Control Plane",
|
| 331 |
-
"uid": "antiatropos-live",
|
| 332 |
-
"version": 2,
|
| 333 |
-
"weekStart": ""
|
| 334 |
-
}
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": {
|
| 7 |
+
"type": "grafana",
|
| 8 |
+
"uid": "-- Grafana --"
|
| 9 |
+
},
|
| 10 |
+
"enable": true,
|
| 11 |
+
"hide": true,
|
| 12 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 13 |
+
"name": "Annotations & Alerts",
|
| 14 |
+
"type": "dashboard"
|
| 15 |
+
}
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
"editable": true,
|
| 19 |
+
"fiscalYearStartMonth": 0,
|
| 20 |
+
"graphTooltip": 0,
|
| 21 |
+
"id": null,
|
| 22 |
+
"links": [],
|
| 23 |
+
"liveNow": false,
|
| 24 |
+
"panels": [
|
| 25 |
+
{
|
| 26 |
+
"datasource": {
|
| 27 |
+
"type": "prometheus",
|
| 28 |
+
"uid": "PBFA97CFB590B2093"
|
| 29 |
+
},
|
| 30 |
+
"fieldConfig": {
|
| 31 |
+
"defaults": {
|
| 32 |
+
"color": {
|
| 33 |
+
"mode": "palette-classic"
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"overrides": []
|
| 37 |
+
},
|
| 38 |
+
"gridPos": {
|
| 39 |
+
"h": 7,
|
| 40 |
+
"w": 12,
|
| 41 |
+
"x": 0,
|
| 42 |
+
"y": 0
|
| 43 |
+
},
|
| 44 |
+
"id": 1,
|
| 45 |
+
"options": {
|
| 46 |
+
"legend": {
|
| 47 |
+
"calcs": [],
|
| 48 |
+
"displayMode": "list",
|
| 49 |
+
"placement": "bottom"
|
| 50 |
+
},
|
| 51 |
+
"tooltip": {
|
| 52 |
+
"mode": "single"
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"targets": [
|
| 56 |
+
{
|
| 57 |
+
"expr": "sum by (action_type, ack_class) (rate(antiatropos_actions_total{task_id=~\"$task\",mode=~\"$mode\"}[5m]))",
|
| 58 |
+
"legendFormat": "{{action_type}} {{ack_class}}",
|
| 59 |
+
"refId": "A"
|
| 60 |
+
}
|
| 61 |
+
],
|
| 62 |
+
"title": "Actions Per Second",
|
| 63 |
+
"type": "timeseries"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"datasource": {
|
| 67 |
+
"type": "prometheus",
|
| 68 |
+
"uid": "PBFA97CFB590B2093"
|
| 69 |
+
},
|
| 70 |
+
"fieldConfig": {
|
| 71 |
+
"defaults": {
|
| 72 |
+
"color": {
|
| 73 |
+
"mode": "palette-classic"
|
| 74 |
+
},
|
| 75 |
+
"min": 0,
|
| 76 |
+
"max": 1
|
| 77 |
+
},
|
| 78 |
+
"overrides": []
|
| 79 |
+
},
|
| 80 |
+
"gridPos": {
|
| 81 |
+
"h": 7,
|
| 82 |
+
"w": 12,
|
| 83 |
+
"x": 12,
|
| 84 |
+
"y": 0
|
| 85 |
+
},
|
| 86 |
+
"id": 2,
|
| 87 |
+
"options": {
|
| 88 |
+
"legend": {
|
| 89 |
+
"calcs": [],
|
| 90 |
+
"displayMode": "table",
|
| 91 |
+
"placement": "bottom"
|
| 92 |
+
},
|
| 93 |
+
"tooltip": {
|
| 94 |
+
"mode": "single"
|
| 95 |
+
}
|
| 96 |
+
},
|
| 97 |
+
"targets": [
|
| 98 |
+
{
|
| 99 |
+
"expr": "antiatropos_reward_normalized{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 100 |
+
"legendFormat": "{{task_id}}/{{mode}} normalized",
|
| 101 |
+
"refId": "A"
|
| 102 |
+
}
|
| 103 |
+
],
|
| 104 |
+
"title": "Normalized Reward [0,1]",
|
| 105 |
+
"type": "timeseries"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"datasource": {
|
| 109 |
+
"type": "prometheus",
|
| 110 |
+
"uid": "PBFA97CFB590B2093"
|
| 111 |
+
},
|
| 112 |
+
"fieldConfig": {
|
| 113 |
+
"defaults": {
|
| 114 |
+
"color": {
|
| 115 |
+
"mode": "palette-classic"
|
| 116 |
+
}
|
| 117 |
+
},
|
| 118 |
+
"overrides": []
|
| 119 |
+
},
|
| 120 |
+
"gridPos": {
|
| 121 |
+
"h": 7,
|
| 122 |
+
"w": 12,
|
| 123 |
+
"x": 0,
|
| 124 |
+
"y": 7
|
| 125 |
+
},
|
| 126 |
+
"id": 3,
|
| 127 |
+
"options": {
|
| 128 |
+
"legend": {
|
| 129 |
+
"calcs": [],
|
| 130 |
+
"displayMode": "table",
|
| 131 |
+
"placement": "bottom"
|
| 132 |
+
},
|
| 133 |
+
"tooltip": {
|
| 134 |
+
"mode": "single"
|
| 135 |
+
}
|
| 136 |
+
},
|
| 137 |
+
"targets": [
|
| 138 |
+
{
|
| 139 |
+
"expr": "antiatropos_reward_raw{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 140 |
+
"legendFormat": "{{task_id}}/{{mode}} raw",
|
| 141 |
+
"refId": "A"
|
| 142 |
+
}
|
| 143 |
+
],
|
| 144 |
+
"title": "Raw Reward",
|
| 145 |
+
"type": "timeseries"
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"datasource": {
|
| 149 |
+
"type": "prometheus",
|
| 150 |
+
"uid": "PBFA97CFB590B2093"
|
| 151 |
+
},
|
| 152 |
+
"fieldConfig": {
|
| 153 |
+
"defaults": {
|
| 154 |
+
"color": {
|
| 155 |
+
"mode": "palette-classic"
|
| 156 |
+
}
|
| 157 |
+
},
|
| 158 |
+
"overrides": []
|
| 159 |
+
},
|
| 160 |
+
"gridPos": {
|
| 161 |
+
"h": 7,
|
| 162 |
+
"w": 12,
|
| 163 |
+
"x": 12,
|
| 164 |
+
"y": 7
|
| 165 |
+
},
|
| 166 |
+
"id": 4,
|
| 167 |
+
"options": {
|
| 168 |
+
"legend": {
|
| 169 |
+
"calcs": [],
|
| 170 |
+
"displayMode": "table",
|
| 171 |
+
"placement": "bottom"
|
| 172 |
+
},
|
| 173 |
+
"tooltip": {
|
| 174 |
+
"mode": "single"
|
| 175 |
+
}
|
| 176 |
+
},
|
| 177 |
+
"targets": [
|
| 178 |
+
{
|
| 179 |
+
"expr": "antiatropos_total_queue_backlog{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 180 |
+
"legendFormat": "{{task_id}}/{{mode}} queue",
|
| 181 |
+
"refId": "A"
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"expr": "antiatropos_average_latency_norm{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 185 |
+
"legendFormat": "{{task_id}}/{{mode}} latency",
|
| 186 |
+
"refId": "B"
|
| 187 |
+
}
|
| 188 |
+
],
|
| 189 |
+
"title": "Queue Backlog and Latency (Norm)",
|
| 190 |
+
"type": "timeseries"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"datasource": {
|
| 194 |
+
"type": "prometheus",
|
| 195 |
+
"uid": "PBFA97CFB590B2093"
|
| 196 |
+
},
|
| 197 |
+
"fieldConfig": {
|
| 198 |
+
"defaults": {
|
| 199 |
+
"color": {
|
| 200 |
+
"mode": "palette-classic"
|
| 201 |
+
}
|
| 202 |
+
},
|
| 203 |
+
"overrides": []
|
| 204 |
+
},
|
| 205 |
+
"gridPos": {
|
| 206 |
+
"h": 7,
|
| 207 |
+
"w": 12,
|
| 208 |
+
"x": 0,
|
| 209 |
+
"y": 14
|
| 210 |
+
},
|
| 211 |
+
"id": 5,
|
| 212 |
+
"options": {
|
| 213 |
+
"legend": {
|
| 214 |
+
"calcs": [],
|
| 215 |
+
"displayMode": "table",
|
| 216 |
+
"placement": "bottom"
|
| 217 |
+
},
|
| 218 |
+
"tooltip": {
|
| 219 |
+
"mode": "single"
|
| 220 |
+
}
|
| 221 |
+
},
|
| 222 |
+
"targets": [
|
| 223 |
+
{
|
| 224 |
+
"expr": "antiatropos_lyapunov_energy{task_id=~\"$task\",mode=~\"$mode\"}",
|
| 225 |
+
"legendFormat": "{{task_id}}/{{mode}}",
|
| 226 |
+
"refId": "A"
|
| 227 |
+
}
|
| 228 |
+
],
|
| 229 |
+
"title": "Lyapunov Energy",
|
| 230 |
+
"type": "timeseries"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"datasource": {
|
| 234 |
+
"type": "prometheus",
|
| 235 |
+
"uid": "PBFA97CFB590B2093"
|
| 236 |
+
},
|
| 237 |
+
"fieldConfig": {
|
| 238 |
+
"defaults": {
|
| 239 |
+
"color": {
|
| 240 |
+
"mode": "palette-classic"
|
| 241 |
+
}
|
| 242 |
+
},
|
| 243 |
+
"overrides": []
|
| 244 |
+
},
|
| 245 |
+
"gridPos": {
|
| 246 |
+
"h": 7,
|
| 247 |
+
"w": 12,
|
| 248 |
+
"x": 12,
|
| 249 |
+
"y": 14
|
| 250 |
+
},
|
| 251 |
+
"id": 6,
|
| 252 |
+
"options": {
|
| 253 |
+
"legend": {
|
| 254 |
+
"calcs": [],
|
| 255 |
+
"displayMode": "table",
|
| 256 |
+
"placement": "bottom"
|
| 257 |
+
},
|
| 258 |
+
"tooltip": {
|
| 259 |
+
"mode": "single"
|
| 260 |
+
}
|
| 261 |
+
},
|
| 262 |
+
"targets": [
|
| 263 |
+
{
|
| 264 |
+
"expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"$mode\"}[5m])) by (le, mode))",
|
| 265 |
+
"legendFormat": "p95 {{mode}}",
|
| 266 |
+
"refId": "A"
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"$mode\"}[5m]))",
|
| 270 |
+
"legendFormat": "{{mode}} {{error_code}}",
|
| 271 |
+
"refId": "B"
|
| 272 |
+
}
|
| 273 |
+
],
|
| 274 |
+
"title": "Executor Latency p95 and Errors/s",
|
| 275 |
+
"type": "timeseries"
|
| 276 |
+
}
|
| 277 |
+
],
|
| 278 |
+
"refresh": "5s",
|
| 279 |
+
"schemaVersion": 39,
|
| 280 |
+
"style": "dark",
|
| 281 |
+
"tags": [
|
| 282 |
+
"antiatropos",
|
| 283 |
+
"sre",
|
| 284 |
+
"rl"
|
| 285 |
+
],
|
| 286 |
+
"templating": {
|
| 287 |
+
"list": [
|
| 288 |
+
{
|
| 289 |
+
"datasource": {
|
| 290 |
+
"type": "prometheus",
|
| 291 |
+
"uid": "PBFA97CFB590B2093"
|
| 292 |
+
},
|
| 293 |
+
"definition": "label_values(antiatropos_steps_total, task_id)",
|
| 294 |
+
"includeAll": true,
|
| 295 |
+
"multi": true,
|
| 296 |
+
"name": "task",
|
| 297 |
+
"query": {
|
| 298 |
+
"qryType": 1,
|
| 299 |
+
"query": "label_values(antiatropos_steps_total, task_id)",
|
| 300 |
+
"refId": "TaskVar"
|
| 301 |
+
},
|
| 302 |
+
"refresh": 2,
|
| 303 |
+
"type": "query"
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"datasource": {
|
| 307 |
+
"type": "prometheus",
|
| 308 |
+
"uid": "PBFA97CFB590B2093"
|
| 309 |
+
},
|
| 310 |
+
"definition": "label_values(antiatropos_steps_total, mode)",
|
| 311 |
+
"includeAll": true,
|
| 312 |
+
"multi": true,
|
| 313 |
+
"name": "mode",
|
| 314 |
+
"query": {
|
| 315 |
+
"qryType": 1,
|
| 316 |
+
"query": "label_values(antiatropos_steps_total, mode)",
|
| 317 |
+
"refId": "ModeVar"
|
| 318 |
+
},
|
| 319 |
+
"refresh": 2,
|
| 320 |
+
"type": "query"
|
| 321 |
+
}
|
| 322 |
+
]
|
| 323 |
+
},
|
| 324 |
+
"time": {
|
| 325 |
+
"from": "now-15m",
|
| 326 |
+
"to": "now"
|
| 327 |
+
},
|
| 328 |
+
"timepicker": {},
|
| 329 |
+
"timezone": "",
|
| 330 |
+
"title": "AntiAtropos Live Control Plane",
|
| 331 |
+
"uid": "antiatropos-live",
|
| 332 |
+
"version": 2,
|
| 333 |
+
"weekStart": ""
|
| 334 |
+
}
|
deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json
CHANGED
|
@@ -76,8 +76,8 @@
|
|
| 76 |
"targets": [
|
| 77 |
{
|
| 78 |
"editorMode": "code",
|
| 79 |
-
"expr": "scalar(avg(last_over_time(antiatropos_reward{mode=\"simulated\"}[1m])))",
|
| 80 |
-
"legendFormat": "reward (
|
| 81 |
"range": true,
|
| 82 |
"refId": "A"
|
| 83 |
}
|
|
@@ -143,8 +143,8 @@
|
|
| 143 |
"targets": [
|
| 144 |
{
|
| 145 |
"editorMode": "code",
|
| 146 |
-
"expr": "scalar(avg(last_over_time(antiatropos_total_queue_backlog{mode=\"simulated\"}[1m])))",
|
| 147 |
-
"legendFormat": "queue backlog (
|
| 148 |
"range": true,
|
| 149 |
"refId": "A"
|
| 150 |
}
|
|
@@ -210,8 +210,8 @@
|
|
| 210 |
"targets": [
|
| 211 |
{
|
| 212 |
"editorMode": "code",
|
| 213 |
-
"expr": "scalar(avg(last_over_time(antiatropos_average_latency_norm{mode=\"simulated\"}[1m])))",
|
| 214 |
-
"legendFormat": "latency (
|
| 215 |
"range": true,
|
| 216 |
"refId": "A"
|
| 217 |
}
|
|
@@ -277,8 +277,8 @@
|
|
| 277 |
"targets": [
|
| 278 |
{
|
| 279 |
"editorMode": "code",
|
| 280 |
-
"expr": "scalar(avg(last_over_time(antiatropos_lyapunov_energy{mode=\"simulated\"}[1m])))",
|
| 281 |
-
"legendFormat": "lyapunov energy (
|
| 282 |
"range": true,
|
| 283 |
"refId": "A"
|
| 284 |
}
|
|
@@ -369,14 +369,14 @@
|
|
| 369 |
"targets": [
|
| 370 |
{
|
| 371 |
"editorMode": "code",
|
| 372 |
-
"expr": "antiatropos_reward{mode=\"simulated\"}",
|
| 373 |
"legendFormat": "reward {{task_id}} ({{mode}})",
|
| 374 |
"range": true,
|
| 375 |
"refId": "A"
|
| 376 |
},
|
| 377 |
{
|
| 378 |
"editorMode": "code",
|
| 379 |
-
"expr": "antiatropos_lyapunov_energy{mode=\"simulated\"}",
|
| 380 |
"legendFormat": "lyapunov {{task_id}} ({{mode}})",
|
| 381 |
"range": true,
|
| 382 |
"refId": "B"
|
|
@@ -468,14 +468,14 @@
|
|
| 468 |
"targets": [
|
| 469 |
{
|
| 470 |
"editorMode": "code",
|
| 471 |
-
"expr": "antiatropos_total_queue_backlog{mode=\"simulated\"}",
|
| 472 |
"legendFormat": "queue {{task_id}} ({{mode}})",
|
| 473 |
"range": true,
|
| 474 |
"refId": "A"
|
| 475 |
},
|
| 476 |
{
|
| 477 |
"editorMode": "code",
|
| 478 |
-
"expr": "antiatropos_average_latency_norm{mode=\"simulated\"}",
|
| 479 |
"legendFormat": "latency {{task_id}} ({{mode}})",
|
| 480 |
"range": true,
|
| 481 |
"refId": "B"
|
|
@@ -535,14 +535,14 @@
|
|
| 535 |
"targets": [
|
| 536 |
{
|
| 537 |
"editorMode": "code",
|
| 538 |
-
"expr": "sum by (task_id, mode) (rate(antiatropos_steps_total{mode=\"simulated\"}[1m]))",
|
| 539 |
"legendFormat": "steps/sec {{task_id}} ({{mode}})",
|
| 540 |
"range": true,
|
| 541 |
"refId": "A"
|
| 542 |
},
|
| 543 |
{
|
| 544 |
"editorMode": "code",
|
| 545 |
-
"expr": "sum by (task_id, mode, action_type) (rate(antiatropos_actions_total{mode=\"simulated\"}[1m]))",
|
| 546 |
"legendFormat": "actions/sec {{action_type}} ({{task_id}}, {{mode}})",
|
| 547 |
"range": true,
|
| 548 |
"refId": "B"
|
|
@@ -602,14 +602,14 @@
|
|
| 602 |
"targets": [
|
| 603 |
{
|
| 604 |
"editorMode": "code",
|
| 605 |
-
"expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=\"simulated\"}[5m]))",
|
| 606 |
"legendFormat": "executor errors {{error_code}} ({{mode}})",
|
| 607 |
"range": true,
|
| 608 |
"refId": "A"
|
| 609 |
},
|
| 610 |
{
|
| 611 |
"editorMode": "code",
|
| 612 |
-
"expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=\"simulated\"}[5m])) by (le, mode))",
|
| 613 |
"legendFormat": "p95 executor latency {{mode}}",
|
| 614 |
"range": true,
|
| 615 |
"refId": "B"
|
|
@@ -640,3 +640,8 @@
|
|
| 640 |
"version": 2,
|
| 641 |
"weekStart": ""
|
| 642 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
"targets": [
|
| 77 |
{
|
| 78 |
"editorMode": "code",
|
| 79 |
+
"expr": "scalar(avg(last_over_time(antiatropos_reward{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
|
| 80 |
+
"legendFormat": "reward (all modes)",
|
| 81 |
"range": true,
|
| 82 |
"refId": "A"
|
| 83 |
}
|
|
|
|
| 143 |
"targets": [
|
| 144 |
{
|
| 145 |
"editorMode": "code",
|
| 146 |
+
"expr": "scalar(avg(last_over_time(antiatropos_total_queue_backlog{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
|
| 147 |
+
"legendFormat": "queue backlog (all modes)",
|
| 148 |
"range": true,
|
| 149 |
"refId": "A"
|
| 150 |
}
|
|
|
|
| 210 |
"targets": [
|
| 211 |
{
|
| 212 |
"editorMode": "code",
|
| 213 |
+
"expr": "scalar(avg(last_over_time(antiatropos_average_latency_norm{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
|
| 214 |
+
"legendFormat": "latency (all modes)",
|
| 215 |
"range": true,
|
| 216 |
"refId": "A"
|
| 217 |
}
|
|
|
|
| 277 |
"targets": [
|
| 278 |
{
|
| 279 |
"editorMode": "code",
|
| 280 |
+
"expr": "scalar(avg(last_over_time(antiatropos_lyapunov_energy{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
|
| 281 |
+
"legendFormat": "lyapunov energy (all modes)",
|
| 282 |
"range": true,
|
| 283 |
"refId": "A"
|
| 284 |
}
|
|
|
|
| 369 |
"targets": [
|
| 370 |
{
|
| 371 |
"editorMode": "code",
|
| 372 |
+
"expr": "antiatropos_reward{mode=~\"live|simulated|hybrid|aws\"}",
|
| 373 |
"legendFormat": "reward {{task_id}} ({{mode}})",
|
| 374 |
"range": true,
|
| 375 |
"refId": "A"
|
| 376 |
},
|
| 377 |
{
|
| 378 |
"editorMode": "code",
|
| 379 |
+
"expr": "antiatropos_lyapunov_energy{mode=~\"live|simulated|hybrid|aws\"}",
|
| 380 |
"legendFormat": "lyapunov {{task_id}} ({{mode}})",
|
| 381 |
"range": true,
|
| 382 |
"refId": "B"
|
|
|
|
| 468 |
"targets": [
|
| 469 |
{
|
| 470 |
"editorMode": "code",
|
| 471 |
+
"expr": "antiatropos_total_queue_backlog{mode=~\"live|simulated|hybrid|aws\"}",
|
| 472 |
"legendFormat": "queue {{task_id}} ({{mode}})",
|
| 473 |
"range": true,
|
| 474 |
"refId": "A"
|
| 475 |
},
|
| 476 |
{
|
| 477 |
"editorMode": "code",
|
| 478 |
+
"expr": "antiatropos_average_latency_norm{mode=~\"live|simulated|hybrid|aws\"}",
|
| 479 |
"legendFormat": "latency {{task_id}} ({{mode}})",
|
| 480 |
"range": true,
|
| 481 |
"refId": "B"
|
|
|
|
| 535 |
"targets": [
|
| 536 |
{
|
| 537 |
"editorMode": "code",
|
| 538 |
+
"expr": "sum by (task_id, mode) (rate(antiatropos_steps_total{mode=~\"live|simulated|hybrid|aws\"}[1m]))",
|
| 539 |
"legendFormat": "steps/sec {{task_id}} ({{mode}})",
|
| 540 |
"range": true,
|
| 541 |
"refId": "A"
|
| 542 |
},
|
| 543 |
{
|
| 544 |
"editorMode": "code",
|
| 545 |
+
"expr": "sum by (task_id, mode, action_type) (rate(antiatropos_actions_total{mode=~\"live|simulated|hybrid|aws\"}[1m]))",
|
| 546 |
"legendFormat": "actions/sec {{action_type}} ({{task_id}}, {{mode}})",
|
| 547 |
"range": true,
|
| 548 |
"refId": "B"
|
|
|
|
| 602 |
"targets": [
|
| 603 |
{
|
| 604 |
"editorMode": "code",
|
| 605 |
+
"expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"live|simulated|hybrid|aws\"}[5m]))",
|
| 606 |
"legendFormat": "executor errors {{error_code}} ({{mode}})",
|
| 607 |
"range": true,
|
| 608 |
"refId": "A"
|
| 609 |
},
|
| 610 |
{
|
| 611 |
"editorMode": "code",
|
| 612 |
+
"expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"live|simulated|hybrid|aws\"}[5m])) by (le, mode))",
|
| 613 |
"legendFormat": "p95 executor latency {{mode}}",
|
| 614 |
"range": true,
|
| 615 |
"refId": "B"
|
|
|
|
| 640 |
"version": 2,
|
| 641 |
"weekStart": ""
|
| 642 |
}
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
deploy/grafana/provisioning/dashboards/json/antiatropos-workloads.json
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": {"type": "grafana", "uid": "-- Grafana --"},
|
| 7 |
+
"enable": true,
|
| 8 |
+
"hide": true,
|
| 9 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 10 |
+
"name": "Annotations & Alerts",
|
| 11 |
+
"type": "dashboard"
|
| 12 |
+
}
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"editable": true,
|
| 16 |
+
"fiscalYearStartMonth": 0,
|
| 17 |
+
"graphTooltip": 1,
|
| 18 |
+
"id": null,
|
| 19 |
+
"links": [],
|
| 20 |
+
"liveNow": false,
|
| 21 |
+
"panels": [
|
| 22 |
+
{
|
| 23 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 24 |
+
"fieldConfig": {
|
| 25 |
+
"defaults": {
|
| 26 |
+
"color": {"mode": "thresholds"},
|
| 27 |
+
"decimals": 1,
|
| 28 |
+
"mappings": [],
|
| 29 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 100}, {"color": "red", "value": 500}]},
|
| 30 |
+
"unit": "reqps"
|
| 31 |
+
},
|
| 32 |
+
"overrides": []
|
| 33 |
+
},
|
| 34 |
+
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
|
| 35 |
+
"id": 1,
|
| 36 |
+
"options": {
|
| 37 |
+
"colorMode": "value",
|
| 38 |
+
"graphMode": "area",
|
| 39 |
+
"justifyMode": "auto",
|
| 40 |
+
"orientation": "auto",
|
| 41 |
+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
| 42 |
+
"textMode": "auto"
|
| 43 |
+
},
|
| 44 |
+
"targets": [{"expr": "sum(rate(http_requests_total[1m]))", "refId": "A"}],
|
| 45 |
+
"title": "Total Request Rate",
|
| 46 |
+
"type": "stat"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 50 |
+
"fieldConfig": {
|
| 51 |
+
"defaults": {
|
| 52 |
+
"color": {"mode": "thresholds"},
|
| 53 |
+
"decimals": 3,
|
| 54 |
+
"mappings": [],
|
| 55 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.01}, {"color": "red", "value": 0.05}]},
|
| 56 |
+
"unit": "percentunit"
|
| 57 |
+
},
|
| 58 |
+
"overrides": []
|
| 59 |
+
},
|
| 60 |
+
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
|
| 61 |
+
"id": 2,
|
| 62 |
+
"options": {
|
| 63 |
+
"colorMode": "value",
|
| 64 |
+
"graphMode": "area",
|
| 65 |
+
"justifyMode": "auto",
|
| 66 |
+
"orientation": "auto",
|
| 67 |
+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
| 68 |
+
"textMode": "auto"
|
| 69 |
+
},
|
| 70 |
+
"targets": [{"expr": "sum(rate(http_requests_total{status=~\"5..\"}[1m])) / clamp_min(sum(rate(http_requests_total[1m])), 1)", "refId": "A"}],
|
| 71 |
+
"title": "Global Error Rate",
|
| 72 |
+
"type": "stat"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 76 |
+
"fieldConfig": {
|
| 77 |
+
"defaults": {
|
| 78 |
+
"color": {"mode": "thresholds"},
|
| 79 |
+
"decimals": 1,
|
| 80 |
+
"mappings": [],
|
| 81 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 50}, {"color": "red", "value": 100}]},
|
| 82 |
+
"unit": "none"
|
| 83 |
+
},
|
| 84 |
+
"overrides": []
|
| 85 |
+
},
|
| 86 |
+
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
| 87 |
+
"id": 3,
|
| 88 |
+
"options": {
|
| 89 |
+
"colorMode": "value",
|
| 90 |
+
"graphMode": "area",
|
| 91 |
+
"justifyMode": "auto",
|
| 92 |
+
"orientation": "auto",
|
| 93 |
+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
| 94 |
+
"textMode": "auto"
|
| 95 |
+
},
|
| 96 |
+
"targets": [{"expr": "sum(queue_depth)", "refId": "A"}],
|
| 97 |
+
"title": "Total Queue Backlog",
|
| 98 |
+
"type": "stat"
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 102 |
+
"fieldConfig": {
|
| 103 |
+
"defaults": {
|
| 104 |
+
"color": {"mode": "thresholds"},
|
| 105 |
+
"decimals": 1,
|
| 106 |
+
"mappings": [],
|
| 107 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 100}, {"color": "red", "value": 200}]},
|
| 108 |
+
"unit": "ms"
|
| 109 |
+
},
|
| 110 |
+
"overrides": []
|
| 111 |
+
},
|
| 112 |
+
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
|
| 113 |
+
"id": 4,
|
| 114 |
+
"options": {
|
| 115 |
+
"colorMode": "value",
|
| 116 |
+
"graphMode": "area",
|
| 117 |
+
"justifyMode": "auto",
|
| 118 |
+
"orientation": "auto",
|
| 119 |
+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
| 120 |
+
"textMode": "auto"
|
| 121 |
+
},
|
| 122 |
+
"targets": [{"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1m])) by (le)) * 1000", "refId": "A"}],
|
| 123 |
+
"title": "Cluster p95 Latency",
|
| 124 |
+
"type": "stat"
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 128 |
+
"fieldConfig": {
|
| 129 |
+
"defaults": {
|
| 130 |
+
"color": {"mode": "palette-classic"},
|
| 131 |
+
"custom": {
|
| 132 |
+
"axisBorderShow": false,
|
| 133 |
+
"axisCenteredZero": false,
|
| 134 |
+
"axisColorMode": "text",
|
| 135 |
+
"axisLabel": "",
|
| 136 |
+
"axisPlacement": "auto",
|
| 137 |
+
"barAlignment": 0,
|
| 138 |
+
"drawStyle": "line",
|
| 139 |
+
"fillOpacity": 10,
|
| 140 |
+
"gradientMode": "none",
|
| 141 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 142 |
+
"insertNulls": false,
|
| 143 |
+
"lineInterpolation": "linear",
|
| 144 |
+
"lineWidth": 2,
|
| 145 |
+
"pointSize": 3,
|
| 146 |
+
"scaleDistribution": {"type": "linear"},
|
| 147 |
+
"showPoints": "auto",
|
| 148 |
+
"spanNulls": false,
|
| 149 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 150 |
+
"thresholdsStyle": {"mode": "off"}
|
| 151 |
+
},
|
| 152 |
+
"mappings": [],
|
| 153 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 154 |
+
"unit": "reqps"
|
| 155 |
+
},
|
| 156 |
+
"overrides": []
|
| 157 |
+
},
|
| 158 |
+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
|
| 159 |
+
"id": 10,
|
| 160 |
+
"options": {
|
| 161 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 162 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 163 |
+
},
|
| 164 |
+
"targets": [
|
| 165 |
+
{
|
| 166 |
+
"expr": "sum(rate(http_requests_total[1m])) by (node_id)",
|
| 167 |
+
"legendFormat": "{{node_id}}",
|
| 168 |
+
"refId": "A"
|
| 169 |
+
}
|
| 170 |
+
],
|
| 171 |
+
"title": "Request Rate by Node",
|
| 172 |
+
"type": "timeseries"
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 176 |
+
"fieldConfig": {
|
| 177 |
+
"defaults": {
|
| 178 |
+
"color": {"mode": "palette-classic"},
|
| 179 |
+
"custom": {
|
| 180 |
+
"axisBorderShow": false,
|
| 181 |
+
"axisCenteredZero": false,
|
| 182 |
+
"axisColorMode": "text",
|
| 183 |
+
"axisLabel": "",
|
| 184 |
+
"axisPlacement": "auto",
|
| 185 |
+
"barAlignment": 0,
|
| 186 |
+
"drawStyle": "line",
|
| 187 |
+
"fillOpacity": 10,
|
| 188 |
+
"gradientMode": "none",
|
| 189 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 190 |
+
"insertNulls": false,
|
| 191 |
+
"lineInterpolation": "linear",
|
| 192 |
+
"lineWidth": 2,
|
| 193 |
+
"pointSize": 3,
|
| 194 |
+
"scaleDistribution": {"type": "linear"},
|
| 195 |
+
"showPoints": "auto",
|
| 196 |
+
"spanNulls": false,
|
| 197 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 198 |
+
"thresholdsStyle": {"mode": "off"}
|
| 199 |
+
},
|
| 200 |
+
"mappings": [],
|
| 201 |
+
"min": 0,
|
| 202 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 203 |
+
"unit": "percentunit"
|
| 204 |
+
},
|
| 205 |
+
"overrides": []
|
| 206 |
+
},
|
| 207 |
+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
|
| 208 |
+
"id": 11,
|
| 209 |
+
"options": {
|
| 210 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 211 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 212 |
+
},
|
| 213 |
+
"targets": [
|
| 214 |
+
{
|
| 215 |
+
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[1m])) by (node_id) / clamp_min(sum(rate(http_requests_total[1m])) by (node_id), 1)",
|
| 216 |
+
"legendFormat": "{{node_id}}",
|
| 217 |
+
"refId": "A"
|
| 218 |
+
}
|
| 219 |
+
],
|
| 220 |
+
"title": "Error Rate by Node",
|
| 221 |
+
"type": "timeseries"
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 225 |
+
"fieldConfig": {
|
| 226 |
+
"defaults": {
|
| 227 |
+
"color": {"mode": "palette-classic"},
|
| 228 |
+
"custom": {
|
| 229 |
+
"axisBorderShow": false,
|
| 230 |
+
"axisCenteredZero": false,
|
| 231 |
+
"axisColorMode": "text",
|
| 232 |
+
"axisLabel": "",
|
| 233 |
+
"axisPlacement": "auto",
|
| 234 |
+
"barAlignment": 0,
|
| 235 |
+
"drawStyle": "line",
|
| 236 |
+
"fillOpacity": 10,
|
| 237 |
+
"gradientMode": "none",
|
| 238 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 239 |
+
"insertNulls": false,
|
| 240 |
+
"lineInterpolation": "linear",
|
| 241 |
+
"lineWidth": 2,
|
| 242 |
+
"pointSize": 3,
|
| 243 |
+
"scaleDistribution": {"type": "linear"},
|
| 244 |
+
"showPoints": "auto",
|
| 245 |
+
"spanNulls": false,
|
| 246 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 247 |
+
"thresholdsStyle": {"mode": "off"}
|
| 248 |
+
},
|
| 249 |
+
"mappings": [],
|
| 250 |
+
"min": 0,
|
| 251 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 252 |
+
"unit": "none"
|
| 253 |
+
},
|
| 254 |
+
"overrides": []
|
| 255 |
+
},
|
| 256 |
+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
|
| 257 |
+
"id": 12,
|
| 258 |
+
"options": {
|
| 259 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 260 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 261 |
+
},
|
| 262 |
+
"targets": [
|
| 263 |
+
{
|
| 264 |
+
"expr": "avg(queue_depth) by (node_id)",
|
| 265 |
+
"legendFormat": "{{node_id}}",
|
| 266 |
+
"refId": "A"
|
| 267 |
+
}
|
| 268 |
+
],
|
| 269 |
+
"title": "Queue Depth by Node",
|
| 270 |
+
"type": "timeseries"
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 274 |
+
"fieldConfig": {
|
| 275 |
+
"defaults": {
|
| 276 |
+
"color": {"mode": "palette-classic"},
|
| 277 |
+
"custom": {
|
| 278 |
+
"axisBorderShow": false,
|
| 279 |
+
"axisCenteredZero": false,
|
| 280 |
+
"axisColorMode": "text",
|
| 281 |
+
"axisLabel": "",
|
| 282 |
+
"axisPlacement": "auto",
|
| 283 |
+
"barAlignment": 0,
|
| 284 |
+
"drawStyle": "line",
|
| 285 |
+
"fillOpacity": 10,
|
| 286 |
+
"gradientMode": "none",
|
| 287 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 288 |
+
"insertNulls": false,
|
| 289 |
+
"lineInterpolation": "linear",
|
| 290 |
+
"lineWidth": 2,
|
| 291 |
+
"pointSize": 3,
|
| 292 |
+
"scaleDistribution": {"type": "linear"},
|
| 293 |
+
"showPoints": "auto",
|
| 294 |
+
"spanNulls": false,
|
| 295 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 296 |
+
"thresholdsStyle": {"mode": "off"}
|
| 297 |
+
},
|
| 298 |
+
"mappings": [],
|
| 299 |
+
"min": 0,
|
| 300 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 301 |
+
"unit": "ms"
|
| 302 |
+
},
|
| 303 |
+
"overrides": []
|
| 304 |
+
},
|
| 305 |
+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
|
| 306 |
+
"id": 13,
|
| 307 |
+
"options": {
|
| 308 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 309 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 310 |
+
},
|
| 311 |
+
"targets": [
|
| 312 |
+
{
|
| 313 |
+
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (node_id, le)) * 1000",
|
| 314 |
+
"legendFormat": "{{node_id}}",
|
| 315 |
+
"refId": "A"
|
| 316 |
+
}
|
| 317 |
+
],
|
| 318 |
+
"title": "Latency p95 by Node",
|
| 319 |
+
"type": "timeseries"
|
| 320 |
+
},
|
| 321 |
+
{
|
| 322 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 323 |
+
"fieldConfig": {
|
| 324 |
+
"defaults": {
|
| 325 |
+
"color": {"mode": "palette-classic"},
|
| 326 |
+
"custom": {
|
| 327 |
+
"axisBorderShow": false,
|
| 328 |
+
"axisCenteredZero": false,
|
| 329 |
+
"axisColorMode": "text",
|
| 330 |
+
"axisLabel": "",
|
| 331 |
+
"axisPlacement": "auto",
|
| 332 |
+
"barAlignment": 0,
|
| 333 |
+
"drawStyle": "line",
|
| 334 |
+
"fillOpacity": 10,
|
| 335 |
+
"gradientMode": "none",
|
| 336 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 337 |
+
"insertNulls": false,
|
| 338 |
+
"lineInterpolation": "linear",
|
| 339 |
+
"lineWidth": 2,
|
| 340 |
+
"pointSize": 3,
|
| 341 |
+
"scaleDistribution": {"type": "linear"},
|
| 342 |
+
"showPoints": "auto",
|
| 343 |
+
"spanNulls": false,
|
| 344 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 345 |
+
"thresholdsStyle": {"mode": "off"}
|
| 346 |
+
},
|
| 347 |
+
"mappings": [],
|
| 348 |
+
"min": 0,
|
| 349 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 350 |
+
"unit": "percentunit"
|
| 351 |
+
},
|
| 352 |
+
"overrides": []
|
| 353 |
+
},
|
| 354 |
+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
|
| 355 |
+
"id": 14,
|
| 356 |
+
"options": {
|
| 357 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 358 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 359 |
+
},
|
| 360 |
+
"targets": [
|
| 361 |
+
{
|
| 362 |
+
"expr": "avg(rate(container_cpu_usage_seconds_total[1m])) by (node_id)",
|
| 363 |
+
"legendFormat": "{{node_id}}",
|
| 364 |
+
"refId": "A"
|
| 365 |
+
}
|
| 366 |
+
],
|
| 367 |
+
"title": "CPU by Node",
|
| 368 |
+
"type": "timeseries"
|
| 369 |
+
},
|
| 370 |
+
{
|
| 371 |
+
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
| 372 |
+
"fieldConfig": {
|
| 373 |
+
"defaults": {
|
| 374 |
+
"color": {"mode": "palette-classic"},
|
| 375 |
+
"custom": {
|
| 376 |
+
"axisBorderShow": false,
|
| 377 |
+
"axisCenteredZero": false,
|
| 378 |
+
"axisColorMode": "text",
|
| 379 |
+
"axisLabel": "",
|
| 380 |
+
"axisPlacement": "auto",
|
| 381 |
+
"barAlignment": 0,
|
| 382 |
+
"drawStyle": "line",
|
| 383 |
+
"fillOpacity": 10,
|
| 384 |
+
"gradientMode": "none",
|
| 385 |
+
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
| 386 |
+
"insertNulls": false,
|
| 387 |
+
"lineInterpolation": "linear",
|
| 388 |
+
"lineWidth": 2,
|
| 389 |
+
"pointSize": 3,
|
| 390 |
+
"scaleDistribution": {"type": "linear"},
|
| 391 |
+
"showPoints": "auto",
|
| 392 |
+
"spanNulls": false,
|
| 393 |
+
"stacking": {"group": "A", "mode": "none"},
|
| 394 |
+
"thresholdsStyle": {"mode": "off"}
|
| 395 |
+
},
|
| 396 |
+
"mappings": [],
|
| 397 |
+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
| 398 |
+
"unit": "reqps"
|
| 399 |
+
},
|
| 400 |
+
"overrides": []
|
| 401 |
+
},
|
| 402 |
+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
|
| 403 |
+
"id": 15,
|
| 404 |
+
"options": {
|
| 405 |
+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
|
| 406 |
+
"tooltip": {"mode": "multi", "sort": "none"}
|
| 407 |
+
},
|
| 408 |
+
"targets": [
|
| 409 |
+
{
|
| 410 |
+
"expr": "sum(rate(http_requests_total{status=\"200\"}[1m])) by (node_id)",
|
| 411 |
+
"legendFormat": "200 {{node_id}}",
|
| 412 |
+
"refId": "A"
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"expr": "sum(rate(http_requests_total{status=\"500\"}[1m])) by (node_id)",
|
| 416 |
+
"legendFormat": "500 {{node_id}}",
|
| 417 |
+
"refId": "B"
|
| 418 |
+
}
|
| 419 |
+
],
|
| 420 |
+
"title": "Requests by Status Code",
|
| 421 |
+
"type": "timeseries"
|
| 422 |
+
}
|
| 423 |
+
],
|
| 424 |
+
"refresh": "5s",
|
| 425 |
+
"schemaVersion": 41,
|
| 426 |
+
"style": "dark",
|
| 427 |
+
"tags": ["antiatropos", "sre", "workload"],
|
| 428 |
+
"templating": {"list": []},
|
| 429 |
+
"time": {"from": "now-15m", "to": "now"},
|
| 430 |
+
"timepicker": {},
|
| 431 |
+
"timezone": "browser",
|
| 432 |
+
"title": "AntiAtropos Workloads",
|
| 433 |
+
"uid": "antiatropos-workloads",
|
| 434 |
+
"version": 1,
|
| 435 |
+
"weekStart": ""
|
| 436 |
+
}
|
deploy/grafana/provisioning/datasources/prometheus.yaml
CHANGED
|
@@ -5,6 +5,6 @@ datasources:
|
|
| 5 |
uid: PBFA97CFB590B2093
|
| 6 |
type: prometheus
|
| 7 |
access: proxy
|
| 8 |
-
url: http://127.0.0.1:9090
|
| 9 |
isDefault: true
|
| 10 |
-
editable:
|
|
|
|
| 5 |
uid: PBFA97CFB590B2093
|
| 6 |
type: prometheus
|
| 7 |
access: proxy
|
| 8 |
+
url: http://127.0.0.1:9090
|
| 9 |
isDefault: true
|
| 10 |
+
editable: true
|
deploy/index.html
CHANGED
|
@@ -1,473 +1,473 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="UTF-8">
|
| 5 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<title>AntiAtropos Control Console</title>
|
| 7 |
-
<style>
|
| 8 |
-
:root {
|
| 9 |
-
--bg: #0b1220;
|
| 10 |
-
--bg-soft: #101a2d;
|
| 11 |
-
--panel: #111d33;
|
| 12 |
-
--line: #2b3d5d;
|
| 13 |
-
--text: #e6edf8;
|
| 14 |
-
--muted: #9bb0cf;
|
| 15 |
-
--accent: #ff5a3d;
|
| 16 |
-
--accent-strong: #e14830;
|
| 17 |
-
--ok: #3dcf8e;
|
| 18 |
-
--bad: #ff6f7f;
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
-
* {
|
| 22 |
-
box-sizing: border-box;
|
| 23 |
-
}
|
| 24 |
-
|
| 25 |
-
body {
|
| 26 |
-
margin: 0;
|
| 27 |
-
padding: 24px;
|
| 28 |
-
background:
|
| 29 |
-
radial-gradient(circle at top right, rgba(255, 90, 61, 0.18), transparent 40%),
|
| 30 |
-
radial-gradient(circle at top left, rgba(74, 140, 255, 0.18), transparent 35%),
|
| 31 |
-
var(--bg);
|
| 32 |
-
color: var(--text);
|
| 33 |
-
font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
.shell {
|
| 37 |
-
max-width: 1440px;
|
| 38 |
-
margin: 0 auto;
|
| 39 |
-
display: grid;
|
| 40 |
-
gap: 18px;
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
.card {
|
| 44 |
-
background: linear-gradient(180deg, rgba(17, 29, 51, 0.88), rgba(15, 25, 44, 0.92));
|
| 45 |
-
border: 1px solid var(--line);
|
| 46 |
-
border-radius: 16px;
|
| 47 |
-
}
|
| 48 |
-
|
| 49 |
-
.header {
|
| 50 |
-
padding: 20px 22px;
|
| 51 |
-
display: flex;
|
| 52 |
-
justify-content: space-between;
|
| 53 |
-
align-items: center;
|
| 54 |
-
gap: 16px;
|
| 55 |
-
flex-wrap: wrap;
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
.title h1 {
|
| 59 |
-
margin: 0;
|
| 60 |
-
font-size: 1.5rem;
|
| 61 |
-
letter-spacing: 0.01em;
|
| 62 |
-
}
|
| 63 |
-
|
| 64 |
-
.title p {
|
| 65 |
-
margin: 4px 0 0;
|
| 66 |
-
color: var(--muted);
|
| 67 |
-
font-size: 0.95rem;
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
.links {
|
| 71 |
-
display: flex;
|
| 72 |
-
gap: 10px;
|
| 73 |
-
flex-wrap: wrap;
|
| 74 |
-
}
|
| 75 |
-
|
| 76 |
-
.link-btn {
|
| 77 |
-
display: inline-flex;
|
| 78 |
-
align-items: center;
|
| 79 |
-
justify-content: center;
|
| 80 |
-
height: 38px;
|
| 81 |
-
padding: 0 14px;
|
| 82 |
-
border-radius: 10px;
|
| 83 |
-
border: 1px solid var(--line);
|
| 84 |
-
color: var(--text);
|
| 85 |
-
text-decoration: none;
|
| 86 |
-
background: var(--bg-soft);
|
| 87 |
-
font-size: 0.9rem;
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
.layout {
|
| 91 |
-
display: grid;
|
| 92 |
-
grid-template-columns: 1fr;
|
| 93 |
-
gap: 18px;
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
.controls {
|
| 97 |
-
padding: 16px;
|
| 98 |
-
display: grid;
|
| 99 |
-
grid-template-columns: 1fr;
|
| 100 |
-
gap: 14px;
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
.controls-grid {
|
| 104 |
-
display: grid;
|
| 105 |
-
grid-template-columns: repeat(4, minmax(0, 1fr));
|
| 106 |
-
gap: 12px;
|
| 107 |
-
align-items: end;
|
| 108 |
-
}
|
| 109 |
-
|
| 110 |
-
.field label {
|
| 111 |
-
display: block;
|
| 112 |
-
color: var(--muted);
|
| 113 |
-
font-size: 0.78rem;
|
| 114 |
-
font-weight: 600;
|
| 115 |
-
letter-spacing: 0.04em;
|
| 116 |
-
margin-bottom: 6px;
|
| 117 |
-
text-transform: uppercase;
|
| 118 |
-
}
|
| 119 |
-
|
| 120 |
-
.field select,
|
| 121 |
-
.field input {
|
| 122 |
-
width: 100%;
|
| 123 |
-
height: 44px;
|
| 124 |
-
border-radius: 10px;
|
| 125 |
-
border: 1px solid var(--line);
|
| 126 |
-
background: #0c162a;
|
| 127 |
-
color: var(--text);
|
| 128 |
-
padding: 0 12px;
|
| 129 |
-
font-size: 0.95rem;
|
| 130 |
-
}
|
| 131 |
-
|
| 132 |
-
.actions {
|
| 133 |
-
display: grid;
|
| 134 |
-
grid-template-columns: 180px 1fr;
|
| 135 |
-
gap: 10px;
|
| 136 |
-
}
|
| 137 |
-
|
| 138 |
-
.btn {
|
| 139 |
-
border: 1px solid var(--line);
|
| 140 |
-
border-radius: 10px;
|
| 141 |
-
height: 44px;
|
| 142 |
-
cursor: pointer;
|
| 143 |
-
font-weight: 600;
|
| 144 |
-
font-size: 0.95rem;
|
| 145 |
-
color: var(--text);
|
| 146 |
-
background: var(--bg-soft);
|
| 147 |
-
}
|
| 148 |
-
|
| 149 |
-
.btn-primary {
|
| 150 |
-
background: linear-gradient(135deg, var(--accent), var(--accent-strong));
|
| 151 |
-
border-color: transparent;
|
| 152 |
-
color: #fff;
|
| 153 |
-
}
|
| 154 |
-
|
| 155 |
-
.metrics {
|
| 156 |
-
padding: 16px;
|
| 157 |
-
display: grid;
|
| 158 |
-
grid-template-columns: repeat(5, minmax(0, 1fr));
|
| 159 |
-
gap: 10px;
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
.metric {
|
| 163 |
-
background: #0d172a;
|
| 164 |
-
border: 1px solid var(--line);
|
| 165 |
-
border-radius: 12px;
|
| 166 |
-
padding: 12px;
|
| 167 |
-
min-height: 86px;
|
| 168 |
-
}
|
| 169 |
-
|
| 170 |
-
.metric .name {
|
| 171 |
-
color: var(--muted);
|
| 172 |
-
font-size: 0.78rem;
|
| 173 |
-
text-transform: uppercase;
|
| 174 |
-
letter-spacing: 0.05em;
|
| 175 |
-
margin-bottom: 8px;
|
| 176 |
-
}
|
| 177 |
-
|
| 178 |
-
.metric .value {
|
| 179 |
-
font-family: Consolas, "SFMono-Regular", Menlo, monospace;
|
| 180 |
-
font-size: 1.18rem;
|
| 181 |
-
font-weight: 700;
|
| 182 |
-
color: var(--text);
|
| 183 |
-
}
|
| 184 |
-
|
| 185 |
-
.metric .value.good {
|
| 186 |
-
color: var(--ok);
|
| 187 |
-
}
|
| 188 |
-
|
| 189 |
-
.metric .value.bad {
|
| 190 |
-
color: var(--bad);
|
| 191 |
-
}
|
| 192 |
-
|
| 193 |
-
.monitor {
|
| 194 |
-
padding: 16px;
|
| 195 |
-
display: grid;
|
| 196 |
-
gap: 10px;
|
| 197 |
-
}
|
| 198 |
-
|
| 199 |
-
.monitor-head {
|
| 200 |
-
display: flex;
|
| 201 |
-
justify-content: space-between;
|
| 202 |
-
align-items: center;
|
| 203 |
-
gap: 12px;
|
| 204 |
-
flex-wrap: wrap;
|
| 205 |
-
}
|
| 206 |
-
|
| 207 |
-
.monitor-head h2 {
|
| 208 |
-
margin: 0;
|
| 209 |
-
font-size: 1.05rem;
|
| 210 |
-
font-weight: 700;
|
| 211 |
-
}
|
| 212 |
-
|
| 213 |
-
.monitor-head p {
|
| 214 |
-
margin: 0;
|
| 215 |
-
color: var(--muted);
|
| 216 |
-
font-size: 0.85rem;
|
| 217 |
-
}
|
| 218 |
-
|
| 219 |
-
.graph-wrap {
|
| 220 |
-
height: 920px;
|
| 221 |
-
border: 1px solid var(--line);
|
| 222 |
-
border-radius: 12px;
|
| 223 |
-
overflow: hidden;
|
| 224 |
-
background: #0a1324;
|
| 225 |
-
}
|
| 226 |
-
|
| 227 |
-
iframe {
|
| 228 |
-
width: 100%;
|
| 229 |
-
height: 100%;
|
| 230 |
-
border: 0;
|
| 231 |
-
}
|
| 232 |
-
|
| 233 |
-
.logs {
|
| 234 |
-
padding: 16px;
|
| 235 |
-
}
|
| 236 |
-
|
| 237 |
-
.logs h3 {
|
| 238 |
-
margin: 0 0 10px;
|
| 239 |
-
font-size: 0.9rem;
|
| 240 |
-
color: var(--muted);
|
| 241 |
-
text-transform: uppercase;
|
| 242 |
-
letter-spacing: 0.05em;
|
| 243 |
-
}
|
| 244 |
-
|
| 245 |
-
#terminal {
|
| 246 |
-
background: #091121;
|
| 247 |
-
border: 1px solid var(--line);
|
| 248 |
-
border-radius: 10px;
|
| 249 |
-
height: 160px;
|
| 250 |
-
overflow-y: auto;
|
| 251 |
-
padding: 10px;
|
| 252 |
-
font-family: Consolas, "SFMono-Regular", Menlo, monospace;
|
| 253 |
-
font-size: 0.83rem;
|
| 254 |
-
color: #c9d6ed;
|
| 255 |
-
}
|
| 256 |
-
|
| 257 |
-
.log-line {
|
| 258 |
-
padding: 2px 0;
|
| 259 |
-
border-bottom: 1px solid rgba(155, 176, 207, 0.08);
|
| 260 |
-
}
|
| 261 |
-
|
| 262 |
-
.log-time {
|
| 263 |
-
color: #7084a8;
|
| 264 |
-
margin-right: 8px;
|
| 265 |
-
font-size: 0.72rem;
|
| 266 |
-
}
|
| 267 |
-
|
| 268 |
-
@media (max-width: 1120px) {
|
| 269 |
-
.controls-grid {
|
| 270 |
-
grid-template-columns: 1fr 1fr;
|
| 271 |
-
}
|
| 272 |
-
|
| 273 |
-
.actions {
|
| 274 |
-
grid-template-columns: 1fr;
|
| 275 |
-
}
|
| 276 |
-
|
| 277 |
-
.metrics {
|
| 278 |
-
grid-template-columns: 1fr 1fr;
|
| 279 |
-
}
|
| 280 |
-
}
|
| 281 |
-
|
| 282 |
-
@media (max-width: 680px) {
|
| 283 |
-
body {
|
| 284 |
-
padding: 12px;
|
| 285 |
-
}
|
| 286 |
-
|
| 287 |
-
.controls-grid,
|
| 288 |
-
.metrics {
|
| 289 |
-
grid-template-columns: 1fr;
|
| 290 |
-
}
|
| 291 |
-
|
| 292 |
-
.graph-wrap {
|
| 293 |
-
height: 760px;
|
| 294 |
-
}
|
| 295 |
-
}
|
| 296 |
-
</style>
|
| 297 |
-
</head>
|
| 298 |
-
<body>
|
| 299 |
-
<div class="shell">
|
| 300 |
-
<header class="card header">
|
| 301 |
-
<div class="title">
|
| 302 |
-
<h1>AntiAtropos SRE Control Console</h1>
|
| 303 |
-
<p>Simulated environment with direct observability through Prometheus and Grafana</p>
|
| 304 |
-
</div>
|
| 305 |
-
<div class="links">
|
| 306 |
-
<a class="link-btn" href="/docs" target="_blank">API Docs</a>
|
| 307 |
-
<a class="link-btn" href="/prometheus/" target="_blank">Open Prometheus</a>
|
| 308 |
-
<a class="link-btn" href="/grafana/" target="_blank">Open Grafana</a>
|
| 309 |
-
</div>
|
| 310 |
-
</header>
|
| 311 |
-
|
| 312 |
-
<main class="layout">
|
| 313 |
-
<section class="card controls">
|
| 314 |
-
<div class="controls-grid">
|
| 315 |
-
<div class="field">
|
| 316 |
-
<label for="action-type">Action Type</label>
|
| 317 |
-
<select id="action-type">
|
| 318 |
-
<option value="NO_OP">NO_OP</option>
|
| 319 |
-
<option value="SCALE_UP">SCALE_UP</option>
|
| 320 |
-
<option value="SCALE_DOWN">SCALE_DOWN</option>
|
| 321 |
-
<option value="REROUTE_TRAFFIC">REROUTE_TRAFFIC</option>
|
| 322 |
-
<option value="SHED_LOAD">SHED_LOAD</option>
|
| 323 |
-
</select>
|
| 324 |
-
</div>
|
| 325 |
-
<div class="field">
|
| 326 |
-
<label for="node-id">Target Node</label>
|
| 327 |
-
<select id="node-id">
|
| 328 |
-
<option value="node-0">node-0 (VIP)</option>
|
| 329 |
-
<option value="node-1">node-1</option>
|
| 330 |
-
<option value="node-2">node-2</option>
|
| 331 |
-
<option value="node-3">node-3</option>
|
| 332 |
-
<option value="node-4">node-4</option>
|
| 333 |
-
</select>
|
| 334 |
-
</div>
|
| 335 |
-
<div class="field">
|
| 336 |
-
<label for="parameter">Parameter</label>
|
| 337 |
-
<input id="parameter" type="number" step="0.1" value="0.0">
|
| 338 |
-
</div>
|
| 339 |
-
<div class="actions">
|
| 340 |
-
<button class="btn btn-primary" onclick="resetEnv()">Reset Episode</button>
|
| 341 |
-
<button class="btn" onclick="stepEnv()">Execute Step</button>
|
| 342 |
-
</div>
|
| 343 |
-
</div>
|
| 344 |
-
</section>
|
| 345 |
-
|
| 346 |
-
<section class="card metrics">
|
| 347 |
-
<div class="metric">
|
| 348 |
-
<div class="name">Cluster ID</div>
|
| 349 |
-
<div id="cluster-id" class="value">---</div>
|
| 350 |
-
</div>
|
| 351 |
-
<div class="metric">
|
| 352 |
-
<div class="name">Reward</div>
|
| 353 |
-
<div id="last-reward" class="value">0.0000</div>
|
| 354 |
-
</div>
|
| 355 |
-
<div class="metric">
|
| 356 |
-
<div class="name">Lyapunov Energy</div>
|
| 357 |
-
<div id="lyapunov-val" class="value">0.0000</div>
|
| 358 |
-
</div>
|
| 359 |
-
<div class="metric">
|
| 360 |
-
<div class="name">Mode</div>
|
| 361 |
-
<div id="mode-val" class="value">simulated</div>
|
| 362 |
-
</div>
|
| 363 |
-
<div class="metric">
|
| 364 |
-
<div class="name">Step</div>
|
| 365 |
-
<div id="step-val" class="value">0</div>
|
| 366 |
-
</div>
|
| 367 |
-
</section>
|
| 368 |
-
|
| 369 |
-
<section class="card monitor">
|
| 370 |
-
<div class="monitor-head">
|
| 371 |
-
<h2>Required Graphs</h2>
|
| 372 |
-
<p>Raw metrics source: Prometheus. Curated dashboard: Grafana.</p>
|
| 373 |
-
</div>
|
| 374 |
-
<div class="graph-wrap">
|
| 375 |
-
<iframe
|
| 376 |
-
id="grafana-iframe"
|
| 377 |
-
src="/grafana/d/antiatropos-overview/antiatropos-overview?kiosk&theme=dark&refresh=5s&from=now-30m&to=now">
|
| 378 |
-
</iframe>
|
| 379 |
-
</div>
|
| 380 |
-
</section>
|
| 381 |
-
|
| 382 |
-
<section class="card logs">
|
| 383 |
-
<h3>System Logs</h3>
|
| 384 |
-
<div id="terminal">
|
| 385 |
-
<div class="log-line"><span class="log-time">[init]</span>Waiting for interaction.</div>
|
| 386 |
-
</div>
|
| 387 |
-
</section>
|
| 388 |
-
</main>
|
| 389 |
-
</div>
|
| 390 |
-
|
| 391 |
-
<script>
|
| 392 |
-
const terminal = document.getElementById("terminal");
|
| 393 |
-
|
| 394 |
-
function log(message, type = "info") {
|
| 395 |
-
const time = new Date().toLocaleTimeString([], {
|
| 396 |
-
hour12: false,
|
| 397 |
-
hour: "2-digit",
|
| 398 |
-
minute: "2-digit",
|
| 399 |
-
second: "2-digit"
|
| 400 |
-
});
|
| 401 |
-
const row = document.createElement("div");
|
| 402 |
-
row.className = "log-line";
|
| 403 |
-
const color = type === "error" ? "#ff6f7f" : type === "success" ? "#3dcf8e" : "#c9d6ed";
|
| 404 |
-
row.innerHTML = '<span class="log-time">[' + time + "]</span><span style=\"color:" + color + "\">" + message + "</span>";
|
| 405 |
-
terminal.appendChild(row);
|
| 406 |
-
terminal.scrollTop = terminal.scrollHeight;
|
| 407 |
-
}
|
| 408 |
-
|
| 409 |
-
function updateUI(data) {
|
| 410 |
-
const observation = data.observation || {};
|
| 411 |
-
const rewardNode = document.getElementById("last-reward");
|
| 412 |
-
const reward = typeof data.reward === "number" ? data.reward : 0;
|
| 413 |
-
|
| 414 |
-
document.getElementById("cluster-id").innerText = (observation.cluster_id || "---").toString().slice(0, 12);
|
| 415 |
-
document.getElementById("lyapunov-val").innerText = Number(observation.lyapunov_energy || 0).toFixed(4);
|
| 416 |
-
document.getElementById("mode-val").innerText = (observation.mode || "simulated").toString();
|
| 417 |
-
document.getElementById("step-val").innerText = String(observation.step || 0);
|
| 418 |
-
|
| 419 |
-
rewardNode.innerText = reward.toFixed(4);
|
| 420 |
-
rewardNode.className = reward < 0 ? "value bad" : "value good";
|
| 421 |
-
}
|
| 422 |
-
|
| 423 |
-
async function resetEnv() {
|
| 424 |
-
log("Resetting environment...");
|
| 425 |
-
try {
|
| 426 |
-
const response = await fetch("/reset", {
|
| 427 |
-
method: "POST",
|
| 428 |
-
headers: { "Content-Type": "application/json" },
|
| 429 |
-
body: JSON.stringify({})
|
| 430 |
-
});
|
| 431 |
-
const data = await response.json();
|
| 432 |
-
updateUI(data);
|
| 433 |
-
log("Environment reset complete.", "success");
|
| 434 |
-
} catch (err) {
|
| 435 |
-
log("Reset failed: " + err.message, "error");
|
| 436 |
-
}
|
| 437 |
-
}
|
| 438 |
-
|
| 439 |
-
async function stepEnv() {
|
| 440 |
-
const action = {
|
| 441 |
-
action_type: document.getElementById("action-type").value,
|
| 442 |
-
target_node_id: document.getElementById("node-id").value,
|
| 443 |
-
parameter: parseFloat(document.getElementById("parameter").value)
|
| 444 |
-
};
|
| 445 |
-
|
| 446 |
-
log("Dispatching " + action.action_type + " to " + action.target_node_id + " (" + action.parameter + ")");
|
| 447 |
-
|
| 448 |
-
try {
|
| 449 |
-
const response = await fetch("/step", {
|
| 450 |
-
method: "POST",
|
| 451 |
-
headers: { "Content-Type": "application/json" },
|
| 452 |
-
body: JSON.stringify({ action: action })
|
| 453 |
-
});
|
| 454 |
-
const data = await response.json();
|
| 455 |
-
|
| 456 |
-
if (data.detail) {
|
| 457 |
-
log("Invalid payload: " + JSON.stringify(data.detail), "error");
|
| 458 |
-
return;
|
| 459 |
-
}
|
| 460 |
-
|
| 461 |
-
updateUI(data);
|
| 462 |
-
log(
|
| 463 |
-
"Step complete. Reward=" + Number(data.reward || 0).toFixed(3) +
|
| 464 |
-
" Lyapunov=" + Number((data.observation || {}).lyapunov_energy || 0).toFixed(3),
|
| 465 |
-
"success"
|
| 466 |
-
);
|
| 467 |
-
} catch (err) {
|
| 468 |
-
log("Execution failed: " + err.message, "error");
|
| 469 |
-
}
|
| 470 |
-
}
|
| 471 |
-
</script>
|
| 472 |
-
</body>
|
| 473 |
-
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>AntiAtropos Control Console</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg: #0b1220;
|
| 10 |
+
--bg-soft: #101a2d;
|
| 11 |
+
--panel: #111d33;
|
| 12 |
+
--line: #2b3d5d;
|
| 13 |
+
--text: #e6edf8;
|
| 14 |
+
--muted: #9bb0cf;
|
| 15 |
+
--accent: #ff5a3d;
|
| 16 |
+
--accent-strong: #e14830;
|
| 17 |
+
--ok: #3dcf8e;
|
| 18 |
+
--bad: #ff6f7f;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
* {
|
| 22 |
+
box-sizing: border-box;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
body {
|
| 26 |
+
margin: 0;
|
| 27 |
+
padding: 24px;
|
| 28 |
+
background:
|
| 29 |
+
radial-gradient(circle at top right, rgba(255, 90, 61, 0.18), transparent 40%),
|
| 30 |
+
radial-gradient(circle at top left, rgba(74, 140, 255, 0.18), transparent 35%),
|
| 31 |
+
var(--bg);
|
| 32 |
+
color: var(--text);
|
| 33 |
+
font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.shell {
|
| 37 |
+
max-width: 1440px;
|
| 38 |
+
margin: 0 auto;
|
| 39 |
+
display: grid;
|
| 40 |
+
gap: 18px;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
.card {
|
| 44 |
+
background: linear-gradient(180deg, rgba(17, 29, 51, 0.88), rgba(15, 25, 44, 0.92));
|
| 45 |
+
border: 1px solid var(--line);
|
| 46 |
+
border-radius: 16px;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.header {
|
| 50 |
+
padding: 20px 22px;
|
| 51 |
+
display: flex;
|
| 52 |
+
justify-content: space-between;
|
| 53 |
+
align-items: center;
|
| 54 |
+
gap: 16px;
|
| 55 |
+
flex-wrap: wrap;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
.title h1 {
|
| 59 |
+
margin: 0;
|
| 60 |
+
font-size: 1.5rem;
|
| 61 |
+
letter-spacing: 0.01em;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.title p {
|
| 65 |
+
margin: 4px 0 0;
|
| 66 |
+
color: var(--muted);
|
| 67 |
+
font-size: 0.95rem;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.links {
|
| 71 |
+
display: flex;
|
| 72 |
+
gap: 10px;
|
| 73 |
+
flex-wrap: wrap;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.link-btn {
|
| 77 |
+
display: inline-flex;
|
| 78 |
+
align-items: center;
|
| 79 |
+
justify-content: center;
|
| 80 |
+
height: 38px;
|
| 81 |
+
padding: 0 14px;
|
| 82 |
+
border-radius: 10px;
|
| 83 |
+
border: 1px solid var(--line);
|
| 84 |
+
color: var(--text);
|
| 85 |
+
text-decoration: none;
|
| 86 |
+
background: var(--bg-soft);
|
| 87 |
+
font-size: 0.9rem;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.layout {
|
| 91 |
+
display: grid;
|
| 92 |
+
grid-template-columns: 1fr;
|
| 93 |
+
gap: 18px;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.controls {
|
| 97 |
+
padding: 16px;
|
| 98 |
+
display: grid;
|
| 99 |
+
grid-template-columns: 1fr;
|
| 100 |
+
gap: 14px;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.controls-grid {
|
| 104 |
+
display: grid;
|
| 105 |
+
grid-template-columns: repeat(4, minmax(0, 1fr));
|
| 106 |
+
gap: 12px;
|
| 107 |
+
align-items: end;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.field label {
|
| 111 |
+
display: block;
|
| 112 |
+
color: var(--muted);
|
| 113 |
+
font-size: 0.78rem;
|
| 114 |
+
font-weight: 600;
|
| 115 |
+
letter-spacing: 0.04em;
|
| 116 |
+
margin-bottom: 6px;
|
| 117 |
+
text-transform: uppercase;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
.field select,
|
| 121 |
+
.field input {
|
| 122 |
+
width: 100%;
|
| 123 |
+
height: 44px;
|
| 124 |
+
border-radius: 10px;
|
| 125 |
+
border: 1px solid var(--line);
|
| 126 |
+
background: #0c162a;
|
| 127 |
+
color: var(--text);
|
| 128 |
+
padding: 0 12px;
|
| 129 |
+
font-size: 0.95rem;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.actions {
|
| 133 |
+
display: grid;
|
| 134 |
+
grid-template-columns: 180px 1fr;
|
| 135 |
+
gap: 10px;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
.btn {
|
| 139 |
+
border: 1px solid var(--line);
|
| 140 |
+
border-radius: 10px;
|
| 141 |
+
height: 44px;
|
| 142 |
+
cursor: pointer;
|
| 143 |
+
font-weight: 600;
|
| 144 |
+
font-size: 0.95rem;
|
| 145 |
+
color: var(--text);
|
| 146 |
+
background: var(--bg-soft);
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.btn-primary {
|
| 150 |
+
background: linear-gradient(135deg, var(--accent), var(--accent-strong));
|
| 151 |
+
border-color: transparent;
|
| 152 |
+
color: #fff;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.metrics {
|
| 156 |
+
padding: 16px;
|
| 157 |
+
display: grid;
|
| 158 |
+
grid-template-columns: repeat(5, minmax(0, 1fr));
|
| 159 |
+
gap: 10px;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.metric {
|
| 163 |
+
background: #0d172a;
|
| 164 |
+
border: 1px solid var(--line);
|
| 165 |
+
border-radius: 12px;
|
| 166 |
+
padding: 12px;
|
| 167 |
+
min-height: 86px;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
.metric .name {
|
| 171 |
+
color: var(--muted);
|
| 172 |
+
font-size: 0.78rem;
|
| 173 |
+
text-transform: uppercase;
|
| 174 |
+
letter-spacing: 0.05em;
|
| 175 |
+
margin-bottom: 8px;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.metric .value {
|
| 179 |
+
font-family: Consolas, "SFMono-Regular", Menlo, monospace;
|
| 180 |
+
font-size: 1.18rem;
|
| 181 |
+
font-weight: 700;
|
| 182 |
+
color: var(--text);
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
.metric .value.good {
|
| 186 |
+
color: var(--ok);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.metric .value.bad {
|
| 190 |
+
color: var(--bad);
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.monitor {
|
| 194 |
+
padding: 16px;
|
| 195 |
+
display: grid;
|
| 196 |
+
gap: 10px;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.monitor-head {
|
| 200 |
+
display: flex;
|
| 201 |
+
justify-content: space-between;
|
| 202 |
+
align-items: center;
|
| 203 |
+
gap: 12px;
|
| 204 |
+
flex-wrap: wrap;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.monitor-head h2 {
|
| 208 |
+
margin: 0;
|
| 209 |
+
font-size: 1.05rem;
|
| 210 |
+
font-weight: 700;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
.monitor-head p {
|
| 214 |
+
margin: 0;
|
| 215 |
+
color: var(--muted);
|
| 216 |
+
font-size: 0.85rem;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.graph-wrap {
|
| 220 |
+
height: 920px;
|
| 221 |
+
border: 1px solid var(--line);
|
| 222 |
+
border-radius: 12px;
|
| 223 |
+
overflow: hidden;
|
| 224 |
+
background: #0a1324;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
iframe {
|
| 228 |
+
width: 100%;
|
| 229 |
+
height: 100%;
|
| 230 |
+
border: 0;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
.logs {
|
| 234 |
+
padding: 16px;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
.logs h3 {
|
| 238 |
+
margin: 0 0 10px;
|
| 239 |
+
font-size: 0.9rem;
|
| 240 |
+
color: var(--muted);
|
| 241 |
+
text-transform: uppercase;
|
| 242 |
+
letter-spacing: 0.05em;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
#terminal {
|
| 246 |
+
background: #091121;
|
| 247 |
+
border: 1px solid var(--line);
|
| 248 |
+
border-radius: 10px;
|
| 249 |
+
height: 160px;
|
| 250 |
+
overflow-y: auto;
|
| 251 |
+
padding: 10px;
|
| 252 |
+
font-family: Consolas, "SFMono-Regular", Menlo, monospace;
|
| 253 |
+
font-size: 0.83rem;
|
| 254 |
+
color: #c9d6ed;
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
.log-line {
|
| 258 |
+
padding: 2px 0;
|
| 259 |
+
border-bottom: 1px solid rgba(155, 176, 207, 0.08);
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
.log-time {
|
| 263 |
+
color: #7084a8;
|
| 264 |
+
margin-right: 8px;
|
| 265 |
+
font-size: 0.72rem;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
@media (max-width: 1120px) {
|
| 269 |
+
.controls-grid {
|
| 270 |
+
grid-template-columns: 1fr 1fr;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
.actions {
|
| 274 |
+
grid-template-columns: 1fr;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
.metrics {
|
| 278 |
+
grid-template-columns: 1fr 1fr;
|
| 279 |
+
}
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
@media (max-width: 680px) {
|
| 283 |
+
body {
|
| 284 |
+
padding: 12px;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.controls-grid,
|
| 288 |
+
.metrics {
|
| 289 |
+
grid-template-columns: 1fr;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
.graph-wrap {
|
| 293 |
+
height: 760px;
|
| 294 |
+
}
|
| 295 |
+
}
|
| 296 |
+
</style>
|
| 297 |
+
</head>
|
| 298 |
+
<body>
|
| 299 |
+
<div class="shell">
|
| 300 |
+
<header class="card header">
|
| 301 |
+
<div class="title">
|
| 302 |
+
<h1>AntiAtropos SRE Control Console</h1>
|
| 303 |
+
<p>Simulated environment with direct observability through Prometheus and Grafana</p>
|
| 304 |
+
</div>
|
| 305 |
+
<div class="links">
|
| 306 |
+
<a class="link-btn" href="/docs" target="_blank">API Docs</a>
|
| 307 |
+
<a class="link-btn" href="/prometheus/" target="_blank">Open Prometheus</a>
|
| 308 |
+
<a class="link-btn" href="/grafana/" target="_blank">Open Grafana</a>
|
| 309 |
+
</div>
|
| 310 |
+
</header>
|
| 311 |
+
|
| 312 |
+
<main class="layout">
|
| 313 |
+
<section class="card controls">
|
| 314 |
+
<div class="controls-grid">
|
| 315 |
+
<div class="field">
|
| 316 |
+
<label for="action-type">Action Type</label>
|
| 317 |
+
<select id="action-type">
|
| 318 |
+
<option value="NO_OP">NO_OP</option>
|
| 319 |
+
<option value="SCALE_UP">SCALE_UP</option>
|
| 320 |
+
<option value="SCALE_DOWN">SCALE_DOWN</option>
|
| 321 |
+
<option value="REROUTE_TRAFFIC">REROUTE_TRAFFIC</option>
|
| 322 |
+
<option value="SHED_LOAD">SHED_LOAD</option>
|
| 323 |
+
</select>
|
| 324 |
+
</div>
|
| 325 |
+
<div class="field">
|
| 326 |
+
<label for="node-id">Target Node</label>
|
| 327 |
+
<select id="node-id">
|
| 328 |
+
<option value="node-0">node-0 (VIP)</option>
|
| 329 |
+
<option value="node-1">node-1</option>
|
| 330 |
+
<option value="node-2">node-2</option>
|
| 331 |
+
<option value="node-3">node-3</option>
|
| 332 |
+
<option value="node-4">node-4</option>
|
| 333 |
+
</select>
|
| 334 |
+
</div>
|
| 335 |
+
<div class="field">
|
| 336 |
+
<label for="parameter">Parameter</label>
|
| 337 |
+
<input id="parameter" type="number" step="0.1" value="0.0">
|
| 338 |
+
</div>
|
| 339 |
+
<div class="actions">
|
| 340 |
+
<button class="btn btn-primary" onclick="resetEnv()">Reset Episode</button>
|
| 341 |
+
<button class="btn" onclick="stepEnv()">Execute Step</button>
|
| 342 |
+
</div>
|
| 343 |
+
</div>
|
| 344 |
+
</section>
|
| 345 |
+
|
| 346 |
+
<section class="card metrics">
|
| 347 |
+
<div class="metric">
|
| 348 |
+
<div class="name">Cluster ID</div>
|
| 349 |
+
<div id="cluster-id" class="value">---</div>
|
| 350 |
+
</div>
|
| 351 |
+
<div class="metric">
|
| 352 |
+
<div class="name">Reward</div>
|
| 353 |
+
<div id="last-reward" class="value">0.0000</div>
|
| 354 |
+
</div>
|
| 355 |
+
<div class="metric">
|
| 356 |
+
<div class="name">Lyapunov Energy</div>
|
| 357 |
+
<div id="lyapunov-val" class="value">0.0000</div>
|
| 358 |
+
</div>
|
| 359 |
+
<div class="metric">
|
| 360 |
+
<div class="name">Mode</div>
|
| 361 |
+
<div id="mode-val" class="value">simulated</div>
|
| 362 |
+
</div>
|
| 363 |
+
<div class="metric">
|
| 364 |
+
<div class="name">Step</div>
|
| 365 |
+
<div id="step-val" class="value">0</div>
|
| 366 |
+
</div>
|
| 367 |
+
</section>
|
| 368 |
+
|
| 369 |
+
<section class="card monitor">
|
| 370 |
+
<div class="monitor-head">
|
| 371 |
+
<h2>Required Graphs</h2>
|
| 372 |
+
<p>Raw metrics source: Prometheus. Curated dashboard: Grafana.</p>
|
| 373 |
+
</div>
|
| 374 |
+
<div class="graph-wrap">
|
| 375 |
+
<iframe
|
| 376 |
+
id="grafana-iframe"
|
| 377 |
+
src="/grafana/d/antiatropos-overview/antiatropos-overview?kiosk&theme=dark&refresh=5s&from=now-30m&to=now">
|
| 378 |
+
</iframe>
|
| 379 |
+
</div>
|
| 380 |
+
</section>
|
| 381 |
+
|
| 382 |
+
<section class="card logs">
|
| 383 |
+
<h3>System Logs</h3>
|
| 384 |
+
<div id="terminal">
|
| 385 |
+
<div class="log-line"><span class="log-time">[init]</span>Waiting for interaction.</div>
|
| 386 |
+
</div>
|
| 387 |
+
</section>
|
| 388 |
+
</main>
|
| 389 |
+
</div>
|
| 390 |
+
|
| 391 |
+
<script>
|
| 392 |
+
const terminal = document.getElementById("terminal");
|
| 393 |
+
|
| 394 |
+
/**
 * Append one timestamped line to the on-page log panel and scroll it into view.
 *
 * The row is assembled from DOM nodes and `textContent` rather than an HTML
 * string: callers pass exception messages and server-provided error details,
 * and those must be shown literally instead of being parsed as markup.
 *
 * @param {string} message Text to display; rendered verbatim, never as HTML.
 * @param {string} [type="info"] "error" and "success" select the red/green
 *     colour; any other value uses the default text colour.
 */
function log(message, type = "info") {
    const time = new Date().toLocaleTimeString([], {
        hour12: false,
        hour: "2-digit",
        minute: "2-digit",
        second: "2-digit"
    });
    const color = type === "error" ? "#ff6f7f" : type === "success" ? "#3dcf8e" : "#c9d6ed";

    const row = document.createElement("div");
    row.className = "log-line";

    const stamp = document.createElement("span");
    stamp.className = "log-time";
    stamp.textContent = "[" + time + "]";

    const text = document.createElement("span");
    text.style.color = color;
    text.textContent = String(message);

    row.appendChild(stamp);
    row.appendChild(text);
    terminal.appendChild(row);
    // Keep the newest entry visible.
    terminal.scrollTop = terminal.scrollHeight;
}
|
| 408 |
+
|
| 409 |
+
/**
 * Copy the fields of an environment response into the metric tiles.
 *
 * Reads `data.observation` (cluster_id, lyapunov_energy, mode, step) and
 * `data.reward`. Missing or falsy fields fall back to "---", 0, "simulated"
 * and 0 respectively; a non-numeric reward is shown as 0. The reward tile is
 * given the "bad" class for negative rewards and "good" otherwise.
 *
 * @param {Object} data Response object; `observation` and `reward` are optional.
 */
function updateUI(data) {
    const obs = data.observation || {};
    const rewardTile = document.getElementById("last-reward");

    let rewardValue = 0;
    if (typeof data.reward === "number") {
        rewardValue = data.reward;
    }

    // Cluster ids are truncated to 12 characters to fit the tile.
    const clusterText = (obs.cluster_id || "---").toString().slice(0, 12);
    const energyText = Number(obs.lyapunov_energy || 0).toFixed(4);
    const modeText = (obs.mode || "simulated").toString();
    const stepText = String(obs.step || 0);

    document.getElementById("cluster-id").innerText = clusterText;
    document.getElementById("lyapunov-val").innerText = energyText;
    document.getElementById("mode-val").innerText = modeText;
    document.getElementById("step-val").innerText = stepText;

    rewardTile.innerText = rewardValue.toFixed(4);
    if (rewardValue < 0) {
        rewardTile.className = "value bad";
    } else {
        rewardTile.className = "value good";
    }
}
|
| 422 |
+
|
| 423 |
+
/**
 * POST an empty JSON body to /reset and show the returned state.
 *
 * A reply that is not 2xx, or that carries a `detail` field (the same error
 * marker the step handler checks for), is reported as a failure; the metric
 * tiles are left untouched so an error payload is never rendered as if it
 * were a fresh observation.
 *
 * @returns {Promise<void>} Resolves once the UI and log have been updated;
 *     network and JSON-parsing errors are caught and logged, not rethrown.
 */
async function resetEnv() {
    log("Resetting environment...");
    try {
        const response = await fetch("/reset", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({})
        });
        const data = await response.json();

        if (data && data.detail) {
            log("Reset failed: " + JSON.stringify(data.detail), "error");
            return;
        }
        if (!response.ok) {
            log("Reset failed: HTTP " + response.status, "error");
            return;
        }

        updateUI(data);
        log("Environment reset complete.", "success");
    } catch (err) {
        log("Reset failed: " + err.message, "error");
    }
}
|
| 438 |
+
|
| 439 |
+
/**
 * Read the action form, POST it to /step as `{ action: {...} }`, and show the
 * result in the metric tiles and the log.
 *
 * The numeric parameter falls back to 0 when the field is blank or not a
 * number: `parseFloat` would yield NaN, which `JSON.stringify` serialises as
 * `null`. The value actually sent is visible in the "Dispatching" log line.
 *
 * A reply carrying a `detail` field is logged as an invalid payload; any other
 * non-2xx reply is logged with its HTTP status. Neither updates the tiles.
 *
 * @returns {Promise<void>} Resolves once the UI and log have been updated;
 *     network and JSON-parsing errors are caught and logged, not rethrown.
 */
async function stepEnv() {
    const rawParameter = parseFloat(document.getElementById("parameter").value);
    const action = {
        action_type: document.getElementById("action-type").value,
        target_node_id: document.getElementById("node-id").value,
        parameter: Number.isFinite(rawParameter) ? rawParameter : 0
    };

    log("Dispatching " + action.action_type + " to " + action.target_node_id + " (" + action.parameter + ")");

    try {
        const response = await fetch("/step", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ action: action })
        });
        const data = await response.json();

        if (data && data.detail) {
            log("Invalid payload: " + JSON.stringify(data.detail), "error");
            return;
        }
        if (!response.ok) {
            log("Execution failed: HTTP " + response.status, "error");
            return;
        }

        updateUI(data);
        log(
            "Step complete. Reward=" + Number(data.reward || 0).toFixed(3) +
            " Lyapunov=" + Number((data.observation || {}).lyapunov_energy || 0).toFixed(3),
            "success"
        );
    } catch (err) {
        log("Execution failed: " + err.message, "error");
    }
}
|
| 471 |
+
</script>
|
| 472 |
+
</body>
|
| 473 |
+
</html>
|
deploy/kind-maxpods-250.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# kind cluster definition: one cluster named "antiatropos-local" with a single
# control-plane node.
kind: Cluster
|
| 2 |
+
apiVersion: kind.x-k8s.io/v1alpha4
|
| 3 |
+
name: antiatropos-local
|
| 4 |
+
nodes:
|
| 5 |
+
- role: control-plane
|
| 6 |
+
# The patch below is a kubeadm InitConfiguration fragment that passes
# max-pods="250" to the kubelet as an extra argument.
kubeadmConfigPatches:
|
| 7 |
+
- |
|
| 8 |
+
kind: InitConfiguration
|
| 9 |
+
nodeRegistration:
|
| 10 |
+
kubeletExtraArgs:
|
| 11 |
+
max-pods: "250"
|
deploy/local-laptop.yaml
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Namespace that every Deployment in this manifest is created in.
apiVersion: v1
|
| 2 |
+
kind: Namespace
|
| 3 |
+
metadata:
|
| 4 |
+
name: prod-sre
|
| 5 |
+
---
|
| 6 |
+
# Synthetic "auth" workload (reported as node-4): a Python static file server
# plus a shell loop that rewrites a Prometheus text-format metrics file every
# two seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: auth
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: auth
  template:
    metadata:
      labels:
        app: auth
      annotations:
        # Scrape hints: port and path where the generated metrics file is served.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: auth
          image: python:3.12-alpine
          env:
            # Value of the node_id label on every exported series.
            - name: NODE_ID
              value: node-4
            # Centre of the synthetic queue-depth oscillation.
            - name: BASE_QUEUE
              value: "6"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              # Serve /www (including metrics.txt) on :8080 in the background.
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                req_inc=$((30 + noise))
                req=$((req + req_inc))
                # Queue depth swings between BASE_QUEUE-10 and BASE_QUEUE+9, clamped at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Accumulate the histogram sum from this iteration's requests only.
                # Recomputing it as req * lat_ms made the series fall whenever the
                # queue shrank, which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + req_inc * lat_ms))
                lat_sum=$(awk "BEGIN {printf \"%.3f\", $lat_sum_ms / 1000.0}")
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/metrics.txt.tmp
                # Rename into place so a scrape never reads a half-written file.
                mv -f /www/metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
|
| 77 |
+
---
|
| 78 |
+
# Synthetic "cart" workload (reported as node-3): a Python static file server
# plus a shell loop that rewrites a Prometheus text-format metrics file every
# two seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cart
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cart
  template:
    metadata:
      labels:
        app: cart
      annotations:
        # Scrape hints: port and path where the generated metrics file is served.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: cart
          image: python:3.12-alpine
          env:
            # Value of the node_id label on every exported series.
            - name: NODE_ID
              value: node-3
            # Centre of the synthetic queue-depth oscillation.
            - name: BASE_QUEUE
              value: "14"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              # Serve /www (including metrics.txt) on :8080 in the background.
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                req_inc=$((30 + noise))
                req=$((req + req_inc))
                # Queue depth swings between BASE_QUEUE-10 and BASE_QUEUE+9, clamped at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Accumulate the histogram sum from this iteration's requests only.
                # Recomputing it as req * lat_ms made the series fall whenever the
                # queue shrank, which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + req_inc * lat_ms))
                lat_sum=$(awk "BEGIN {printf \"%.3f\", $lat_sum_ms / 1000.0}")
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/metrics.txt.tmp
                # Rename into place so a scrape never reads a half-written file.
                mv -f /www/metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
|
| 149 |
+
---
|
| 150 |
+
# Synthetic "catalog" workload (reported as node-2): a Python static file
# server plus a shell loop that rewrites a Prometheus text-format metrics file
# every two seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: catalog
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: catalog
  template:
    metadata:
      labels:
        app: catalog
      annotations:
        # Scrape hints: port and path where the generated metrics file is served.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: catalog
          image: python:3.12-alpine
          env:
            # Value of the node_id label on every exported series.
            - name: NODE_ID
              value: node-2
            # Centre of the synthetic queue-depth oscillation.
            - name: BASE_QUEUE
              value: "20"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              # Serve /www (including metrics.txt) on :8080 in the background.
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                req_inc=$((30 + noise))
                req=$((req + req_inc))
                # Queue depth swings between BASE_QUEUE-10 and BASE_QUEUE+9, clamped at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Accumulate the histogram sum from this iteration's requests only.
                # Recomputing it as req * lat_ms made the series fall whenever the
                # queue shrank, which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + req_inc * lat_ms))
                lat_sum=$(awk "BEGIN {printf \"%.3f\", $lat_sum_ms / 1000.0}")
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/metrics.txt.tmp
                # Rename into place so a scrape never reads a half-written file.
                mv -f /www/metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
|
| 221 |
+
---
|
| 222 |
+
# Synthetic "checkout" workload (reported as node-1): a Python static file
# server plus a shell loop that rewrites a Prometheus text-format metrics file
# every two seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: checkout
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: checkout
  template:
    metadata:
      labels:
        app: checkout
      annotations:
        # Scrape hints: port and path where the generated metrics file is served.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: checkout
          image: python:3.12-alpine
          env:
            # Value of the node_id label on every exported series.
            - name: NODE_ID
              value: node-1
            # Centre of the synthetic queue-depth oscillation.
            - name: BASE_QUEUE
              value: "24"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              # Serve /www (including metrics.txt) on :8080 in the background.
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                req_inc=$((30 + noise))
                req=$((req + req_inc))
                # Queue depth swings between BASE_QUEUE-10 and BASE_QUEUE+9, clamped at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Accumulate the histogram sum from this iteration's requests only.
                # Recomputing it as req * lat_ms made the series fall whenever the
                # queue shrank, which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + req_inc * lat_ms))
                lat_sum=$(awk "BEGIN {printf \"%.3f\", $lat_sum_ms / 1000.0}")
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/metrics.txt.tmp
                # Rename into place so a scrape never reads a half-written file.
                mv -f /www/metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
|
| 293 |
+
---
|
| 294 |
+
# Synthetic "payments" workload (reported as node-0, two replicas): a Python
# static file server plus a shell loop that rewrites a Prometheus text-format
# metrics file every two seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: payments
  namespace: prod-sre
spec:
  replicas: 2
  selector:
    matchLabels:
      app: payments
  template:
    metadata:
      labels:
        app: payments
      annotations:
        # Scrape hints: port and path where the generated metrics file is served.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: payments
          image: python:3.12-alpine
          env:
            # Value of the node_id label on every exported series; both
            # replicas export the same value.
            - name: NODE_ID
              value: node-0
            # Centre of the synthetic queue-depth oscillation.
            - name: BASE_QUEUE
              value: "30"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              # Serve /www (including metrics.txt) on :8080 in the background.
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                req_inc=$((30 + noise))
                req=$((req + req_inc))
                # Queue depth swings between BASE_QUEUE-10 and BASE_QUEUE+9, clamped at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Accumulate the histogram sum from this iteration's requests only.
                # Recomputing it as req * lat_ms made the series fall whenever the
                # queue shrank, which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + req_inc * lat_ms))
                lat_sum=$(awk "BEGIN {printf \"%.3f\", $lat_sum_ms / 1000.0}")
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/metrics.txt.tmp
                # Rename into place so a scrape never reads a half-written file.
                mv -f /www/metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
|
| 365 |
+
|
deploy/local/datasource-local.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Datasource provisioning file: declares a single Prometheus datasource,
# proxied through the Grafana backend (access: proxy) and marked as default.
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
datasources:
|
| 4 |
+
- name: Prometheus
|
| 5 |
+
# Fixed uid. NOTE(review): presumably the dashboards reference the datasource
# by this uid; confirm against the dashboard JSON before changing it.
uid: PBFA97CFB590B2093
|
| 6 |
+
type: prometheus
|
| 7 |
+
access: proxy
|
| 8 |
+
# Reached by the name "prometheus-local" on port 9090.
url: http://prometheus-local:9090
|
| 9 |
+
isDefault: true
|
| 10 |
+
editable: true
|
deploy/local/grafana-local-values.yaml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grafana values for the local setup (appears to target the Grafana Helm
# chart; confirm against the deploy scripts).
# NOTE(review): the admin password below is stored in plain text in the repo;
# suitable for a throwaway local instance only.
adminUser: admin
|
| 2 |
+
adminPassword: antiatropos
|
| 3 |
+
|
| 4 |
+
service:
|
| 5 |
+
type: ClusterIP
|
| 6 |
+
|
| 7 |
+
# Persistence is disabled. NOTE(review): presumably state is lost when the pod
# is recreated; confirm against the chart.
persistence:
|
| 8 |
+
enabled: false
|
| 9 |
+
|
| 10 |
+
resources:
|
| 11 |
+
requests:
|
| 12 |
+
cpu: 100m
|
| 13 |
+
memory: 192Mi
|
| 14 |
+
limits:
|
| 15 |
+
cpu: 400m
|
| 16 |
+
memory: 384Mi
|
| 17 |
+
|
| 18 |
+
# Provisions one default Prometheus datasource pointing at the
# "prometheus-server" service in the "monitoring" namespace.
datasources:
|
| 19 |
+
datasources.yaml:
|
| 20 |
+
apiVersion: 1
|
| 21 |
+
datasources:
|
| 22 |
+
- name: Prometheus
|
| 23 |
+
type: prometheus
|
| 24 |
+
access: proxy
|
| 25 |
+
url: http://prometheus-server.monitoring.svc.cluster.local
|
| 26 |
+
isDefault: true
|
| 27 |
+
editable: true
|
| 28 |
+
|
| 29 |
+
# Dashboard sidecar: enabled, keyed on label grafana_dashboard="1", searching
# ALL namespaces. NOTE(review): presumably it loads dashboards from objects
# carrying that label; confirm against the chart documentation.
sidecar:
|
| 30 |
+
dashboards:
|
| 31 |
+
enabled: true
|
| 32 |
+
label: grafana_dashboard
|
| 33 |
+
labelValue: "1"
|
| 34 |
+
searchNamespace: ALL
|
deploy/local/prometheus-local-values.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prometheus values for the local setup (appears to target the Prometheus Helm
# chart; confirm against the deploy scripts). The four optional components
# below (alertmanager, kube-state-metrics, node exporter, pushgateway) are all
# switched off.
alertmanager:
|
| 2 |
+
enabled: false
|
| 3 |
+
|
| 4 |
+
kube-state-metrics:
|
| 5 |
+
enabled: false
|
| 6 |
+
|
| 7 |
+
prometheus-node-exporter:
|
| 8 |
+
enabled: false
|
| 9 |
+
|
| 10 |
+
prometheus-pushgateway:
|
| 11 |
+
enabled: false
|
| 12 |
+
|
| 13 |
+
# Two extra scrape jobs, passed as a literal string:
#   1. antiatropos-fastapi: static target host.docker.internal:8000, path /metrics.
#   2. prod-sre-annotated-pods: pod discovery limited to namespace prod-sre;
#      keeps only pods annotated prometheus.io/scrape=true, takes the metrics
#      path from prometheus.io/path, and rewrites the target address to use
#      the port given in prometheus.io/port.
extraScrapeConfigs: |
|
| 14 |
+
- job_name: 'antiatropos-fastapi'
|
| 15 |
+
metrics_path: /metrics
|
| 16 |
+
static_configs:
|
| 17 |
+
- targets: ['host.docker.internal:8000']
|
| 18 |
+
|
| 19 |
+
- job_name: 'prod-sre-annotated-pods'
|
| 20 |
+
kubernetes_sd_configs:
|
| 21 |
+
- role: pod
|
| 22 |
+
namespaces:
|
| 23 |
+
names: ['prod-sre']
|
| 24 |
+
relabel_configs:
|
| 25 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
| 26 |
+
action: keep
|
| 27 |
+
regex: true
|
| 28 |
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
| 29 |
+
action: replace
|
| 30 |
+
target_label: __metrics_path__
|
| 31 |
+
regex: (.+)
|
| 32 |
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
| 33 |
+
action: replace
|
| 34 |
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
| 35 |
+
replacement: $1:$2
|
| 36 |
+
target_label: __address__
|
| 37 |
+
|
| 38 |
+
# Server settings: no persistent volume, modest resource requests/limits, and
# a ClusterIP service.
server:
|
| 39 |
+
persistentVolume:
|
| 40 |
+
enabled: false
|
| 41 |
+
resources:
|
| 42 |
+
requests:
|
| 43 |
+
cpu: 100m
|
| 44 |
+
memory: 256Mi
|
| 45 |
+
limits:
|
| 46 |
+
cpu: 500m
|
| 47 |
+
memory: 512Mi
|
| 48 |
+
service:
|
| 49 |
+
type: ClusterIP
|
deploy/nginx.conf
CHANGED
|
@@ -1,89 +1,89 @@
|
|
| 1 |
-
worker_processes auto;
|
| 2 |
-
pid /tmp/nginx.pid;
|
| 3 |
-
error_log /dev/stderr info;
|
| 4 |
-
|
| 5 |
-
events {
|
| 6 |
-
worker_connections 1024;
|
| 7 |
-
}
|
| 8 |
-
|
| 9 |
-
http {
|
| 10 |
-
include /etc/nginx/mime.types;
|
| 11 |
-
default_type application/octet-stream;
|
| 12 |
-
sendfile on;
|
| 13 |
-
keepalive_timeout 65;
|
| 14 |
-
access_log /dev/stdout;
|
| 15 |
-
|
| 16 |
-
map $http_upgrade $connection_upgrade {
|
| 17 |
-
default upgrade;
|
| 18 |
-
'' close;
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
-
server {
|
| 22 |
-
listen 7860;
|
| 23 |
-
server_name _;
|
| 24 |
-
|
| 25 |
-
client_max_body_size 50m;
|
| 26 |
-
proxy_read_timeout 3600s;
|
| 27 |
-
proxy_send_timeout 3600s;
|
| 28 |
-
|
| 29 |
-
location = /prometheus {
|
| 30 |
-
return 301 /prometheus/;
|
| 31 |
-
}
|
| 32 |
-
|
| 33 |
-
location = /grafana {
|
| 34 |
-
return 301 /grafana/;
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
-
location /prometheus/ {
|
| 38 |
-
proxy_pass http://127.0.0.1:9090;
|
| 39 |
-
proxy_http_version 1.1;
|
| 40 |
-
proxy_set_header Host $host;
|
| 41 |
-
proxy_set_header X-Real-IP $remote_addr;
|
| 42 |
-
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 43 |
-
proxy_set_header X-Forwarded-Host $host;
|
| 44 |
-
proxy_set_header X-Forwarded-Proto $scheme;
|
| 45 |
-
proxy_set_header X-Forwarded-Prefix /prometheus;
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
location /grafana/ {
|
| 49 |
-
proxy_pass http://127.0.0.1:3000;
|
| 50 |
-
proxy_http_version 1.1;
|
| 51 |
-
proxy_set_header Host $host;
|
| 52 |
-
proxy_set_header X-Real-IP $remote_addr;
|
| 53 |
-
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 54 |
-
proxy_set_header X-Forwarded-Host $host;
|
| 55 |
-
proxy_set_header X-Forwarded-Proto $scheme;
|
| 56 |
-
proxy_set_header X-Forwarded-Prefix /grafana;
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
location /grafana/api/live/ {
|
| 60 |
-
proxy_pass http://127.0.0.1:3000;
|
| 61 |
-
proxy_http_version 1.1;
|
| 62 |
-
proxy_set_header Upgrade $http_upgrade;
|
| 63 |
-
proxy_set_header Connection $connection_upgrade;
|
| 64 |
-
proxy_set_header Host $host;
|
| 65 |
-
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 66 |
-
proxy_set_header X-Forwarded-Host $host;
|
| 67 |
-
proxy_set_header X-Forwarded-Proto $scheme;
|
| 68 |
-
proxy_set_header X-Forwarded-Prefix /grafana;
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
location / {
|
| 72 |
-
root /var/www/html;
|
| 73 |
-
index index.html;
|
| 74 |
-
try_files $uri $uri/ @fastapi;
|
| 75 |
-
}
|
| 76 |
-
|
| 77 |
-
location @fastapi {
|
| 78 |
-
proxy_pass http://127.0.0.1:8000;
|
| 79 |
-
proxy_http_version 1.1;
|
| 80 |
-
proxy_set_header Host $host;
|
| 81 |
-
proxy_set_header X-Real-IP $remote_addr;
|
| 82 |
-
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 83 |
-
proxy_set_header X-Forwarded-Host $host;
|
| 84 |
-
proxy_set_header X-Forwarded-Proto $scheme;
|
| 85 |
-
proxy_set_header Upgrade $http_upgrade;
|
| 86 |
-
proxy_set_header Connection $connection_upgrade;
|
| 87 |
-
}
|
| 88 |
-
}
|
| 89 |
-
}
|
|
|
|
| 1 |
+
worker_processes auto;
|
| 2 |
+
pid /tmp/nginx.pid;
|
| 3 |
+
error_log /dev/stderr info;
|
| 4 |
+
|
| 5 |
+
events {
|
| 6 |
+
worker_connections 1024;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
http {
|
| 10 |
+
include /etc/nginx/mime.types;
|
| 11 |
+
default_type application/octet-stream;
|
| 12 |
+
sendfile on;
|
| 13 |
+
keepalive_timeout 65;
|
| 14 |
+
access_log /dev/stdout;
|
| 15 |
+
|
| 16 |
+
map $http_upgrade $connection_upgrade {
|
| 17 |
+
default upgrade;
|
| 18 |
+
'' close;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
server {
|
| 22 |
+
listen 7860;
|
| 23 |
+
server_name _;
|
| 24 |
+
|
| 25 |
+
client_max_body_size 50m;
|
| 26 |
+
proxy_read_timeout 3600s;
|
| 27 |
+
proxy_send_timeout 3600s;
|
| 28 |
+
|
| 29 |
+
location = /prometheus {
|
| 30 |
+
return 301 /prometheus/;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
location = /grafana {
|
| 34 |
+
return 301 /grafana/;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
location /prometheus/ {
|
| 38 |
+
proxy_pass http://127.0.0.1:9090;
|
| 39 |
+
proxy_http_version 1.1;
|
| 40 |
+
proxy_set_header Host $host;
|
| 41 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 42 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 43 |
+
proxy_set_header X-Forwarded-Host $host;
|
| 44 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 45 |
+
proxy_set_header X-Forwarded-Prefix /prometheus;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
location /grafana/ {
|
| 49 |
+
proxy_pass http://127.0.0.1:3000;
|
| 50 |
+
proxy_http_version 1.1;
|
| 51 |
+
proxy_set_header Host $host;
|
| 52 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 53 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 54 |
+
proxy_set_header X-Forwarded-Host $host;
|
| 55 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 56 |
+
proxy_set_header X-Forwarded-Prefix /grafana;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
location /grafana/api/live/ {
|
| 60 |
+
proxy_pass http://127.0.0.1:3000;
|
| 61 |
+
proxy_http_version 1.1;
|
| 62 |
+
proxy_set_header Upgrade $http_upgrade;
|
| 63 |
+
proxy_set_header Connection $connection_upgrade;
|
| 64 |
+
proxy_set_header Host $host;
|
| 65 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 66 |
+
proxy_set_header X-Forwarded-Host $host;
|
| 67 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 68 |
+
proxy_set_header X-Forwarded-Prefix /grafana;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
location / {
|
| 72 |
+
root /var/www/html;
|
| 73 |
+
index index.html;
|
| 74 |
+
try_files $uri $uri/ @fastapi;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
location @fastapi {
|
| 78 |
+
proxy_pass http://127.0.0.1:8000;
|
| 79 |
+
proxy_http_version 1.1;
|
| 80 |
+
proxy_set_header Host $host;
|
| 81 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 82 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 83 |
+
proxy_set_header X-Forwarded-Host $host;
|
| 84 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 85 |
+
proxy_set_header Upgrade $http_upgrade;
|
| 86 |
+
proxy_set_header Connection $connection_upgrade;
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
}
|